NCBI C++ ToolKit
aln_reader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: aln_reader.cpp 100615 2023-08-17 18:09:35Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Josh Cherry
27  *
28  * File Description: C++ wrappers for alignment file reading
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
35 #include <objtools/error_codes.hpp>
36 #include <util/format_guess.hpp>
37 
43 #include <objects/seq/Seq_data.hpp>
44 #include <objects/seq/IUPACna.hpp>
45 #include <objects/seq/IUPACaa.hpp>
46 #include <objects/seq/Bioseq.hpp>
53 #include "aln_errors.hpp"
54 
55 #include <cassert>
56 
57 #define NCBI_USE_ERRCODE_X Objtools_Rd_Align
58 
61 
62 
// Body of the CAlnError-to-text formatter (its signature line is not part of
// this listing). The result has the shape
//   At ID '<id>' in category '<n>'[ at line <l>]: <msg>'
// where <n> is the category enum printed as an int.
64 {
65  auto lineNumber = error.GetLineNum();
// A line number of -1 selects the short form without the "at line" part.
66  if (lineNumber == -1) {
67  return FORMAT(
68  "At ID '" << error.GetID() << "' "
69  "in category '" << static_cast<int>(error.GetCategory()) << "': "
// NOTE(review): the message is followed by a closing ' that has no opening
// counterpart (same in the long form below) -- confirm this is intended.
70  << error.GetMsg() << "'");
71  }
72  return FORMAT(
73  "At ID '" << error.GetID() << "' "
74  "in category '" << static_cast<int>(error.GetCategory()) << "' "
75  "at line " << error.GetLineNum() << ": "
76  << error.GetMsg() << "'");
77 }
78 
// Construct an error record from an integer category, a line number, a
// sequence ID and a message text.
// NOTE(review): the statement inside each case label and the statement
// between the switch and the m_ID assignment are missing from this listing;
// presumably they set m_Category from `category` (-1..4, anything else via
// `default`) and m_LineNum from `line_num` -- confirm against the real file.
79 CAlnError::CAlnError(int category, int line_num, string id, string message)
80 {
81  switch (category)
82  {
83  case -1:
85  break;
86  case 0:
88  break;
89  case 1:
91  break;
92  case 2:
94  break;
95  case 3:
97  break;
98  case 4:
100  break;
101  default:
103  break;
104  }
105 
// id and message arrive by value and are copied into the members.
107  m_ID = id;
108  m_Message = message;
109 }
110 
111 
113 {
114  m_Category = e.GetCategory();
115  m_LineNum = e.GetLineNum();
116  m_ID = e.GetID();
117  m_Message = e.GetMsg();
118 }
119 
120 
122 {
123 public:
125  : m_pErrorReporter(pErrorReporter)
127 
128 
129  void operator()(EDiagSev severity,
130  int lineNum,
131  const string& idString,
132  CFastaIdValidate::EErrCode /*errCode*/,
133  const string& msg)
134  {
136  lineNum,
137  severity,
140  msg,
141  idString);
142  }
143 
144 private:
146 };
147 
148 
150 {
151 public:
152  using TIds = list<CRef<CSeq_id>>;
153 
154  void operator()(const TIds& ids,
155  int lineNum,
156  CAlnErrorReporter* pErrorReporter);
157 private:
159 };
160 
161 
163  const TIds& ids,
164  int lineNum,
165  CAlnErrorReporter* pErrorReporter)
166 {
167  m_FastaIdValidate(ids, lineNum, CDefaultIdErrorReporter(pErrorReporter));
168 }
169 
170 
172  m_fValidateIds(fValidateIds),
173  m_AlignFormat(EAlignFormat::UNKNOWN),
174  m_IS(is), m_ReadDone(false), m_ReadSucceeded(false),
175  m_UseNexusInfo(true)
176 {
177  m_Errors.clear();
179  SetAllGap(".-");
180  if (!m_fValidateIds) {
182  }
183 }
184 
185 
// Body of the adapter that turns a single-ID validator into a validator for
// a whole ID list (its signature line is not part of this listing).
187 {
// No user validator supplied: fall back to the default list validator.
188  if (!fSingleIdValidate) {
189  return CDefaultIdValidate();
190  }
191 
// The validator is captured by copy, so the returned callable does not refer
// back to this function's parameter.
192  return [fSingleIdValidate](const list<CRef<CSeq_id>>& ids,
193  int lineNum,
194  CAlnErrorReporter* errorReporter) {
// Apply the single-ID validator to every ID of the list, passing the same
// line number and reporter each time.
195  for (const auto& pId : ids) {
196  fSingleIdValidate(*pId, lineNum, errorReporter);
197  }
198  };
199 }
200 
201 
203  CAlnReader(is, s_GetMultiIdValidate(fSingleIdValidate))
204 {}
205 
206 
208  EAlphabet alphaId)
209 {
210  static map<EAlphabet, string> alphaMap{
211 
212  {EAlphabet::eAlpha_Default, // use file type default
213  ""},
214 
215  {EAlphabet::eAlpha_Nucleotide, // non negotiable due to existing code
216  "ABCDGHKMNRSTUVWXYabcdghkmnrstuvwxy"},
217 
218  {EAlphabet::eAlpha_Protein, // non negotiable due to existing code
219  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz*"},
220 
221  {EAlphabet::eAlpha_Dna, // all ambiguity characters but not U
222  "ABCDGHKMNRSTVWXYabcdghkmnrstvwxy"},
223 
224  {EAlphabet::eAlpha_Rna, // all ambiguity characters but not T
225  "ABCDGHKMNRSTVWXYabcdghkmnrstvwxy"},
226 
227  {EAlphabet::eAlpha_Dna_no_ambiguity,
228  "ACGTNacgtn"}, // DNA + N for unknown
229 
230  {EAlphabet::eAlpha_Rna_no_ambiguity,
231  "ACGUNacgun"}, // RNA + N for unknown
232  };
233  return alphaMap[alphaId];
234 };
235 
236 
238 {
239  SetAlphabet(alpha);
240  SetAllGap("-");
241 }
242 
243 
245 {
246  SetAlphabet(alpha);
247  SetAllGap("-");
248 }
249 
250 
252 {
253  SetAlphabet(alpha);
254  SetAllGap("-");
255 }
256 
257 
259 {
260  SetAlphabet(alpha);
261  SetAllGap("-");
262 }
263 
264 
265 
266 static void
268  ILineErrorListener* pEC,
269  EDiagSev severity,
270  int code,
271  int subcode,
272  const string& seqId,
273  int lineNumber,
274  const string& message,
276 {
277  if (!pEC) {
278  NCBI_THROW2(CObjReaderParseException, eFormat, message, 0);
279  }
282  problemType,
283  severity,
284  code,
285  subcode,
286  seqId,
287  lineNumber,
288  message));
289  pEC->PutError(*pErr);
290 }
291 
292 
// Read and verify the alignment, honouring `readFlags`.
//   readFlags      - forwarded to x_VerifyAlignmentInfo().
//   pErrorListener - wrapped in a fresh CAlnErrorReporter that replaces the
//                    current theErrorReporter.
// Does nothing further when a previous read already completed. When an
// SShowStopper is caught it is reported as fatal and the function returns
// with m_ReadDone and m_ReadSucceeded unchanged.
293 void CAlnReader::Read(
294  TReadFlags readFlags,
295  ncbi::objects::ILineErrorListener* pErrorListener)
296 {
297 
// The reporter is replaced before the m_ReadDone check, i.e. on every call.
298  theErrorReporter.reset(new CAlnErrorReporter(pErrorListener));
299  if (m_ReadDone) {
300  return;
301  }
302 
303  // read the alignment stream
304  SAlignmentFile alignmentInfo;
// NOTE(review): the statement that fills alignmentInfo (first line of the
// try body) is missing from this listing.
305  try {
307  x_VerifyAlignmentInfo(alignmentInfo, readFlags);
308  }
309  catch (const SShowStopper& showStopper) {
310  theErrorReporter->Fatal(showStopper);
311  return;
312  }
313 
// The row count is the number of ID strings collected during verification.
314  m_Dim = m_IdStrings.size();
315  m_ReadDone = true;
316  m_ReadSucceeded = true;
317 }
318 
319 void CAlnReader::Read(
320  bool guess,
321  bool generate_local_ids,
322  ncbi::objects::ILineErrorListener* pErrorListener)
323 {
324  // read the alignment stream
325  SAlignmentFile alignmentInfo;
326  try {
328  m_IS, generate_local_ids, m_UseNexusInfo, mSequenceInfo, alignmentInfo);
329  TReadFlags flags = 0;
330  x_VerifyAlignmentInfo(alignmentInfo, flags);
331  }
332  catch (const SShowStopper& showStopper) {
333  theErrorReporter->Fatal(showStopper);
334  return;
335  }
336  m_Dim = m_IdStrings.size();
337  m_ReadDone = true;
338  m_ReadSucceeded = true;
339 }
340 
341 
// Tail of the signature and body of the per-row ID parser (the leading
// signature lines are not part of this listing). Fills `ids` with the Seq-ids
// parsed from seqIdInfo.mData and then runs the configured validator on them.
344  TIdList& ids)
345 {
346  ids.clear();
347  const auto& idString = seqIdInfo.mData;
348 
// NOTE(review): the declaration of parseFlags is missing from this listing.
// NOTE(review): `flags^fGenerateLocalIDs` is an XOR, so the condition is true
// whenever flags differs from fGenerateLocalIDs in any bit -- including when
// that bit is set together with another flag. If the intent is "the flag is
// not set", that would be !(flags & fGenerateLocalIDs); confirm.
350  if (flags^fGenerateLocalIDs) {
351  parseFlags |= CSeq_id::fParse_RawText;
352  }
353 
354 
355  try {
356  CSeq_id::ParseIDs(ids, idString, parseFlags);
357  }
358  catch (...) { // report an error and interpret the id string as a local ID
359  theErrorReporter->Error(
360  seqIdInfo.mNumLine,
362  "Unable to parse sequence ID string.");
// Fallback: the whole ID string becomes one local Seq-id.
363  ids.push_back(Ref(new CSeq_id(CSeq_id::e_Local, idString)));
364  }
365 
// Validation is optional; it is skipped when no validator is installed.
366  if (m_fValidateIds) {
367  m_fValidateIds(ids, seqIdInfo.mNumLine, theErrorReporter.get());
368  }
369  return;
370 }
371 
// Parameter list and body of the verification step (the first signature line
// and the line declaring `flags` are not part of this listing). Checks that
// the parsed file holds at least two sequences, then copies sequences, ID
// strings, parsed IDs and (when consistent) deflines into the reader.
// Throws SShowStopper when fewer than two sequences were found.
373  const SAlignmentFile& alignmentInfo,
375 {
376 
377  const auto num_sequences = alignmentInfo.NumSequences();
378 
379  if (num_sequences == 0) {
380  throw SShowStopper(
381  -1,
383  "No sequence data was detected in alignment file.");
384  }
385 
386 
387  if (num_sequences == 1) {
388  throw SShowStopper(
389  -1,
391  "Only one sequence was detected in the alignment file. An alignment file must contain more than one sequence.");
392  }
393 
394 
395  m_Seqs.assign(alignmentInfo.mSequences.begin(), alignmentInfo.mSequences.end());
396 
397 
// m_IdStrings and m_Ids are appended to, not reset, so this function
// accumulates if it runs more than once on the same reader.
// NOTE(review): `auto seqIdInfo` copies each element; a const reference
// would avoid the copy.
398  for (auto seqIdInfo : alignmentInfo.mIds) {
399  m_IdStrings.push_back(seqIdInfo.mData); // m_IdStrings is redundant and should be removed
400  TIdList ids;
401  x_ParseAndValidateSeqIds(seqIdInfo, flags, ids);
402  m_Ids.push_back(ids);
403  }
404 
// Deflines are optional, but when present there must be exactly one per
// sequence; otherwise an error is reported and m_DeflineInfo is left alone.
405  auto numDeflines = alignmentInfo.NumDeflines();
406  if (numDeflines) {
407  if (numDeflines == m_Ids.size()) {
408  m_DeflineInfo.resize(numDeflines);
// NOTE(review): `int i` is compared with the unsigned numDeflines.
409  for (int i=0; i< numDeflines; ++i) {
410  m_DeflineInfo[i] = {
412  alignmentInfo.mDeflines[i].mData),
413  alignmentInfo.mDeflines[i].mNumLine};
414  }
415  }
416  else {
// NOTE(review): both arguments are size_t but the format uses %d; with a
// varargs ErrorPrintf this is a type mismatch where size_t is wider than int.
417  string description = ErrorPrintf(
418  "Expected %d deflines but finding %d. ",
419  m_Ids.size(),
420  numDeflines);
421  description +=
422  "If deflines are used, each sequence must have a corresponding defline. "
// NOTE(review): the next line ends in ',' rather than ';', so the Error()
// call below is the right operand of a comma expression. The append still
// happens first, but a ';' looks like what was meant.
423  "Note that deflines are optional.",
424  theErrorReporter->Error(
425  -1,
427  description);
428  }
429  }
430 }
431 
432 
// Body of the routine that rebuilds m_MiddleSections (its signature line is
// not part of this listing). For each of the m_Dim rows it stores the pair
// (index of the first character that is not a beginning-gap character,
//  index of the last character that is not an end-gap character).
434 {
435  m_MiddleSections.clear();
436 
437  for (TNumrow row_i = 0; row_i < m_Dim; row_i++) {
// find_first_not_of() returns npos for a row made only of beginning-gap
// characters; the value is narrowed into TSeqPos here, and the length test
// below then skips the trailing-gap scan for such a row.
438  TSeqPos begin_len = m_Seqs[row_i].find_first_not_of(GetBeginningGap());
439  TSeqPos end_len = 0;
440  if (begin_len < m_Seqs[row_i].length()) {
// Walk backwards from the end, counting end-gap characters until the first
// character that is not one.
441  string::iterator s = m_Seqs[row_i].end();
442  while (s != m_Seqs[row_i].begin()) {
443  --s;
444  if (GetEndGap().find(*s) != string::npos) {
445  end_len++;
446  } else {
447  break;
448  }
449  }
450  }
// NOTE(review): for an empty row, length() - end_len - 1 wraps around
// (unsigned arithmetic) -- confirm empty rows cannot reach this point.
451  m_MiddleSections.push_back(TAlignMiddleInterval(begin_len, m_Seqs[row_i].length() - end_len - 1));
452  }
453 }
454 
455 
456 bool CAlnReader::x_IsGap(TNumrow row, TSeqPos pos, const string& residue)
457 {
458  if (m_MiddleSections.size() == 0) {
460  }
461  if (row > m_MiddleSections.size()) {
462  return false;
463  }
464  if (pos < m_MiddleSections[row].first) {
465  if (NStr::Find(GetBeginningGap(), residue) == string::npos) {
466  return false;
467  } else {
468  return true;
469  }
470  } else if (pos > m_MiddleSections[row].second) {
471  if (NStr::Find(GetEndGap(), residue) == string::npos) {
472  return false;
473  } else {
474  return true;
475  }
476  } else {
477  if (NStr::Find(GetMiddleGap(), residue) == string::npos) {
478  return false;
479  } else {
480  return true;
481  }
482  }
483 }
484 
485 CRef<CSeq_id> CAlnReader::GenerateID(const string& fasta_defline,
486  const TSeqPos& index,
487  TFastaFlags fasta_flags)
488 {
489  _ASSERT(index < m_Dim);
490  _ASSERT(!m_Ids[index].empty());
491 
492  return FindBestChoice(m_Ids[index], CSeq_id::BestRank);
493 }
494 
495 
// Tail of the signature and body of the routine that fills the Dense-seg's
// ID vector (the first signature line is not part of this listing). One ID
// per row is obtained from GenerateID().
497  CDense_seg& denseg)
498 {
499  CDense_seg::TIds& ids = denseg.SetIds();
500  ids.resize(m_Dim);
// m_Ids is forced to m_Dim entries as well; any entry added here is an
// empty ID list.
501  m_Ids.resize(m_Dim);
502 
503  for (auto i=0; i<m_Dim; ++i) {
504  // Reconstruct original defline string from results
505  // returned by C code.
506  string fasta_defline = m_IdStrings[i];
// The defline text is appended only when one exists for this row.
// NOTE(review): `i` is a signed int compared against an unsigned size().
507  if (i < m_DeflineInfo.size() && !m_DeflineInfo[i].mData.empty()) {
508  fasta_defline += " " + m_DeflineInfo[i].mData;
509  }
510  ids[i] = GenerateID(fasta_defline, i, fasta_flags);
511  }
512  return;
513  }
514 
515 
// Tail of the signature and body of the Seq-align builder (the first
// signature line is not part of this listing). Converts the row strings in
// m_Seqs into a Dense-seg: the alignment is scanned column by column and a
// new segment is opened whenever any row switches between gap and residue.
// The result is cached in m_Aln; as a side effect m_SeqVec / m_SeqLen receive
// the gap-free, upper-cased residues of every row and their counts.
// Returns an empty CRef when the preceding Read() did not succeed.
517  ILineErrorListener* pErrorListener)
518  {
// Cached result from an earlier call.
519  if (m_Aln) {
520  return m_Aln;
521  } else if ( !m_ReadDone ) {
// NOTE(review): the line that opens this statement is missing from the
// listing; judging by the message it raises an error when Read() has not
// been called yet -- confirm.
523  "CAlnReader::GetSeqAlign(): "
524  "Seq_align is not available until after Read()", 0);
525  }
526 
527  if (!m_ReadSucceeded) {
528  return CRef<CSeq_align>();
529  }
530 
531  typedef CDense_seg::TNumseg TNumseg;
532 
533  m_Aln = new CSeq_align();
535  m_Aln->SetDim(m_Dim);
536 
537  CDense_seg& ds = m_Aln->SetSegs().SetDenseg();
538  ds.SetDim(m_Dim);
539 
540  CDense_seg::TStarts& starts = ds.SetStarts();
541  //CDense_seg::TStrands& strands = ds.SetStrands();
542  CDense_seg::TLens& lens = ds.SetLens();
543 
544  x_AssignDensegIds(fasta_flags, ds);
545 
546  // get the length of the alignment
// (the length of the longest row; rows may differ in length)
547  TSeqPos aln_stop = m_Seqs[0].size();
548  for (TNumrow row_i = 1; row_i < m_Dim; row_i++) {
549  if (m_Seqs[row_i].size() > aln_stop) {
550  aln_stop = m_Seqs[row_i].size();
551  }
552  }
553 
554 
// Residue buffers start at full row length and are shrunk to the number of
// residues actually stored once the scan is complete.
555  m_SeqVec.resize(m_Dim);
556  for (TNumrow row_i = 0; row_i < m_Dim; row_i++) {
557  m_SeqVec[row_i].resize(m_Seqs[row_i].length(), 0);
558  }
559  m_SeqLen.resize(m_Dim, 0);
// Scan state:
//   is_gap / prev_is_gap - gap status of each row in the current column and
//                          in the most recently opened segment
//   next_start           - sequence coordinate at which each row's next
//                          non-gap segment starts
//   starts_i             - write index into `starts`
//   prev_aln_pos         - column at which the current segment began
560  vector<bool> is_gap; is_gap.resize(m_Dim, true);
561  vector<bool> prev_is_gap; prev_is_gap.resize(m_Dim, true);
562  vector<TSignedSeqPos> next_start; next_start.resize(m_Dim, 0);
563  int starts_i = 0;
564  TSeqPos prev_aln_pos = 0, prev_len = 0;
565  bool new_seg = true;
566  TNumseg numseg = 0;
567 
568  for (TSeqPos aln_pos = 0; aln_pos < aln_stop; aln_pos++) {
569  for (TNumrow row_i = 0; row_i < m_Dim; row_i++) {
// A row shorter than the alignment is treated as gap past its end.
570  if (aln_pos >= m_Seqs[row_i].length()) {
571  if (!is_gap[row_i]) {
572  is_gap[row_i] = true;
573  new_seg = true;
574  }
575  } else {
// The residue is upper-cased before the gap test and before it is stored.
576  string residue = m_Seqs[row_i].substr(aln_pos, 1);
577  NStr::ToUpper(residue);
578  if (!x_IsGap(row_i, aln_pos, residue)) {
579 
580  if (is_gap[row_i]) {
581  is_gap[row_i] = false;
582  new_seg = true;
583  }
584 
585  // add to the sequence vector
586  m_SeqVec[row_i][m_SeqLen[row_i]++] = residue[0];
587 
588  } else {
589 
590  if ( !is_gap[row_i] ) {
591  is_gap[row_i] = true;
592  new_seg = true;
593  }
594  }
595 
596  }
597  }
598 
// At least one row changed state in this column (always the case for the
// first column, since new_seg starts out true): close the previous segment
// and open a new one.
599  if (new_seg) {
600  if (numseg) { // if not the first seg
// Length of the segment just closed; rows that were not gapped in it
// advance their sequence coordinate by that length.
601  lens.push_back(prev_len = aln_pos - prev_aln_pos);
602  for (TNumrow row_i = 0; row_i < m_Dim; row_i++) {
603  if ( !prev_is_gap[row_i] ) {
604  next_start[row_i] += prev_len;
605  }
606  }
607  }
608 
// One start per row for the new segment; -1 marks a gapped row.
609  starts.resize(starts_i + m_Dim);
610  for (TNumrow row_i = 0; row_i < m_Dim; row_i++) {
611  if (is_gap[row_i]) {
612  starts[starts_i++] = -1;
613  } else {
614  starts[starts_i++] = next_start[row_i];;
615  }
616  prev_is_gap[row_i] = is_gap[row_i];
617  }
618 
619  prev_aln_pos = aln_pos;
620 
621  numseg++;
622  new_seg = false;
623  }
624  }
625 
626  for (TNumrow row_i = 0; row_i < m_Dim; row_i++) {
627  m_SeqVec[row_i].resize(m_SeqLen[row_i]); // resize down to actual size
628  }
629 
// Length of the final, still-open segment.
630  lens.push_back(aln_stop - prev_aln_pos);
631  //strands.resize(numseg * m_Dim, eNa_strand_plus);
632  _ASSERT(lens.size() == numseg);
633  ds.SetNumseg(numseg);
634 
// Debug builds additionally validate the finished alignment.
635 #if _DEBUG
636  m_Aln->Validate(true);
637 #endif
638  return m_Aln;
639  }
640 
641 
644  const string& alphabet,
645  const string& seqData,
646  ILineErrorListener* pErrorListener
647  )
648 {
649  return x_GetSequenceMolType(alphabet, seqData, "", pErrorListener);
650 }
651 
652 
// Parameter list and body of the molecule-type guesser (the leading
// signature lines are not part of this listing). Decision order:
//   1. CFormatGuess says protein                         -> eMol_aa
//   2. guess is undefined and alphabet has >= 52 letters -> eMol_aa
//   3. both T/t and U/u occur -> error reported,            eMol_aa
//   4. otherwise eMol_rna if U/u occurs, else eMol_dna
655  const string& alphabet,
656  const string& seqData,
657  const string& seqId, // used in error message
658  ILineErrorListener* pErrorListener
659  )
660 {
// Characters that denote missing data are removed from a working copy so
// that they do not influence the guess.
661  const auto& missingChars = GetMissing();
662  string seqChars = seqData;
663  if (!missingChars.empty()) {
664  seqChars.erase(
665  remove_if(seqChars.begin(), seqChars.end(),
666  [&](char c) { return missingChars.find(c) != string::npos;}),
667  seqChars.end());
668  }
669 
670  auto formatGuess = CFormatGuess::SequenceType(seqChars.data(), seqChars.length());
671  if (formatGuess == CFormatGuess::eProtein) {
672  return CSeq_inst::eMol_aa;
673  }
674 
675  //if alphabet contains full complement (26) of protein chars then
676  // it's definitely protein. It may also contain stop-codon characters:
677  if (formatGuess == CFormatGuess::eUndefined &&
678  alphabet.size() >= 2*26) {
679  return CSeq_inst::eMol_aa;
680  }
681 
682  auto posFirstT = seqChars.find_first_of("Tt");
683  auto posFirstU = seqChars.find_first_of("Uu");
684  if (posFirstT != string::npos && posFirstU != string::npos) {
685  string msg = "Invalid Mol Type: "
686  "U and T cannot appear in the same nucleotide sequence. "
687  "Reinterpreting as protein.";
// NOTE(review): the visible part of sReportError() throws when the listener
// is null, so with pErrorListener == nullptr this call would throw instead
// of reaching the return below -- confirm this is intended. Two argument
// lines of this call are missing from the listing.
688  sReportError(pErrorListener,
689  eDiag_Error,
692  seqId, 0, msg);
693 
694 
695  //impossible NA- can't contain both
696  return CSeq_inst::eMol_aa;
697  }
698  return (posFirstU == string::npos ? CSeq_inst::eMol_dna : CSeq_inst::eMol_rna);
699 }
700 
701 
702 
704  CSeq_inst::EMol mol,
705  const string& seqData) const
706 {
707  auto pSeqInst = Ref(new CSeq_inst());
708  pSeqInst->SetRepr(CSeq_inst::eRepr_raw);
709  pSeqInst->SetMol(mol);
710  pSeqInst->SetLength(seqData.size());
711  CSeq_data& data = pSeqInst->SetSeq_data();
712  if (mol == CSeq_inst::eMol_aa) {
713  data.SetIupacaa().Set(seqData);
714  } else {
715  data.SetIupacna().Set(seqData);
717  }
718  return pSeqInst;
719 }
720 
721 
// Tail of the signature and body of the Seq-entry builder (the first
// signature line is not part of this listing). Produces a pop-set entry that
// holds the alignment as an annotation plus one Bioseq per row. The result
// is cached in m_Entry. Returns an empty CRef when the preceding Read() did
// not succeed.
723  ILineErrorListener* pErrorListener)
724 {
// Cached result from an earlier call.
725  if (m_Entry) {
726  return m_Entry;
727  } else if ( !m_ReadDone ) {
// NOTE(review): the line that opens this statement is missing from the
// listing; judging by the message it raises an error when Read() has not
// been called yet -- confirm.
729  "CAlnReader::GetSeqEntry(): "
730  "Seq_entry is not available until after Read()", 0);
731  }
732 
733  if (!m_ReadSucceeded) {
734  return CRef<CSeq_entry>();
735  }
736 
737  m_Entry = new CSeq_entry();
// GetSeqAlign() also fills m_SeqVec, which the row loop below reads, so it
// has to run before that loop.
738  CRef<CSeq_align> seq_align = GetSeqAlign(fasta_flags, pErrorListener);
739 
740  CRef<CSeq_annot> seq_annot (new CSeq_annot);
741  seq_annot->SetData().SetAlign().push_back(seq_align);
742 
743  m_Entry->SetSet().SetClass(CBioseq_set::eClass_pop_set);
744  m_Entry->SetSet().SetAnnot().push_back(seq_annot);
745 
746  auto& seq_set = m_Entry->SetSet().SetSeq_set();
747 
748  typedef CDense_seg::TDim TNumrow;
749  for (TNumrow row_i = 0; row_i < m_Dim; row_i++) {
750  const string& seq_str = m_SeqVec[row_i];
751  auto pSubEntry = Ref(new CSeq_entry());
752 
753  // seq-id(s)
754  auto& ids = pSubEntry->SetSeq().SetId();
755  ids = m_Ids[row_i];
756 
757  // mol
// NOTE(review): the declaration of `mol` is missing from this listing.
// The accession type of the first ID decides the molecule type when it is
// recognisably nucleotide or protein; otherwise the residues are inspected.
759  CSeq_id::EAccessionInfo ai = ids.front()->IdentifyAccession();
760  if (ai & CSeq_id::fAcc_nuc) {
761  mol = CSeq_inst::eMol_na;
762  } else if (ai & CSeq_id::fAcc_prot) {
763  mol = CSeq_inst::eMol_aa;
764  } else {
765  const string seqId = ids.front()->AsFastaString();
766  mol = x_GetSequenceMolType(GetAlphabet(), seq_str, seqId, pErrorListener);
767  }
768  // seq-inst
769  auto pSeqInst = x_GetSeqInst(mol, seq_str);
770  pSubEntry->SetSeq().SetInst(*pSeqInst);
771  seq_set.push_back(pSubEntry);
772  }
773 
// Deflines are attached in row order; `i` walks m_DeflineInfo in step with
// seq_set. x_VerifyAlignmentInfo() fills m_DeflineInfo only when there is
// one defline per ID, which is what keeps this indexing in range.
774  if (!m_DeflineInfo.empty()) {
775  int i=0;
// With fAddMods the defline is parsed for modifiers; otherwise it is stored
// verbatim as the title.
776  if (fasta_flags & CFastaReader::fAddMods) {
777  for (auto& pSeqEntry : seq_set) {
778  x_AddMods(m_DeflineInfo[i++], pSeqEntry->SetSeq(), pErrorListener);
779  }
780  }
781  else {
782  for (auto& pSeqEntry : seq_set) {
783  x_AddTitle(m_DeflineInfo[i++].mData,
784  pSeqEntry->SetSeq());
785  }
786  }
787  }
788 
789  return m_Entry;
790 }
791 
792 
793 static void s_AppendMods(
794  const CModHandler::TModList& mods,
795  string& title
796  )
797 {
798  for (const auto& mod : mods) {
799  title.append(" ["
800  + mod.GetName()
801  + "="
802  + mod.GetValue()
803  + "]");
804  }
805 }
806 
// Parse a defline for modifiers, apply them to `bioseq`, and store whatever
// is left over as the sequence title.
//   defline_info   - defline text (mData) and its line number (mNumLine)
//   bioseq         - sequence that receives the modifiers and the title
//   pErrorListener - passed to the error reporter; also decides whether
//                    info-level messages are requested (may be null)
// A blank defline, or one that yields neither modifiers nor remaining text,
// leaves `bioseq` untouched.
807 void CAlnReader::x_AddMods(const SLineInfo& defline_info,
808  CBioseq& bioseq,
809  ILineErrorListener* pErrorListener)
810 {
811  auto defline = defline_info.mData;
812  if (NStr::IsBlank(defline)) {
813  return;
814  }
815 
// The first ID of the bioseq labels every message about this defline.
816  auto pFirstID = bioseq.GetFirstId();
817  _ASSERT(pFirstID);
818  const auto idString = pFirstID->AsFastaString();
819 
// NOTE(review): the line naming the type of errorReporter is missing from
// this listing.
821  errorReporter(idString, defline_info.mNumLine, pErrorListener);
822 
823  CModHandler::TModList mod_list;
824  string remainder;
825 
826  // Parse the defline string for modifiers
827  CTitleParser::Apply(defline, mod_list, remainder);
828  if (mod_list.empty() && NStr::IsBlank(remainder)) {
829  return;
830  }
831 
// Modifiers the handler does not accept are collected in rejected_mods.
832  CModHandler mod_handler;
833  CModHandler::TModList rejected_mods;
834  mod_handler.AddMods(mod_list, CModHandler::eAppendReplace, rejected_mods, errorReporter);
835 
836  // Apply modifiers to the bioseq
837  CModHandler::TModList skipped_mods;
// Info-level logging is requested only if a listener exists and has that
// severity enabled.
838  const bool logInfo = pErrorListener ?
839  pErrorListener->SevEnabled(eDiag_Info) :
840  false;
841 
842  CModAdder::Apply(mod_handler, bioseq, skipped_mods, logInfo, errorReporter);
843 
// Rejected and skipped modifiers are written back into the title text as
// " [name=value]" so that they are not lost.
844  s_AppendMods(rejected_mods, remainder);
845  s_AppendMods(skipped_mods, remainder);
846  // Add title string
847  NStr::TruncateSpacesInPlace(remainder);
848  x_AddTitle(remainder, bioseq);
849 }
850 
851 
852 void CAlnReader::x_AddTitle(const string& title, CBioseq& bioseq)
853 {
854  if (NStr::IsBlank(title)) {
855  return;
856  }
857  auto pDesc = Ref(new CSeqdesc());
858  pDesc->SetTitle() = title;
859  bioseq.SetDescr().Set().push_back(std::move(pDesc));
860 }
861 
862 
// Forwarding wrapper: every parameter is passed on, unchanged and in the
// same order, to a single call.
// NOTE(review): the line naming the callee is missing from this listing;
// presumably it is the defline reader's static ParseDefline with the same
// parameter list -- confirm against the real file.
863 void CAlnReader::ParseDefline(const string& defline,
864  const SDeflineParseInfo& info,
865  const TIgnoredProblems& ignoredErrors,
866  list<CRef<CSeq_id>>& ids,
867  bool& hasRange,
868  TSeqPos& rangeStart,
869  TSeqPos& rangeEnd,
870  TSeqTitles& seqTitles,
871  ILineErrorListener* pMessageListener)
872 {
874  defline,
875  info,
876  ignoredErrors,
877  ids,
878  hasRange,
879  rangeStart,
880  rangeEnd,
881  seqTitles,
882  pMessageListener);
883 }
884 
885 
887 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
END_ENUM_INFO string ErrorPrintf(const char *format,...)
Definition: aln_errors.cpp:99
thread_local unique_ptr< CAlnErrorReporter > theErrorReporter
Definition: aln_errors.cpp:40
EAlignFormat
Definition: aln_formats.hpp:37
@ UNKNOWN
Definition: aln_formats.hpp:38
USING_SCOPE(objects)
static void sReportError(ILineErrorListener *pEC, EDiagSev severity, int code, int subcode, const string &seqId, int lineNumber, const string &message, ILineError::EProblem problemType=ILineError::eProblem_GeneralParsingError)
Definition: aln_reader.cpp:267
static CAlnReader::FValidateIds s_GetMultiIdValidate(CAlnReader::FIdValidate fSingleIdValidate)
Definition: aln_reader.cpp:186
static void s_AppendMods(const CModHandler::TModList &mods, string &title)
Definition: aln_reader.cpp:793
string sAlnErrorToString(const CAlnError &error)
Definition: aln_reader.cpp:63
bool ReadAlignmentFile(istream &istr, bool gen_local_ids, bool use_nexus_info, CSequenceInfo &sequence_info, SAlignmentFile &alignmentInfo, ILineErrorListener *pErrorListener=nullptr)
Definition: alnread.cpp:78
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:69
AutoPtr –.
Definition: ncbimisc.hpp:401
CAlnErrorReporter.
void Report(int lineNumber, EDiagSev severity, EReaderCode subsystem, EAlnSubcode errorCode, const string &descr, const string &seqId="")
CAlnError(int category, int line_num, string id, string message)
Definition: aln_reader.cpp:79
int GetLineNum() const
Definition: aln_reader.hpp:80
EAlnErr GetCategory() const
Definition: aln_reader.hpp:79
const string & GetMsg() const
Definition: aln_reader.hpp:82
const string & GetID() const
Definition: aln_reader.hpp:81
string m_ID
Definition: aln_reader.hpp:91
@ eAlnErr_BadFormat
Definition: aln_reader.hpp:65
@ eAlnErr_BadData
Definition: aln_reader.hpp:64
@ eAlnErr_BadChar
Definition: aln_reader.hpp:66
@ eAlnErr_Unknown
Definition: aln_reader.hpp:61
@ eAlnErr_NoError
Definition: aln_reader.hpp:62
string m_Message
Definition: aln_reader.hpp:92
int m_LineNum
Definition: aln_reader.hpp:90
EAlnErr m_Category
Definition: aln_reader.hpp:89
class CAlnReader supports importing a large variety of text-based alignment formats into standard dat...
Definition: aln_reader.hpp:100
vector< string > m_IdStrings
Definition: aln_reader.hpp:283
bool m_ReadSucceeded
Definition: aln_reader.hpp:296
bool m_ReadDone
Definition: aln_reader.hpp:295
void ParseDefline(const string &defline, const SDeflineParseInfo &info, const TIgnoredProblems &ignoredErrors, list< CRef< objects::CSeq_id >> &ids, bool &hasRange, TSeqPos &rangeStart, TSeqPos &rangeEnd, TSeqTitles &seqTitles, objects::ILineErrorListener *pMessageListener)
Definition: aln_reader.cpp:863
void x_ParseAndValidateSeqIds(const TLineInfo &seqIdInfo, TReadFlags flags, TIdList &ids)
Definition: aln_reader.cpp:342
objects::CFastaDeflineReader::TIgnoredProblems TIgnoredProblems
Definition: aln_reader.hpp:339
objects::CSeq_inst::EMol GetSequenceMolType(const string &alphabet, const string &seqData, objects::ILineErrorListener *pErrorListener=nullptr)
Get a sequence's moltype, also considering the alphabet used to read it.
Definition: aln_reader.cpp:643
void x_CalculateMiddleSections()
Definition: aln_reader.cpp:433
virtual ~CAlnReader(void)
Definition: aln_reader.cpp:886
objects::CFastaDeflineReader::SDeflineParseInfo SDeflineParseInfo
Definition: aln_reader.hpp:338
void SetPaup(EAlphabet alpha)
Definition: aln_reader.cpp:251
vector< string > m_SeqVec
Definition: aln_reader.hpp:300
void Read(bool guess, bool generate_local_ids=false, objects::ILineErrorListener *pErrorListener=nullptr)
TAlignMiddles m_MiddleSections
Definition: aln_reader.hpp:312
function< void(const list< CRef< objects::CSeq_id > > &, int, objects::CAlnErrorReporter *)> FValidateIds
Definition: aln_reader.hpp:152
static string GetAlphabetLetters(EAlphabet)
Definition: aln_reader.cpp:207
vector< TSeqPos > m_SeqLen
Definition: aln_reader.hpp:301
objects::CSeq_inst::EMol x_GetSequenceMolType(const string &alphabet, const string &seqData, const string &seqId="", objects::ILineErrorListener *pErrorListener=nullptr)
Definition: aln_reader.cpp:654
const string & GetMiddleGap(void) const
Definition: aln_reader.hpp:392
bool m_UseNexusInfo
Definition: aln_reader.hpp:303
int TReadFlags
binary OR of EReadFlags
Definition: aln_reader.hpp:212
objects::CDense_seg::TDim TNumrow
Definition: aln_reader.hpp:314
const string & GetAlphabet(void) const
Definition: aln_reader.hpp:364
pair< TSeqPos, TSeqPos > TAlignMiddleInterval
characters have different contexts, depending on whether they are before the first non-gap character,...
Definition: aln_reader.hpp:310
FValidateIds m_fValidateIds
Definition: aln_reader.hpp:289
objects::CFastaDeflineReader::TFastaFlags TFastaFlags
Definition: aln_reader.hpp:236
CRef< objects::CSeq_inst > x_GetSeqInst(objects::CSeq_inst::EMol mol, const string &seqData) const
Definition: aln_reader.cpp:703
void SetClustal(EAlphabet alpha)
Definition: aln_reader.cpp:244
vector< TIdList > m_Ids
Definition: aln_reader.hpp:284
void x_AddMods(const TLineInfo &defline_info, objects::CBioseq &bioseq, objects::ILineErrorListener *pErrorListener)
Definition: aln_reader.cpp:807
function< void(const objects::CSeq_id &, int, objects::CAlnErrorReporter *)> FIdValidate
Definition: aln_reader.hpp:157
ncbi::objects::CSequenceInfo mSequenceInfo
Definition: aln_reader.hpp:274
CRef< objects::CSeq_align > GetSeqAlign(TFastaFlags fasta_flags=0, objects::ILineErrorListener *pErrorListener=nullptr)
Create ASN.1 classes from the parsed alignment.
Definition: aln_reader.cpp:516
EAlignFormat m_AlignFormat
Definition: aln_reader.hpp:290
TErrorList m_Errors
Definition: aln_reader.hpp:302
CNcbiIstream & m_IS
Other internal data.
Definition: aln_reader.hpp:294
vector< string > m_Seqs
Definition: aln_reader.hpp:285
CRef< objects::CSeq_entry > m_Entry
Definition: aln_reader.hpp:299
const string & GetEndGap(void) const
Definition: aln_reader.hpp:406
bool x_IsGap(TNumrow row, TSeqPos pos, const string &residue)
Definition: aln_reader.cpp:456
const string & GetMissing(void) const
Definition: aln_reader.hpp:191
objects::CFastaDeflineReader::TSeqTitles TSeqTitles
Definition: aln_reader.hpp:337
void SetPhylip(EAlphabet alpha)
Definition: aln_reader.cpp:258
void SetAlphabet(const string &value)
Definition: aln_reader.hpp:371
virtual CRef< objects::CSeq_id > GenerateID(const string &fasta_defline, const TSeqPos &line_number, TFastaFlags fasta_flags)
Definition: aln_reader.cpp:485
CRef< objects::CSeq_align > m_Aln
Definition: aln_reader.hpp:298
void x_VerifyAlignmentInfo(const ncbi::objects::SAlignmentFile &, TReadFlags readFlags)
Definition: aln_reader.cpp:372
void x_AssignDensegIds(TFastaFlags fasta_flags, objects::CDense_seg &denseg)
Definition: aln_reader.cpp:496
CRef< objects::CSeq_entry > GetSeqEntry(TFastaFlags fasta_flags=objects::CFastaReader::fAddMods, objects::ILineErrorListener *pErrorListener=nullptr)
Definition: aln_reader.cpp:722
void SetAllGap(const string &value)
Convenience function for setting beginning, middle, and end gap to the same thing.
Definition: aln_reader.hpp:433
void x_AddTitle(const string &defline, objects::CBioseq &bioseq)
Definition: aln_reader.cpp:852
list< CRef< objects::CSeq_id > > TIdList
Parsed result data (analogous to SAlignmentFile) Seqs are upper-case strings representing the sequenc...
Definition: aln_reader.hpp:282
vector< TLineInfo > m_DeflineInfo
Definition: aln_reader.hpp:288
void SetFastaGap(EAlphabet alpha)
Alternative & easy way to choose alphabet, etc.
Definition: aln_reader.cpp:237
const string & GetBeginningGap(void) const
Definition: aln_reader.hpp:378
CAlnReader(CNcbiIstream &is, FValidateIds fIdValidate=nullptr)
Definition: aln_reader.cpp:171
const CSeq_id * GetFirstId() const
Definition: Bioseq.cpp:271
CAlnErrorReporter * m_pErrorReporter
Definition: aln_reader.cpp:145
CDefaultIdErrorReporter(CAlnErrorReporter *pErrorReporter)
Definition: aln_reader.cpp:124
void operator()(EDiagSev severity, int lineNum, const string &idString, CFastaIdValidate::EErrCode, const string &msg)
Definition: aln_reader.cpp:129
void operator()(const TIds &ids, int lineNum, CAlnErrorReporter *pErrorReporter)
Definition: aln_reader.cpp:162
list< CRef< CSeq_id > > TIds
Definition: aln_reader.cpp:152
CFastaIdValidate m_FastaIdValidate
Definition: aln_reader.cpp:158
static void ParseDefline(const CTempString &defline, const SDeflineParseInfo &info, const TIgnoredProblems &ignoredErrors, TIds &ids, bool &hasRange, TSeqPos &rangeStart, TSeqPos &rangeEnd, TSeqTitles &seqTitles, ILineErrorListener *pMessageListener)
static ESequenceType SequenceType(const char *str, unsigned length=0, ESTStrictness strictness=eST_Default)
Guess sequence type.
static CLineErrorEx * Create(EProblem eProblem, EDiagSev eSeverity, int code, int subcode, const std::string &strSeqId, unsigned int uLine, const std::string &strErrorMessage=string(""), const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), const TVecOfLines &vecOfOtherLines=TVecOfLines())
Use this because the constructor is protected.
Definition: line_error.cpp:103
static void Apply(const CModHandler &mod_handler, CBioseq &bioseq, TSkippedMods &skipped_mods, FPostMessage fPostMessage=nullptr)
Definition: mod_reader.cpp:450
list< CModData > TModList
Definition: mod_reader.hpp:94
void AddMods(const TModList &mods, EHandleExisting handle_existing, TModList &rejected_mods, FReportError fReportError=nullptr)
Definition: mod_reader.cpp:221
Definition: Seq_entry.hpp:56
static TSeqPos Pack(CSeq_data *in_seq, TSeqPos uLength=ncbi::numeric_limits< TSeqPos >::max())
static void Apply(const CTempString &title, TModList &mods, string &remainder)
Definition: mod_reader.cpp:754
virtual bool PutError(const ILineError &)=0
Store error in the container, and return true if error was stored fine, and return false if the calle...
@ eProblem_GeneralParsingError
Definition: line_error.hpp:106
virtual bool SevEnabled(EDiagSev severity) const
Definition: listener.cpp:43
vector< string > mSequences
Definition: alnread.hpp:145
vector< TLineInfo > mIds
Definition: alnread.hpp:144
size_t NumDeflines() const
Definition: alnread.hpp:136
size_t NumSequences() const
Definition: alnread.hpp:139
vector< TLineInfo > mDeflines
Definition: alnread.hpp:146
Definition: map.hpp:338
static uch flags
Operators to edit gaps in sequences.
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static unsigned int line_num
Definition: attributes.c:11
char data[12]
Definition: iconv.c:80
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
EDiagSev
Severity level for the posted diagnostics.
Definition: ncbidiag.hpp:650
@ eDiag_Info
Informational message.
Definition: ncbidiag.hpp:651
@ eDiag_Error
Error message.
Definition: ncbidiag.hpp:653
#define NCBI_THROW2(exception_class, err_code, message, extra)
Throw exception with extra parameter.
Definition: ncbiexpt.hpp:1754
#define FORMAT(message)
Format message using iostreams library.
Definition: ncbiexpt.hpp:672
@ fAddMods
Parse defline mods and add to SeqEntry.
Definition: fasta.hpp:104
static SIZE_TYPE ParseIDs(CBioseq::TId &ids, const CTempString &s, TParseFlags flags=fParse_Default)
Parse a string representing one or more Seq-ids, appending the results to IDS.
Definition: Seq_id.cpp:2613
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
static int BestRank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:774
int TParseFlags
Definition: Seq_id.hpp:104
@ fAcc_prot
Definition: Seq_id.hpp:252
@ fAcc_nuc
Definition: Seq_id.hpp:251
@ fParse_RawText
Try to ID raw non-numeric accessions.
Definition: Seq_id.hpp:81
@ fParse_AnyLocal
Treat otherwise unidentified strings as local accessions as long as they don't resemble FASTA-style I...
Definition: Seq_id.hpp:90
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3201
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
static string & ToUpper(string &str)
Convert string to upper case – string& version.
Definition: ncbistr.cpp:424
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3186
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
Definition: ncbiutil.hpp:250
TLens & SetLens(void)
Assign a value to Lens data member.
Definition: Dense_seg_.hpp:561
vector< TSeqPos > TLens
Definition: Dense_seg_.hpp:108
vector< TSignedSeqPos > TStarts
Definition: Dense_seg_.hpp:107
void SetDim(TDim value)
Assign a value to Dim data member.
Definition: Dense_seg_.hpp:427
vector< CRef< CSeq_id > > TIds
Definition: Dense_seg_.hpp:106
TStarts & SetStarts(void)
Assign a value to Starts data member.
Definition: Dense_seg_.hpp:536
void SetNumseg(TNumseg value)
Assign a value to Numseg data member.
Definition: Dense_seg_.hpp:474
TIds & SetIds(void)
Assign a value to Ids data member.
Definition: Dense_seg_.hpp:511
@ e_Local
local use
Definition: Seq_id_.hpp:95
@ eClass_pop_set
population study
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
EMol
molecule class in living organism
Definition: Seq_inst_.hpp:108
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eMol_not_set
> cdna = rna
Definition: Seq_inst_.hpp:109
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
Definition of all error codes used in objtools libraries.
int i
static MDB_envinfo info
Definition: mdb_load.c:37
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::SIZE size
Int mod(Int i, Int j)
Definition: njn_integer.hpp:67
@ eAlnSubcode_BadSequenceCount
@ eAlnSubcode_IllegalSequenceId
@ eAlnSubcode_InconsistentMolType
@ eAlnSubcode_InsufficientDeflineInfo
@ eReader_Alignment
#define row(bind, expected)
Definition: string_bind.c:73
int mNumLine
Definition: alnread.hpp:53
string mData
Definition: alnread.hpp:52
SShowStopper.
Definition: inftrees.h:24
#define _ASSERT
Modified on Wed May 22 11:29:09 2024 by modify_doxy.py rev. 669887