NCBI C++ ToolKit
fasta.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: fasta.cpp 101956 2024-03-08 17:00:41Z gotvyans $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Aaron Ucko, NCBI
27 *
28 * File Description:
29 * Reader for FASTA-format sequences. (The writer is CFastaOStream, in
30 * src/objmgr/util/sequence.cpp.)
31 *
32 * ===========================================================================
33 */
34 
35 #include <ncbi_pch.hpp>
36 
38 #include "fasta_aln_builder.hpp"
41 #include <objtools/readers/source_mod_parser.hpp>
42 #include <objtools/error_codes.hpp>
43 
44 #include <corelib/ncbiutil.hpp>
45 #include <util/format_guess.hpp>
47 
50 
52 
53 #include <objects/seq/Bioseq.hpp>
56 #include <objects/seq/NCBIeaa.hpp>
57 #include <objects/seq/IUPACaa.hpp>
58 #include <objects/seq/IUPACna.hpp>
59 #include <objects/seq/Seg_ext.hpp>
62 #include <objects/seq/Seq_ext.hpp>
63 #include <objects/seq/Seq_hist.hpp>
64 #include <objects/seq/Seq_inst.hpp>
66 #include <objects/seq/Seqdesc.hpp>
68 
71 
77 
80 
85 
86 #include <ctype.h>
87 
// Formats a message from stream-insertion operands and reports it through
// PostWarning() together with severity, line number, error code, problem
// kind, feature name and qualifier name/value.  The expansion site must have
// `pMessageListener` in scope.
// The "49518053" suffix of the local stream's name is an arbitrary number
// with no meaning of its own; it only makes a clash with an identifier used
// by the caller unlikely.
#define FASTA_LINE_EXPT(_eSeverity, _uLineNum, _MessageStrmOps, _eErrCode, \
                        _eProblem, _sFeature, _sQualName, _sQualValue)     \
    do {                                                                    \
        stringstream err_strm_49518053;                                     \
        err_strm_49518053 << _MessageStrmOps;                               \
        PostWarning(pMessageListener, (_eSeverity), (_uLineNum),            \
                    (err_strm_49518053.str()), (_eErrCode), (_eProblem),    \
                    (_sFeature), (_sQualName), (_sQualValue));              \
    } while(0)
97 
// Formats a progress message from stream-insertion operands and, when
// `pMessageListener` (which must be in scope at the expansion site) is
// non-null, hands the text to its PutProgress().  The message is formatted
// even when no listener is present.
// The "49518053" suffix of the local stream's name is an arbitrary number
// with no meaning of its own; it only makes a clash with an identifier used
// by the caller unlikely.
#define FASTA_PROGRESS(_MessageStrmOps)                                 \
    do {                                                                \
        stringstream err_strm_49518053;                                 \
        err_strm_49518053 << _MessageStrmOps;                           \
        if (pMessageListener) {                                         \
            pMessageListener->PutProgress(err_strm_49518053.str());     \
        }                                                               \
    } while(false)
109 
110 
// Warning-severity shorthand for FASTA_LINE_EXPT: the error code is fixed to
// CObjReaderParseException::eFormat and both qualifier strings are empty.
#define FASTA_WARNING(_uLineNum, _MessageStrmOps, _eProblem, _Feature)  \
    FASTA_LINE_EXPT(eDiag_Warning, _uLineNum, _MessageStrmOps,          \
                    CObjReaderParseException::eFormat, _eProblem,       \
                    _Feature, kEmptyStr, kEmptyStr)
113 
// Warning-severity shorthand for FASTA_LINE_EXPT that also passes a
// qualifier name and value; the error code is fixed to
// CObjReaderParseException::eFormat.
#define FASTA_WARNING_EX(_uLineNum, _MessageStrmOps, _eProblem, _Feature, \
                         _sQualName, _sQualValue)                         \
    FASTA_LINE_EXPT(eDiag_Warning, _uLineNum, _MessageStrmOps,            \
                    CObjReaderParseException::eFormat, _eProblem,         \
                    _Feature, _sQualName, _sQualValue)
116 
// Error-severity shorthand for FASTA_LINE_EXPT: the caller picks the error
// code, the problem kind is always ILineError::eProblem_GeneralParsingError,
// and the feature and qualifier strings are empty.
#define FASTA_ERROR(_uLineNum, _MessageStrmOps, _eErrCode)               \
    FASTA_LINE_EXPT(eDiag_Error, _uLineNum, _MessageStrmOps, _eErrCode,  \
                    ILineError::eProblem_GeneralParsingError,            \
                    kEmptyStr, kEmptyStr, kEmptyStr)
119 
// Names the error-code module for this file.
// NOTE(review): presumably consumed by the toolkit's *_X diagnostic macros;
// nothing in the visible part of this file uses it directly -- confirm.
120 #define NCBI_USE_ERRCODE_X Objtools_Rd_Fasta
121 
124 
// Scope guard for a stack-like container: the constructor pushes `v` onto
// `s`, and the destructor pops the top element again after asserting that
// the stack is not empty.
// NOTE(review): the class-head line that belongs between the template header
// and the opening brace is missing from this copy of the file.
125 template <typename TStack>
127 {
128 public:
129  typedef typename TStack::value_type TValue;
130  CTempPusher(TStack& s, const TValue& v) : m_Stack(s) { s.push(v); }
131  ~CTempPusher() { _ASSERT( !m_Stack.empty() ); m_Stack.pop(); }
132 
133 private:
134  TStack& m_Stack; // the guarded stack; held by reference, not owned
135 };
136 
137 // temporarily swap two CRef's, then swap them back again on destruction
138 // (RAII)
// NOTE(review): in this copy of the file the class head, the constructor and
// destructor declarators, the swap statements and the member declarations
// are missing; only the fragments below survive.  What remains shows a
// constructor taking two CRef<TObject> references and storing them in
// m_pObj1 / m_pObj2.
139 template<class TObject>
141 public:
143  CRef<TObject> & pObj1,
144  CRef<TObject> & pObj2 ) :
145  m_pObj1(pObj1),
146  m_pObj2(pObj2)
147  {
149  }
150 
152  {
153  // swap back when done
155  }
156 
157 private:
160 };
161 
163 
164 // The FASTA reader uses these heavily, but the standard versions
165 // aren't inlined on as many configurations as one might hope, and we
166 // don't necessarily want locale-dependent behavior anyway.
167 
/// True exactly for the ASCII capitals 'A'..'Z'; never consults the locale.
inline bool s_ASCII_IsUpper(unsigned char c)
{
    // Distance from 'A', wrapped into 0..255: anything below 'A' wraps to a
    // large value, so a single comparison covers both bounds.
    const unsigned char offset_from_A = static_cast<unsigned char>(c - 'A');
    return offset_from_A < 26;
}
172 
/// True exactly for the ASCII small letters 'a'..'z'; never consults the
/// locale.
inline bool s_ASCII_IsLower(unsigned char c)
{
    // Distance from 'a', wrapped into 0..255: anything below 'a' wraps to a
    // large value, so a single comparison covers both bounds.
    const unsigned char offset_from_a = static_cast<unsigned char>(c - 'a');
    return offset_from_a < 26;
}
177 
/// True exactly for the 52 ASCII letters ('A'..'Z' and 'a'..'z'); never
/// consults the locale.
inline bool s_ASCII_IsAlpha(unsigned char c)
{
    if (c >= 'a') {
        return c <= 'z';
    }
    return c >= 'A' && c <= 'Z';
}
182 
/// Converts an ASCII lowercase letter to its uppercase counterpart.
/// Precondition: `c` is in 'a'..'z'.  No check is made; any other input is
/// simply shifted down by the same fixed distance.
inline unsigned char s_ASCII_MustBeLowerToUpper(unsigned char c)
{
    const int case_distance = 'a' - 'A';
    return static_cast<unsigned char>(c - case_distance);
}
188 
/// True when `c`, in either case, is one of the nucleotide letters
/// B D H K M N R S U V W Y.
inline bool s_ASCII_IsAmbigNuc(unsigned char c)
{
    // Clearing bit 0x20 maps every ASCII lowercase letter onto its uppercase
    // twin and can never turn a non-letter into a letter, so one set of
    // uppercase labels covers both cases.
    switch (c & 0xDF) {
    case 'B': case 'D': case 'H': case 'K':
    case 'M': case 'N': case 'R': case 'S':
    case 'U': case 'V': case 'W': case 'Y':
        return true;
    }
    return false;
}
209 
/// True when `c`, in either case, is one of the four letters A, C, G, T.
inline static bool s_ASCII_IsUnAmbigNuc(unsigned char c)
{
    // Clearing bit 0x20 folds ASCII lowercase letters onto uppercase and
    // cannot turn a non-letter into a letter.
    const unsigned char folded = static_cast<unsigned char>(c & 0xDF);
    return folded == 'A' || folded == 'C' || folded == 'G' || folded == 'T';
}
220 
// Translates flag names into CFastaReader flag bits.  A name -> flag table
// is built once (function-local static) and passed, together with the
// caller's list of names and the flag word to update, to
// CReaderBase::xAddStringFlagsWithMap().
// NOTE(review): the declarator line naming this function is missing from
// this copy of the file.  How unrecognised names are treated is decided
// inside xAddStringFlagsWithMap and cannot be told from here.
221 void
223  const list<string>& stringFlags,
224  TFlags& baseFlags)
225 {
226  static const map<string, CFastaReader::TReaderFlags> flagsMap = {
227  { "AssumeNuc", CFastaReader::fAssumeNuc},
228  { "AssumeProt", CFastaReader::fAssumeProt},
229  { "ForceType", CFastaReader::fForceType},
230  { "NoParseID", CFastaReader::fNoParseID},
231  { "ParseGaps", CFastaReader::fParseGaps},
232  { "OneSeq", CFastaReader::fOneSeq},
233  { "NoSeqData", CFastaReader::fNoSeqData},
234  { "RequireID", CFastaReader::fRequireID},
235  { "DLOptional", CFastaReader::fDLOptional},
236  { "ParseRawID", CFastaReader::fParseRawID},
237  { "SkipCheck", CFastaReader::fSkipCheck},
238  { "NoSplit", CFastaReader::fNoSplit},
239  { "Validate", CFastaReader::fValidate},
240  { "UniqueIDs", CFastaReader::fUniqueIDs},
241  { "StrictGuess", CFastaReader::fStrictGuess},
242  { "LaxGuess", CFastaReader::fLaxGuess},
243  { "AddMods", CFastaReader::fAddMods},
244  { "LetterGaps", CFastaReader::fLetterGaps},
245  { "NoUserObjs", CFastaReader::fNoUserObjs},
246  { "LeaveAsText", CFastaReader::fLeaveAsText},
247  { "QuickIDCheck", CFastaReader::fQuickIDCheck},
248  { "UseIupacaa", CFastaReader::fUseIupacaa},
249  { "HyphensIgnoreAndWarn", CFastaReader::fHyphensIgnoreAndWarn},
250  { "DisableNoResidues", CFastaReader::fDisableNoResidues},
251  { "DisableParseRange", CFastaReader::fDisableParseRange},
252  { "IgnoreMods", CFastaReader::fIgnoreMods}
253  };
254 
255  return CReaderBase::xAddStringFlagsWithMap(stringFlags, flagsMap, baseFlags);
256 };
257 
258 
// Constructor tail for a reader bound to an existing line reader: keeps a
// reference to `reader` in m_LineReader, starts with no mask vector, zero
// gap thresholds, an ID-length limit of kMax_UI4 and the supplied ID-check
// callback, then seeds the flag stack with `flags`.
// NOTE(review): the declarator line and one statement of the body (listing
// line 266) are missing from this copy of the file.
260  : m_LineReader(&reader), m_MaskVec(0),
261  m_gapNmin(0), m_gap_Unknown_length(0),
262  m_MaxIDLength(kMax_UI4),
263  m_fIdCheck(f_idcheck)
264 {
265  m_Flags.push(flags);
267 }
268 
// Constructor tail that wraps `in` with ILineReader::New() and delegates to
// the line-reader constructor with the same flags and ID-check callback.
// NOTE(review): the declarator line is missing from this copy of the file.
270  : CFastaReader(*(ILineReader::New(in)), flags, f_idcheck) {}
271 
272 CFastaReader::CFastaReader(const string& path, TFlags flags, FIdCheck f_idcheck)
273  : CFastaReader(*(ILineReader::New(path)), flags, f_idcheck) {}
274 
// Constructor tail for a reader created without a line reader: initialises
// the CReaderBase base with `fBaseFlags`, starts with no mask vector, zero
// gap thresholds, an ID-length limit of kMax_UI4 and the supplied ID-check
// callback, then seeds the flag stack with `flags`.
// NOTE(review): the declarator line and one statement of the body (listing
// line 282) are missing from this copy of the file.
276  : CReaderBase(fBaseFlags), m_MaskVec(0),
277  m_gapNmin(0), m_gap_Unknown_length(0),
278  m_MaxIDLength(kMax_UI4),
279  m_fIdCheck(f_idcheck)
280 {
281  m_Flags.push(flags);
283 }
284 
// Asserts that exactly one entry is left on the flag stack, i.e. every
// temporary push made after construction has been popped again.
// NOTE(review): the declarator line is missing from this copy of the file;
// this looks like the destructor -- confirm.
286 {
287  _ASSERT(m_Flags.size() == 1);
288 }
289 
290 void CFastaReader::SetMinGaps(TSeqPos gapNmin, TSeqPos gap_Unknown_length)
291 {
292  m_gapNmin = gapNmin; m_gap_Unknown_length = gap_Unknown_length;
293 }
294 
// Reads through ReadSeqEntry() and hands the result back re-wrapped as a
// CRef<CSerialObject>.
// NOTE(review): the declarator line is missing from this copy of the file.
297 {
298  CRef<CSerialObject> object(
299  ReadSeqEntry( lr, pMessageListener ).ReleaseOrNull() );
300  return object;
301 }
302 
// Reads from `lr` instead of the reader's own line reader: `lr` is swapped
// into m_LineReader for the duration of the call (CTempRefSwap is described
// at its definition as swapping back on destruction), and ReadSet() is
// called with a sequence limit of kMax_Int.
// NOTE(review): the declarator line is missing from this copy of the file.
305 {
306  CRef<ILineReader> pTempLineReader( &lr );
307  CTempRefSwap<ILineReader> tempRefSwap(m_LineReader, pTempLineReader);
308 
309  return ReadSet(kMax_Int, pMessageListener);
310 }
311 
// Reads one FASTA record from the current line reader and returns it as a
// Seq-entry wrapping m_CurrentSeq.
//
// Visible flow: per-sequence state is reset; lines are consumed until EOF or
// until a second defline is met; each data line goes to ParseDataLine()
// unless fNoSeqData is set; bad-residue exceptions are collected across all
// lines and rethrown once as a single CBadResiduesException; finally
// AssembleSeq() is called and the entry is built.
//
// NOTE(review): the declarator line and several statements are missing from
// this copy of the file (listing lines 314, 320, 325, 327, 333, 345, 347,
// 362, 369, 376, 392-393, 429, 431); the comments below describe only what
// is still visible.
313 {
315  // m_CurrentMask.Reset();
316  m_SeqData.erase();
317  m_Gaps.clear();
318  m_CurrentPos = 0;
319  m_BestID.Reset();
// Mask and offset state is only reset when not inside a segmented set.
321  if ( !TestFlag(fInSegSet) ) {
322  if (m_MaskVec && m_NextMask.IsNull()) {
323  m_MaskVec->push_back(SaveMask());
324  }
326  if (m_CurrentMask) {
328  }
329  m_NextMask.Reset();
330  m_SegmentBase = 0;
331  m_Offset = 0;
332  }
334  m_CurrentGapChar = '\0';
335  m_CurrentSeqTitles.clear();
336 
337  bool need_defline = true;
338  CBadResiduesException::SBadResiduePositions bad_residue_positions;
339  while ( !GetLineReader().AtEOF() ) {
340  char c = GetLineReader().PeekChar();
// Emit a progress message every 10000 lines.
341  if( LineNumber() % 10000 == 0 && LineNumber() != 0 ) {
342  FASTA_PROGRESS("Processing line " << LineNumber());
343  }
344  if (GetLineReader().AtEOF()) {
346  "CFastaReader: Unexpected end-of-file around line " << LineNumber(),
348  break;
349  }
350  if (c == '>' ) {
351  CTempString next_line = *++GetLineReader();
// A line starting with ">?_" is rewritten to start with ">" (the "?_" is
// dropped); `strmodified` owns the rewritten text that next_line then views.
352  string strmodified;
353  if( NStr::StartsWith(next_line, ">?_") ) {
354  CTempString tmp = next_line.substr(3);
355  strmodified = ">";
356  strmodified.append(tmp.data(), tmp.length());
357  next_line = strmodified;
358  }
359  if( NStr::StartsWith(next_line, ">?") ) {
360  // This is actually a data line. an assembly gap, in particular, which
361  // we handle farther below
363  } else {
// First defline of the record: parse it and move on to the next line.
364  if(need_defline) {
365  ParseDefLine(next_line, pMessageListener);
366  need_defline = false;
367  continue;
368  } else {
370  // start of the next sequence
371  break;
372  }
373  }
374  }
375 
// NOTE(review): the statement that obtains `line` is missing here.
377 
378  if (line.empty()) {
379  continue; // ignore lines containing only whitespace
380  }
381  c = line[0];
382 
383  if (c == '!' || c == '#' || c == ';') {
384  // no content, just a comment or blank line
385  continue;
386  } else if (need_defline) {
// Data met before any defline: with fDLOptional an empty defline (">") is
// parsed in its place; otherwise the (partly missing) code below reports
// the problem.
387  if (TestFlag(fDLOptional)) {
388  ParseDefLine(">", pMessageListener);
389  need_defline = false;
390  } else {
391  const auto lineNum = LineNumber();
394  "CFastaReader: Input doesn't start with"
395  " a defline or comment around line " + NStr::NumericToString(lineNum),
396  lineNum);
397  }
398  }
399 
400  if ( !TestFlag(fNoSeqData) ) {
401  try {
402  string strmodified;
403  if( NStr::StartsWith(line, ">?_") ) {
404  CTempString tmp = line.substr(3);
405  strmodified = ">";
406  strmodified.append(tmp.data(), tmp.length());
407  line = strmodified;
408  }
409  ParseDataLine(line, pMessageListener);
410  } catch(const CBadResiduesException & e) {
411  // we have to catch this exception so we can build up
412  // information on all lines, not just the first line
413  // with a bad residue
414  bad_residue_positions.m_SeqId = e.GetBadResiduePositions().m_SeqId;
415  bad_residue_positions.AddBadIndexMap(e.GetBadResiduePositions().m_BadIndexMap);
416  }
417  }
418  }
419 
420  if( ! bad_residue_positions.m_BadIndexMap.empty() ) {
421  // bad residues unconditionally throws, for now.
422  // (not worth the refactoring at this time)
423  NCBI_THROW2(CBadResiduesException, eBadResidues,
424  "CFastaReader: There are invalid " + x_NucOrProt() + "residue(s) in input sequence",
425  bad_residue_positions );
426  }
427 
428  if (need_defline && GetLineReader().AtEOF()) {
430  "CFastaReader: Expected defline around line " << LineNumber(),
432  }
433 
434  AssembleSeq(pMessageListener);
435  CRef<CSeq_entry> entry(new CSeq_entry);
436  entry->SetSeq(*m_CurrentSeq);
437 
438  entry->Parentize();
439  return entry;
440 }
441 
// Reads up to `max_seqs` records with ReadOneSeq() and returns them.
//  - fOneSeq forces max_seqs to 1.
//  - When max_seqs is 1 the single record is returned directly.
//  - Otherwise non-empty records are appended to a Seq-set; if that set ends
//    up holding exactly one member, that member is returned instead of the
//    set, else the set is parentized and returned.
// A CObjReaderParseException from ReadOneSeq() either ends the loop or is
// rethrown.
// NOTE(review): the declarator line and the condition that chooses between
// `break` and `throw` (listing line 457) are missing from this copy.
443 {
444  CRef<CSeq_entry> entry(new CSeq_entry);
445  if (TestFlag(fOneSeq)) {
446  max_seqs = 1;
447  }
448  for (int i = 0; i < max_seqs && !GetLineReader().AtEOF(); ++i) {
449  try {
450  CRef<CSeq_entry> entry2(ReadOneSeq(pMessageListener));
451  if (max_seqs == 1) {
452  return entry2;
453  }
454  if (entry2.NotEmpty())
455  entry->SetSet().SetSeq_set().push_back(entry2);
456  } catch (const CObjReaderParseException& e) {
458  break;
459  } else {
460  throw;
461  }
462  }
463  }
464 
465 
466  if (entry->IsSet() && entry->GetSet().GetSeq_set().size() == 1) {
467  return entry->SetSet().SetSeq_set().front();
468  } else {
469  entry->Parentize();
470  return entry;
471  }
472 }
473 
475 {
477  return m_NextMask;
478 }
479 
481 {
483 }
484 
486 {
491 }
492 
493 
494 // For reasons of efficiency, this method does not use
495 // CRef<CSeq_interval> to access range information - RW-26
// Out-parameter form of defline parsing: the parse result held in `data` is
// unpacked into `ids`, `hasRange`, `rangeStart`, `rangeEnd` and `seqTitles`
// (ids and titles are moved out of `data`).  The ignored-problems argument
// is unused.
// NOTE(review): the declaration of `data` and the head of the call that
// fills it (listing lines 506-507) are missing from this copy of the file.
496 void CFastaReader::ParseDefLine(const TStr& defLine,
497  const SDefLineParseInfo& info,
498  const TIgnoredProblems& /*ignoredErrors*/,
499  list<CRef<CSeq_id>>& ids,
500  bool& hasRange,
501  TSeqPos& rangeStart,
502  TSeqPos& rangeEnd,
503  TSeqTitles& seqTitles,
504  ILineErrorListener* pMessageListener)
505 {
508  defLine,
509  info,
510  data,
511  pMessageListener);
512 
513  ids = std::move(data.ids);
514  hasRange = data.has_range;
515  rangeStart = data.range_start;
516  rangeEnd = data.range_end;
517  seqTitles = std::move(data.titles);
518 }
519 
520 
// Builds a partial, two-row, single-segment dense-seg alignment that relates
// `new_id` (row 0, starting at 0) to `old_id` (row 1, starting at
// range_start).
//  - range_start > range_end: the segment length is
//    range_start + 1 - range_end and the strands are set to plus / minus.
//  - otherwise: the segment length is range_end + 1 - range_start and no
//    strands are set.
// NOTE(review): the declarator line (which presumably also declares
// `old_id`) is missing from this copy of the file.
522  CRef<CSeq_id> new_id,
523  TSeqPos range_start,
524  TSeqPos range_end)
525 {
526  CRef<CSeq_align> align(new CSeq_align());
527  align->SetType(CSeq_align::eType_partial); // ?
528  align->SetDim(2);
529  CDense_seg& denseg = align->SetSegs().SetDenseg();
530  denseg.SetNumseg(1);
531  denseg.SetDim(2); // redundant, but required by validator
532  denseg.SetIds().push_back(new_id);
533  denseg.SetIds().push_back(old_id);
534  denseg.SetStarts().push_back(0);
535  denseg.SetStarts().push_back(range_start);
536  if (range_start > range_end) { // negative strand
537  denseg.SetLens().push_back(range_start + 1 - range_end);
538  denseg.SetStrands().push_back(eNa_strand_plus);
539  denseg.SetStrands().push_back(eNa_strand_minus);
540  } else {
541  denseg.SetLens().push_back(range_end + 1 - range_start);
542  }
543 
544  return align;
545 }
546 
547 
// Derives the molecule type from the accession class of the given IDs.  The
// IDs are checked in order; the first one whose IdentifyAccession() result
// carries fAcc_nuc sets `mol` to eMol_na, the first one carrying fAcc_prot
// sets it to eMol_aa, and the function returns true.  Returns false, leaving
// `mol` untouched, when no ID carries either bit.
// NOTE(review): the declarator line is missing from this copy of the file.
549 
550  for (auto pId : ids) {
551  const CSeq_id::EAccessionInfo acc_info = pId->IdentifyAccession();
552  if (acc_info & CSeq_id::fAcc_nuc) {
553  mol = CSeq_inst::eMol_na;
554  return true;
555  }
556 
557  if (acc_info & CSeq_id::fAcc_prot) {
558  mol = CSeq_inst::eMol_aa;
559  return true;
560  }
561  }
562  return false;
563 }
564 
565 
// Parses the defline `s` for the sequence being built.
// Visible steps:
//  1. fill the parse settings and run CFastaDeflineReader::ParseDefline();
//  2. take over the titles found;
//  3. with no IDs and fRequireID set, report the problem (the reporting call
//     is partly missing); with IDs and fForceType unset, set the molecule
//     type from the IDs when xSetSeqMol() can tell;
//  4. PostProcessIDs() installs the IDs and handles a range;
//  5. report progress with the best ID;
//  6. unless fNoUserObjs is set, keep the raw defline in a "CFastaReader"
//     User-object descriptor;
//  7. with fUniqueIDs set, check each ID against m_IDHandler's cache and
//     report duplicates.
// NOTE(review): listing lines 571, 579, 581, 593, 607, 609 and 612 are
// missing from this copy of the file (among them the declaration of `data`
// and of `h`).
566 void CFastaReader::ParseDefLine(const TStr& s, ILineErrorListener * pMessageListener)
567 {
568  SDefLineParseInfo parseInfo;
569  x_SetDeflineParseInfo(parseInfo);
570 
572  CFastaDeflineReader::ParseDefline(s, parseInfo, data, pMessageListener, m_fIdCheck);
573 
574  m_CurrentSeqTitles = std::move(data.titles);
575 
576  if (data.ids.empty()) {
577  if (TestFlag(fRequireID)) {
578  // No [usable] IDs
580  "CFastaReader: Defline lacks a proper ID around line " << LineNumber(),
582  }
583  }
584  else if (!TestFlag(fForceType)) {
585  CSeq_inst::EMol mol;
586  if (xSetSeqMol(data.ids, mol)) {
587  m_CurrentSeq->SetInst().SetMol(mol);
588  }
589  }
590 
591  PostProcessIDs(data.ids, s, data.has_range, data.range_start, data.range_end);
592 
594  FASTA_PROGRESS("Processing Seq-id: " <<
595  ( m_BestID ? m_BestID->AsFastaString() : string("UNKNOWN") ) );
596 
597  if ( !TestFlag(fNoUserObjs) ) {
598  // store the raw defline in a User-object for reference
599  CRef<CSeqdesc> desc(new CSeqdesc);
600  desc->SetUser().SetType().SetStr("CFastaReader");
601  desc->SetUser().AddField("DefLine", NStr::PrintableString(s));
602  m_CurrentSeq->SetDescr().Set().push_back(desc);
603  }
604 
605  if (TestFlag(fUniqueIDs)) {
606  ITERATE (CBioseq::TId, it, GetIDs()) {
608  if ( !m_IDHandler->CacheIdHandle(h) ) {
610  "CFastaReader: Seq-id " << h.AsString()
611  << " is a duplicate around line " << LineNumber(),
613  }
614  }
615  }
616 }
617 
618 
// Installs the IDs found on a defline on the current sequence.
//  - No IDs: GenerateID() supplies one.
//  - Otherwise the defline IDs are copied into the sequence's ID list.
//  - If the defline carried a range: the best-ranked existing ID is
//    remembered, the ID list is replaced by a freshly generated ID, and an
//    alignment linking the remembered ID to the new one over
//    [range_start, range_end] is appended to the sequence history's
//    assembly.
// The defline text argument is unused.
// NOTE(review): the declarator line is missing from this copy of the file.
620  const CBioseq::TId& defline_ids,
621  const string& /*defline*/,
622  const bool has_range,
623  const TSeqPos range_start,
624  const TSeqPos range_end)
625 {
626  if (defline_ids.empty()) {
627  GenerateID();
628  }
629  else {
630  SetIDs() = defline_ids;
631  }
632 
633  if (has_range) {
634  auto old_id = FindBestChoice(GetIDs(), CSeq_id::BestRank);
635  // generate a new ID, and record its relation to the given one(s).
636  SetIDs().clear();
637  GenerateID();
638 
639  CRef<CSeq_align> align = xCreateAlignment(old_id,
640  GetIDs().front(), range_start, range_end);
641 
642  m_CurrentSeq->SetInst().SetHist().SetAssembly().push_back(align);
643  }
644 }
645 
646 
// Fills `info` with the settings used for defline parsing: the base flags
// (m_iFlags), the current FASTA flags, the current line number, and the
// maximum ID length -- m_MaxIDLength if m_bModifiedMaxIdLength is set,
// otherwise 0.
// NOTE(review): the declarator line is missing from this copy of the file;
// what a maxIdLength of 0 means to the consumer cannot be told from here.
648 {
649  info.fBaseFlags = m_iFlags;
650  info.fFastaFlags = GetFlags();
651  info.maxIdLength = m_bModifiedMaxIdLength ?
652  m_MaxIDLength :
653  0;
654  info.lineNumber = LineNumber();
655 }
656 
657 
// Forwards `s`, `start`, `end` and the message listener unchanged to another
// function.
// NOTE(review): both the declarator line and the line naming the callee
// (listing lines 658 and 662) are missing from this copy of the file, so the
// return value and the callee cannot be identified from here.
659  const TStr& s, TSeqPos& start, TSeqPos& end,
660  ILineErrorListener* pMessageListener)
661 {
663  s,
664  start,
665  end,
666  pMessageListener);
667 }
668 
669 
// Processes one defline title:
//  - warns when the title is longer than 1000 characters;
//  - runs CreateWarningsForSeqDataInTitle() on it;
//  - applies modifiers found in the title to the current sequence via
//    x_ApplyMods().
// NOTE(review): the declarator line is missing from this copy of the file.
671  const SLineTextAndLoc & lineInfo, ILineErrorListener * pMessageListener)
672 {
673  const static size_t kWarnTitleLength = 1000;
674  if( lineInfo.m_sLineText.length() > kWarnTitleLength ) {
675  FASTA_WARNING(lineInfo.m_iLineNum,
676  "FASTA-Reader: Title is very long: " << lineInfo.m_sLineText.length()
677  << " characters (max is " << kWarnTitleLength << ")",
678  ILineError::eProblem_TooLong, "defline");
679  }
680 
681  CreateWarningsForSeqDataInTitle(lineInfo.m_sLineText,lineInfo.m_iLineNum, pMessageListener);
682 
683  CTempString title(lineInfo.m_sLineText.data(), lineInfo.m_sLineText.length());
684  x_ApplyMods(title, lineInfo.m_iLineNum, *m_CurrentSeq, pMessageListener);
685 }
686 
687 
688 bool CFastaReader::IsValidLocalID(const TStr& s) const
689 {
690  return IsValidLocalID(s, GetFlags());
691 }
692 
693 bool CFastaReader::IsValidLocalID(const TStr& idString,
694  const TFlags fFastaFlags)
695 {
696  if ( fFastaFlags & fQuickIDCheck) { // check only the first character
697  return CSeq_id::IsValidLocalID(idString.substr(0,1));
698  }
699 
700  return CSeq_id::IsValidLocalID(idString);
701 }
702 
// Appends `id` to the current sequence's ID list.
// NOTE(review): the declarator line and the statement that creates `id`
// (listing line 705) are missing from this copy of the file.
704 {
706  SetIDs().push_back(id);
707 }
708 
709 
// Plausibility check for the first data line of a sequence.  Does nothing
// when fSkipCheck is set or sequence data has already been stored.
//
// At most the first 70 characters are classified:
//   good    - ASCII letters and '*'; '-' too unless fHyphensIgnoreAndWarn
//   neutral - whitespace and digits
//   stop    - ';' ends the scan (rest of line is a comment)
//   bad     - everything else
// The line is reported as implausible when bad >= good / 3 and additionally
// more than 3 characters were examined, or good == 0, or bad > good.
// A second check warns when more than 40% of the good characters are
// ambiguous nucleotide letters (counted only when bIsNuc is true).
//
// NOTE(review): the declarator line, the expression that initialises bIsNuc
// (listing lines 726-728) and the heads/tails of both reporting calls
// (754, 758, 764, 768) are missing from this copy of the file.
711  const TStr& s, ILineErrorListener * pMessageListener)
712 {
713  // make sure the first data line has at least SOME resemblance to
714  // actual sequence data.
715  if (TestFlag(fSkipCheck) || ! m_SeqData.empty() ) {
716  return;
717  }
718  const bool bIgnoreHyphens = TestFlag(fHyphensIgnoreAndWarn);
719  size_t good = 0, bad = 0;
720  // in case the data has huge sequences all on the first line we do need
721  // a cutoff and "70" seems reasonable since it's the default width of
722  // CFastaOstream (as of 2017-03-09)
723  size_t len_to_check = min(s.length(),
724  static_cast<size_t>(70));
725  const bool bIsNuc = (
729  size_t ambig_nuc = 0;
730  for (size_t pos = 0; pos < len_to_check; ++pos) {
731  unsigned char c = s[pos];
732  if (s_ASCII_IsAlpha(c) || c == '*') {
733  ++good;
734  if( bIsNuc && s_ASCII_IsAmbigNuc(c) ) {
735  ++ambig_nuc;
736  }
737  } else if( c == '-' ) {
738  if( ! bIgnoreHyphens ) {
739  ++good;
740  }
741  // if bIgnoreHyphens == true, the "hyphens are ignored" warning
742  // will be triggered elsewhere
743  } else if (isspace(c) || (c >= '0' && c <= '9')) {
744  // treat whitespace and digits as neutral
745  } else if (c == ';') {
746  break; // comment -- ignore rest of line
747  } else {
748  ++bad;
749  }
750  }
751  if (bad >= good / 3 &&
752  (len_to_check > 3 || good == 0 || bad > good))
753  {
755  "CFastaReader: Near line " << LineNumber()
756  << ", there's a line that doesn't look like plausible data, "
757  "but it's not marked as defline or comment.",
759  }
760  // warn if more than a certain percentage is ambiguous nucleotides
761  const static size_t kWarnPercentAmbiguous = 40; // e.g. "40" means "40%"
// With no good characters at all the share is taken to be 100%.
762  const size_t percent_ambig = (good == 0)?100:((ambig_nuc * 100) / good);
763  if( len_to_check > 3 && percent_ambig > kWarnPercentAmbiguous ) {
765  "FASTA-Reader: Start of first data line in seq is about "
766  << percent_ambig << "% ambiguous nucleotides (shouldn't be over "
767  << kWarnPercentAmbiguous << "%)",
769  "first data line");
770  }
771 }
772 
773 
// Appends the residues of one data line to m_SeqData.
//
// Visible flow:
//  - a line starting with ">?" is handed to ParseGapLine() and nothing else
//    is done;
//  - CheckDataLine() is run, and m_SeqData's capacity is grown
//    geometrically when the line would not fit;
//  - a fast path (its condition is partly missing) copies characters
//    verbatim up to the first ';' and returns;
//  - otherwise every character is classified (ECharType) and handled:
//    plain residues are stored, lowercase residues additionally open a mask,
//    gap characters are accumulated as a run, whitespace is skipped, ';'
//    ends the line, and anything else is recorded as a bad position;
//  - bad positions raise CBadResiduesException when fValidate is set,
//    otherwise a warning is issued.
//
// NOTE(review): the declarator line and several statements are missing from
// this copy of the file (listing lines 774, 793-794, 811, 813, 826, 947,
// 961, 989, 991, 1007, 1012).
775  const TStr& s, ILineErrorListener * pMessageListener)
776 {
777  if( NStr::StartsWith(s, ">?") ) {
778  ParseGapLine(s, pMessageListener);
779  return;
780  }
781 
782  CheckDataLine(s, pMessageListener);
783 
784  // most lines won't have a comment (';') so optimize for that case as
785  // much as possible
786 
787  const size_t s_len = s.length(); // avoid checking over and over
788 
789  if (m_SeqData.capacity() < m_SeqData.size() + s_len) {
790  // ensure exponential capacity growth to avoid quadratic runtime
791  m_SeqData.reserve(2 * max(m_SeqData.capacity(), s_len));
792  }
793 
795  && m_CurrentMask.Empty())
796  {
797  // copy until comment char or end of line
798  size_t pos = 0;
799  char c = '\0';
800  for( ; pos < s_len && (c = s[pos]) != ';'; ++pos) {
801  m_SeqData.push_back(c);
802  }
803  m_CurrentPos += pos;
804  return;
805  }
806 
807  // we're stricter with nucs, so try to determine if we should
808  // assume this is a nuc
809  const bool bIsNuc = (
810  (! TestFlag(fForceType) &&
812  ? m_CurrentSeq->IsNa()
814  );
815 
// Make room for the whole line up front; the buffer is trimmed back to
// m_CurrentPos after the loop.
816  m_SeqData.resize(m_CurrentPos + s_len);
817 
818  // these will stay as -1 and empty unless there's an error
819  Int8 bad_pos_line_num = -1;
820  vector<TSeqPos> bad_pos_vec;
821 
822  const bool bHyphensIgnoreAndWarn = TestFlag(fHyphensIgnoreAndWarn);
823  const bool bHyphensAreGaps =
824  ( TestFlag(fParseGaps) && ! bHyphensIgnoreAndWarn );
825  const bool bAllowLetterGaps =
827 
828  bool bIgnorableHyphenSeen = false;
829 
830  // indicates how the char should be treated
831  enum ECharType {
832  eCharType_NormalNonGap,
833  eCharType_MaskedNonGap,
834  eCharType_Gap,
835  eCharType_JustIgnore,
836  eCharType_HyphenToIgnoreAndWarn,
837  eCharType_Comment,
838  eCharType_Bad,
839  };
840 
841  for (size_t pos = 0; pos < s_len; ++pos) {
842  const unsigned char c = s[pos];
843 
844  // figure out what exactly should be done with the char
845  ECharType char_type = eCharType_Bad;
846  switch(c) {
847  // try to keep cases with consecutive letters on the same line
848  // and try to have all cases with the same result in alphabetical
849  // order. This will make it easier to
850  // tell if a letter was skipped or entered multiple times
851 
852  // some cases just set char_type but others can be implemented right
853  // in the first switch followed by a continue. Try to minimize the
854  // latter because they can make this switch very long
855 
// Uppercase letters accepted for both nucleotides and proteins: stored
// directly.
856  case 'A': case 'B': case 'C': case 'D':
857  case 'G': case 'H':
858  case 'K':
859  case 'M':
860  case 'R': case 'S': case 'T': case 'U': case 'V': case 'W':
861  case 'Y':
862  CloseGap(pos == 0);
863  m_SeqData[m_CurrentPos] = c;
864  CloseMask();
865  ++m_CurrentPos;
866  continue;
867  case 'a': case 'b': case 'c': case 'd':
868  case 'g': case 'h':
869  case 'k':
870  case 'm':
871  case 'r': case 's': case 't': case 'u': case 'v': case 'w':
872  case 'y':
873  char_type = eCharType_MaskedNonGap;
874  break;
875 
// Letters (and '*') that are rejected when the sequence is treated as
// nucleotide.
876  case 'E': case 'F':
877  case 'I': case 'J':
878  case 'L':
879  case 'O': case 'P': case 'Q':
880  case 'Z':
881  case '*':
882  if( bIsNuc ) {
883  char_type = eCharType_Bad;
884  } else {
885  CloseGap(pos == 0);
886  m_SeqData[m_CurrentPos] = c;
887  CloseMask();
888  ++m_CurrentPos;
889  continue;
890  }
891  break;
892  case 'e': case 'f':
893  case 'i': case 'j':
894  case 'l':
895  case 'o': case 'p': case 'q':
896  case 'z':
897  char_type = (bIsNuc ? eCharType_Bad : eCharType_MaskedNonGap );
898  break;
899 
// 'N'/'n' count as gap characters only for nucleotides with letter gaps
// allowed.
900  case 'N':
901  char_type = ( bIsNuc && bAllowLetterGaps ?
902  eCharType_Gap : eCharType_NormalNonGap );
903  break;
904  case 'n':
905  char_type = ( bIsNuc && bAllowLetterGaps ?
906  eCharType_Gap : eCharType_MaskedNonGap );
907  break;
908 
// 'X'/'x' are bad for nucleotides; otherwise gap characters when letter
// gaps are allowed, else ordinary residues.
909  case 'X':
910  char_type = ( bIsNuc ? eCharType_Bad :
911  bAllowLetterGaps ? eCharType_Gap :
912  eCharType_NormalNonGap);
913  break;
914  case 'x':
915  char_type = ( bIsNuc ? eCharType_Bad :
916  bAllowLetterGaps ? eCharType_Gap :
917  eCharType_MaskedNonGap);
918  break;
919 
920  case '-':
921  char_type = (
922  bHyphensAreGaps ? eCharType_Gap :
923  bHyphensIgnoreAndWarn ? eCharType_HyphenToIgnoreAndWarn :
924  eCharType_NormalNonGap );
925  break;
926  case ';':
927  char_type = eCharType_Comment;
928  break;
929 
930  case '\t': case '\n': case '\v': case '\f': case '\r': case ' ':
931  continue;
932 
933  default:
934  char_type = eCharType_Bad;
935  break;
936  }
937 
938  switch(char_type) {
939  case eCharType_NormalNonGap:
940  CloseGap(pos == 0);
941  m_SeqData[m_CurrentPos] = c;
942  CloseMask();
943  ++m_CurrentPos;
944  break;
945  case eCharType_MaskedNonGap:
946  CloseGap(pos == 0);
// NOTE(review): the statement storing the residue (listing line 947) is
// missing here.
948  OpenMask();
949  ++m_CurrentPos;
950  break;
951  case eCharType_Gap: {
952  CloseMask();
953  // open a gap
954 
// Consume the whole run of identical gap characters at once.
955  size_t pos2 = pos + 1;
956  while( pos2 < s_len && s[pos2] == c ) {
957  ++pos2;
958  }
959  _ASSERT(pos2 <= s_len);
960  m_CurrentGapLength += pos2 - pos;
962  pos = pos2 - 1; // `- 1` compensates for the `++pos` in the `for`
963  break;
964  }
965  case eCharType_JustIgnore:
966  break;
967  case eCharType_HyphenToIgnoreAndWarn:
968  bIgnorableHyphenSeen = true;
969  break;
970  case eCharType_Comment:
971  // artificially advance pos to the end to break the pos loop
972  pos = s_len;
973  break;
974  case eCharType_Bad:
975  if( bad_pos_line_num < 0 ) {
976  bad_pos_line_num = LineNumber();
977  }
978  bad_pos_vec.push_back(pos);
979  break;
980  default:
981  _TROUBLE;
982  }
983  }
984 
985  m_SeqData.resize(m_CurrentPos);
986 
987  if( bIgnorableHyphenSeen ) {
988  _ASSERT( bHyphensIgnoreAndWarn );
990  "CFastaReader: Hyphens are invalid and will be ignored around line " << LineNumber(),
992  kEmptyStr, kEmptyStr, "-" );
993  }
994 
995  // before throwing, be sure that we're in a valid state so that callers can
996  // parse multiple lines and get the invalid residues in all of them.
997 
998  if( ! bad_pos_vec.empty() ) {
999  if (TestFlag(fValidate)) {
1000  NCBI_THROW2(CBadResiduesException, eBadResidues,
1001  "CFastaReader: There are invalid " + x_NucOrProt() + "residue(s) in input sequence",
1002  CBadResiduesException::SBadResiduePositions( m_BestID, bad_pos_vec, bad_pos_line_num ) );
1003  } else {
1004  stringstream warn_strm;
1005  warn_strm << "FASTA-Reader: Ignoring invalid " << x_NucOrProt()
1006  << "residues at position(s): ";
1008  m_BestID, bad_pos_vec, bad_pos_line_num ).ConvertBadIndexesToString(warn_strm);
1009 
1010  FASTA_WARNING(0,
1011  warn_strm.str(),
1013  kEmptyStr );
1014  }
1015  }
1016 }
1017 
// Finishes a pending gap run of `len` characters.  Requires len > 0 and
// fParseGaps (asserted).
//
// Visible flow:
//  - a first branch (its condition is missing) writes the run back into
//    m_SeqData as residues and returns;
//  - in aligning mode (fAligning) the running offset grows by `len` and a
//    start is recorded in m_Starts;
//  - otherwise an SGap is appended to m_Gaps.  The gap counts as unknown
//    size when `len` equals m_gap_Unknown_length, or for a lone '-' at the
//    end of a line (see the in-code comment), in which case its length
//    becomes 0.  Linkage evidence is looked up by gap length in
//    m_GapsizeToLinkageEvidence.
// The message-listener argument is unused.
//
// NOTE(review): the declarator line and several statements are missing from
// this copy of the file (listing lines 1018, 1023, 1027-1028, 1033-1034,
// 1039, 1064).
1019  TSeqPos len, bool atStartOfLine, ILineErrorListener * /*pMessageListener*/)
1020 {
1021  _ASSERT(len > 0 && TestFlag(fParseGaps));
1022 
1024  {
1025  // the run of N is too short to be assumed as a gap, put N as is.
// NOTE(review): the fill character below is 'X' although the comment above
// speaks of N -- confirm which is intended.
1026  m_SeqData.resize(m_SeqData.size() + m_CurrentGapLength, 'X');
1029  return;
1030  }
1031 
1032  if (TestFlag(fAligning)) {
1035  m_Offset += len;
1036  m_Starts[pos + m_Offset][m_Row] = pos;
1037  } else {
1038  TSeqPos pos = GetCurrentPos(eRawPos);
1040  if (len == m_gap_Unknown_length)
1041  {
1042  eKnownSize = SGap::eKnownSize_No;
1043  //len = 0;
1044  }
1045  else
1046  {
1047  // Special case -- treat a lone hyphen at the end of a line as
1048  // a gap of unknown length.
1049  // (do NOT treat a lone 'N' or 'X' as unknown length)
1050  if (len == 1 && m_CurrentGapChar == '-' ) {
1051  TSeqPos l = m_SeqData.length();
1052  if ((l == pos) || (l == pos + (*GetLineReader()).length() && atStartOfLine)) {
1053  //and it's not the first col of the line
1054  len = 0;
1055  eKnownSize = SGap::eKnownSize_No;
1056  }
1057  }
1058  }
1059 
1060  const auto cit = m_GapsizeToLinkageEvidence.find(len);
1061  const auto& gap_linkage_evidence =
1062  (cit != m_GapsizeToLinkageEvidence.end()) ?
1063  cit->second :
1065 
1066  TGapRef pGap( new SGap(
1067  pos, len,
1068  eKnownSize,
1069  LineNumber(),
1070  m_gap_type,
1071  gap_linkage_evidence));
1072 
1073  m_Gaps.push_back(pGap);
1074  m_TotalGapLength += len;
1075  m_CurrentGapLength = 0;
1076  }
1077 }
1078 
1080 {
1083 }
1084 
1086 {
1088  m_CurrentMask->SetPacked_int().AddInterval
1090  eNa_strand_plus);
1092 }
1093 
1095  const TStr& line, ILineErrorListener * pMessageListener)
1096 {
1097  _ASSERT( NStr::StartsWith(line, ">?") );
1098 
1099  // just in case there's a gap before this one,
1100  // even though somewhat unusual
1101  CloseGap();
1102 
1103  // sRemainingLine will hold the part of the line left to parse
1104  TStr sRemainingLine = line.substr(2);
1105  NStr::TruncateSpacesInPlace(sRemainingLine);
1106 
1107  const TSeqPos uPos = GetCurrentPos(eRawPos);
1108 
1109  // check if size is unknown
1111  if( NStr::StartsWith(sRemainingLine, "unk") ) {
1112  eIsKnown = SGap::eKnownSize_No;
1113  sRemainingLine = sRemainingLine.substr(3);
1115  }
1116 
1117  // extract the gap size
1118  TSeqPos uGapSize = 0;
1119  {
1120  // find how many digits in number
1121  TStr::size_type uNumDigits = 0;
1122  while( uNumDigits != sRemainingLine.size() &&
1123  ::isdigit(sRemainingLine[uNumDigits]) )
1124  {
1125  ++uNumDigits;
1126  }
1127  TStr sDigits = sRemainingLine.substr(
1128  0, uNumDigits);
1129  uGapSize = NStr::StringToUInt(sDigits, NStr::fConvErr_NoThrow);
1130  if( uGapSize <= 0 ) {
1132  "CFastaReader: Bad gap size at line " << LineNumber(),
1134  "gapline" );
1135  // try to continue the best we can
1136  uGapSize = 1;
1137  }
1138  sRemainingLine = sRemainingLine.substr(sDigits.length());
1140  }
1141 
1142  // extract the raw key-value pairs for the gap
1143  typedef multimap<TStr, TStr> TModKeyValueMultiMap;
1144  TModKeyValueMultiMap modKeyValueMultiMap;
1145  while( ! sRemainingLine.empty() ) {
1146  TStr::size_type uOpenBracketPos = TStr::npos;
1147  if ( NStr::StartsWith(sRemainingLine, "[") ) {
1148  uOpenBracketPos = 0;
1149  }
1150  TStr::size_type uPosOfEqualSign = TStr::npos;
1151  if( uOpenBracketPos != TStr::npos ) {
1152  // uses "1" to skip the '['
1153  uPosOfEqualSign = sRemainingLine.find('=', uOpenBracketPos + 1);
1154  }
1155  TStr::size_type uCloseBracketPos = TStr::npos;
1156  if( uPosOfEqualSign != TStr::npos ) {
1157  uCloseBracketPos = sRemainingLine.find(']', uPosOfEqualSign + 1);
1158  }
1159  if( uCloseBracketPos == TStr::npos )
1160  {
1162  "CFastaReader: Problem parsing gap mods at line "
1163  << LineNumber(),
1165  "gapline" );
1166  break; // give up on mod-parsing
1167  }
1168 
1169  // extract the key and the value
1171  sRemainingLine.substr(uOpenBracketPos + 1,
1172  (uPosOfEqualSign - uOpenBracketPos - 1) ) );
1174  sRemainingLine.substr(uPosOfEqualSign + 1,
1175  uCloseBracketPos - uPosOfEqualSign - 1) );
1176 
1177  // remember what we saw
1178  modKeyValueMultiMap.insert(
1179  TModKeyValueMultiMap::value_type(sKey, sValue) );
1180 
1181  // prepare for the next loop around
1182  sRemainingLine = sRemainingLine.substr(uCloseBracketPos + 1);
1184  }
1185 
1186  // string to value maps
1187  const CEnumeratedTypeValues::TNameToValue & linkage_evidence_to_value_map =
1188  CLinkage_evidence::ENUM_METHOD_NAME(EType)()->NameToValue();
1189 
1190  // remember if there is a gap-type conflict
1191  bool bConflictingGapTypes = false;
1192  // extract the mods, if any
1193  SGap::TNullableGapType pGapType;
1195  set<CLinkage_evidence::EType> setOfLinkageEvidence;
1196 
1197  if (m_gap_type && modKeyValueMultiMap.empty()) // fall back to default values coming from caller
1198  {
1199  pGapType = m_gap_type;
1201  setOfLinkageEvidence = m_GapsizeToLinkageEvidence.begin()->second;
1202  }
1203  for (const auto& rec : CSeq_gap::GetNameToGapTypeInfoMap())
1204  {
1205  if (rec.second.m_eType == *pGapType)
1206  {
1207  eLinkEvid = rec.second.m_eLinkEvid;
1208  }
1209  }
1210  }
1211 
1212  ITERATE(TModKeyValueMultiMap, modKeyValue_it, modKeyValueMultiMap) {
1213  const TStr & sKey = modKeyValue_it->first;
1214  const TStr & sValue = modKeyValue_it->second;
1215 
1216  string canonicalKey = CanonicalizeString(sKey);
1217  if( canonicalKey == "gap-type") {
1218 
1219  const CSeq_gap::SGapTypeInfo *pGapTypeInfo = CSeq_gap::NameToGapTypeInfo(sValue);
1220  if( pGapTypeInfo ) {
1221  CSeq_gap::EType eGapType = pGapTypeInfo->m_eType;
1222 
1223  if( ! pGapType ) {
1224  pGapType.Reset( new SGap::TGapTypeObj(eGapType) );
1225  eLinkEvid = pGapTypeInfo->m_eLinkEvid;
1226  } else if( eGapType != *pGapType ) {
1227  // check if pGapType already set and different
1228  bConflictingGapTypes = true;
1229  }
1230  } else {
1232  LineNumber(),
1233  "Unknown gap-type: " << sValue,
1235  "gapline",
1236  "gap-type",
1237  sValue );
1238  }
1239  continue;
1240 
1241  }
1242 
1243  if( canonicalKey == "linkage-evidence") {
1244  // could be semi-colon separated
1245  vector<CTempString> arrLinkageEvidences;
1246  NStr::Split(sValue, ";", arrLinkageEvidences,
1248 
1249  ITERATE(vector<CTempString>, link_evid_it, arrLinkageEvidences) {
1250  CTempString sLinkEvid = *link_evid_it;
1252  linkage_evidence_to_value_map.find(CanonicalizeString(sLinkEvid));
1253  if( find_iter != linkage_evidence_to_value_map.end() ) {
1254  setOfLinkageEvidence.insert(
1255  static_cast<CLinkage_evidence::EType>(
1256  find_iter->second));
1257  } else {
1259  LineNumber(),
1260  "Unknown linkage-evidence: " << sValue,
1262  "gapline",
1263  "linkage-evidence",
1264  sValue );
1265  }
1266  }
1267  continue;
1268  }
1269 
1270  // unknown mod.
1272  LineNumber(),
1273  "Unknown gap modifier name(s): " << sKey,
1275  "gapline", sKey, kEmptyStr );
1276  }
1277 
1278  if( bConflictingGapTypes ) {
1280  "There were conflicting gap-types around line " << LineNumber(),
1282  "gapline", "gap-type", kEmptyStr );
1283  }
1284 
1285  // check validation beyond basic parsing problems
1286 
1287  // if no gap-type set (but linkage-evidence explicitly set, use "unknown")
1288  if( ! pGapType && ! setOfLinkageEvidence.empty() ) {
1290  }
1291 
1292  // check if linkage-evidence(s) compatible with gap-type
1293  switch( eLinkEvid ) {
1295  if( setOfLinkageEvidence.empty() ) {
1296  if( pGapType ) {
1297  // silently add the required "unspecified"
1298  setOfLinkageEvidence.insert(CLinkage_evidence::eType_unspecified);
1299  }
1300  } else if( setOfLinkageEvidence.size() > 1 ||
1301  *setOfLinkageEvidence.begin() != CLinkage_evidence::eType_unspecified )
1302  {
1303  // only "unspecified" is allowed
1304  FASTA_WARNING(
1305  LineNumber(),
1306  "FASTA-Reader: Unknown gap-type can have linkage-evidence "
1307  "of type 'unspecified' only.",
1309  "gapline");
1310  setOfLinkageEvidence.clear();
1311  setOfLinkageEvidence.insert(CLinkage_evidence::eType_unspecified);
1312  }
1313  break;
1315  if( ! setOfLinkageEvidence.empty() ) {
1317  "FASTA-Reader: This gap-type cannot have any "
1318  "linkage-evidence specified, so any will be ignored.",
1320  "gapline" );
1321  setOfLinkageEvidence.clear();
1322  }
1323  break;
1325  if( setOfLinkageEvidence.empty() ) {
1326  setOfLinkageEvidence.insert(CLinkage_evidence::eType_unspecified);
1327  }
1328  if( setOfLinkageEvidence.size() == 1 &&
1329  *setOfLinkageEvidence.begin() == CLinkage_evidence::eType_unspecified)
1330  {
1332  "CFastaReader: This gap-type should have at least one "
1333  "specified linkage-evidence.",
1335  "gapline" );
1336  }
1337  break;
1338  // intentionally omitted "default:" so a compiler warning will
1339  // hopefully let us know if we've forgotten a case
1340  }
1341 
1342  TGapRef pGap( new SGap(
1343  uPos, uGapSize, eIsKnown, LineNumber(), pGapType,
1344  setOfLinkageEvidence ) );
1345 
1346  m_Gaps.push_back(pGap);
1347  m_TotalGapLength += pGap->m_uLen;
1348  return true;
1349 }
1350 
1352 {
1353  CSeq_inst& inst = m_CurrentSeq->SetInst();
1354 
1355  CloseGap();
1356  CloseMask();
1357  if (TestFlag(fInSegSet)) {
1359  }
1360  AssignMolType(pMessageListener);
1361 
1362  // apply source mods *after* figuring out mol type
1363  ITERATE(vector<SLineTextAndLoc>, title_ci, m_CurrentSeqTitles) {
1364  ParseTitle(*title_ci, pMessageListener);
1365  }
1366  m_CurrentSeqTitles.clear();
1367 
1369  = ( inst.IsAa() ?
1370  ( TestFlag(fUseIupacaa) ?
1374 
1375  if (TestFlag(fValidate)) {
1376  CSeq_data tmp_data(m_SeqData, format);
1377  vector<TSeqPos> badIndexes;
1378  CSeqportUtil::Validate(tmp_data, &badIndexes);
1379  if ( ! badIndexes.empty() ) {
1380  NCBI_THROW2(CBadResiduesException, eBadResidues,
1381  "CFastaReader: Invalid " + x_NucOrProt() + "residue(s) in input sequence",
1383  }
1384  }
1385 
1386  if ( !TestFlag(fParseGaps) && m_TotalGapLength > 0 ) {
1387  // Encountered >? lines; substitute runs of Ns or Xs as appropriate.
1388  string new_data;
1389  char gap_char(inst.IsAa() ? 'X' : 'N');
1390  SIZE_TYPE pos = 0;
1391  new_data.reserve(GetCurrentPos(ePosWithGaps));
1392  ITERATE (TGaps, it, m_Gaps) {
1393  // since we're not parsing gaps, we have to throw out
1394  // any specified extra information that can't be
1395  // represented with a mere 'X' or 'N', so at least
1396  // give a warning
1397  const bool bHasSpecifiedGapType =
1398  ( (*it)->m_pGapType && *(*it)->m_pGapType != CSeq_gap::eType_unknown );
1399  const bool bHasSpecifiedLinkEvid =
1400  ( ! (*it)->m_setOfLinkageEvidence.empty() &&
1401  ( (*it)->m_setOfLinkageEvidence.size() > 1 ||
1402  *(*it)->m_setOfLinkageEvidence.begin() != CLinkage_evidence::eType_unspecified ) );
1403  if( bHasSpecifiedGapType || bHasSpecifiedLinkEvid )
1404  {
1405  FASTA_WARNING((*it)->m_uLineNumber,
1406  "CFastaReader: Gap mods are ignored because gaps are "
1407  "becoming N's or X's in this case.",
1409  "gapline" );
1410  }
1411  if ((*it)->m_uPos > pos) {
1412  new_data.append(m_SeqData, pos, (*it)->m_uPos - pos);
1413  pos = (*it)->m_uPos;
1414  }
1415  new_data.append((*it)->m_uLen, gap_char);
1416  }
1417  if (m_CurrentPos > pos) {
1418  new_data.append(m_SeqData, pos, m_CurrentPos - pos);
1419  }
1420  swap(m_SeqData, new_data);
1421  m_Gaps.clear();
1423  m_TotalGapLength = 0;
1424  m_CurrentGapChar = '\0';
1425  }
1426 
1427  if (m_Gaps.empty() && m_SeqData.empty()) {
1428 
1429  _ASSERT(m_TotalGapLength == 0);
1430  inst.SetLength(0);
1432  // empty sequence triggers warning if seq data was expected
1433  if( ! TestFlag(fDisableNoResidues) &&
1434  ! TestFlag(fNoSeqData) ) {
1436  "FASTA-Reader: No residues given",
1438  }
1439  }
1440  else
1441  if (m_Gaps.empty() && TestFlag(fNoSplit)) {
1445  if ( !TestFlag(fLeaveAsText) ) {
1447  }
1448  inst.SetSeq_data(*data);
1449  } else {
1450  CDelta_ext& delta_ext = inst.SetExt().SetDelta();
1453  SIZE_TYPE n = m_Gaps.size();
1454 
1455  if (n==0 || m_Gaps[0]->m_uPos > 0)
1456  {
1457  TStr chunk(m_SeqData, 0, (n>0 && m_Gaps[0]->m_uPos > 0)?m_Gaps[0]->m_uPos : inst.GetLength());
1458  delta_ext.AddAndSplit(chunk, format, chunk.length(), false, !TestFlag(fLeaveAsText));
1459  }
1460 
1461  for (SIZE_TYPE i = 0; i < n; ++i) {
1462 
1463  // add delta-seq
1464  CRef<CDelta_seq> gap_ds(new CDelta_seq);
1465  if (m_Gaps[i]->m_uLen == 0) { // totally unknown
1466  gap_ds->SetLoc().SetNull();
1467  } else { // has a nominal length (normally 100)
1468  gap_ds->SetLiteral().SetLength(m_Gaps[i]->m_uLen);
1469  if( m_Gaps[i]->m_eKnownSize == SGap::eKnownSize_No ) {
1470  gap_ds->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
1471  }
1472 
1473  if( m_Gaps[i]->m_pGapType || ! m_Gaps[i]->m_setOfLinkageEvidence.empty() ) {
1474  CSeq_gap & seq_gap = gap_ds->SetLiteral().SetSeq_data().SetGap();
1475  seq_gap.SetType( m_Gaps[i]->m_pGapType ?
1476  *m_Gaps[i]->m_pGapType :
1478 
1479  // set linkage and linkage-evidence, if any
1480  if( ! m_Gaps[i]->m_setOfLinkageEvidence.empty() ) {
1481  // any linkage-evidence (even "unspecified")
1482  // implies "linked"
1484 
1485  CSeq_gap::TLinkage_evidence & vecLinkEvids =
1486  seq_gap.SetLinkage_evidence();
1487  ITERATE(SGap::TLinkEvidSet, link_evid_it,
1488  m_Gaps[i]->m_setOfLinkageEvidence )
1489  {
1491  new CLinkage_evidence );
1492  pNewLinkEvid->SetType( *link_evid_it );
1493  vecLinkEvids.push_back(std::move(pNewLinkEvid));
1494  }
1495  }
1496  }
1497  }
1498  delta_ext.Set().push_back(std::move(gap_ds));
1499 
1500  TSeqPos next_start = (i == n-1) ? m_CurrentPos : m_Gaps[i+1]->m_uPos;
1501  if (next_start != m_Gaps[i]->m_uPos) {
1502  TSeqPos seq_len = next_start - m_Gaps[i]->m_uPos;
1503  TStr chunk(m_SeqData, m_Gaps[i]->m_uPos, seq_len);
1504  delta_ext.AddAndSplit(chunk, format, chunk.length(), false, !TestFlag(fLeaveAsText));
1505  }
1506  }
1507  if (delta_ext.Get().size() == 1) {
1508  // simplify -- just one piece
1510  inst.SetSeq_data(delta_ext.Set().front()
1511  ->SetLiteral().SetSeq_data());
1512  inst.ResetExt();
1513  }
1514 
1515  }
1516 }
1517 
1518 static void s_AddBiomol(CMolInfo::EBiomol biomol, CBioseq& bioseq)
1519 {
1520  auto pDesc = Ref(new CSeqdesc());
1521  pDesc->SetMolinfo().SetBiomol(biomol);
1522  bioseq.SetDescr().Set().emplace_back(std::move(pDesc));
1523 }
1524 
1525 static bool sRefineNaMol(const char* beginSeqData, const char* endSeqData, CBioseq& bioseq)
1526 {
1527  auto& seqInst = bioseq.SetInst();
1528  const bool hasT =
1529  (find_if(beginSeqData, endSeqData, [](char c) { return (c=='t' || c == 'T'); }) != endSeqData);
1530  const bool hasU =
1531  (find_if(beginSeqData, endSeqData, [](char c) { return (c=='u' || c == 'U'); }) != endSeqData);
1532 
1533  if (hasT && !hasU) {
1534  seqInst.SetMol(CSeq_inst::eMol_dna);
1535  s_AddBiomol(CMolInfo::eBiomol_genomic, bioseq); // RW-931
1536  return true;
1537  }
1538 
1539  if (hasU && !hasT) {
1540  seqInst.SetMol(CSeq_inst::eMol_rna);
1541  return true;
1542  }
1543 
1544  return false;
1545 }
1546 
1547 
1549 {
1550  CSeq_inst::EMol default_mol;
1551  CFormatGuess::ESTStrictness strictness;
1552 
1553  // Check flags; in general, treat contradictory settings as canceling out.
1554  // Did the user specify a (default) type?
1555  switch (GetFlags() & (fAssumeNuc | fAssumeProt)) {
1556  case fAssumeNuc: default_mol = CSeq_inst::eMol_na; break;
1557  case fAssumeProt: default_mol = CSeq_inst::eMol_aa; break;
1558  default: default_mol = CSeq_inst::eMol_not_set; break;
1559  }
1560  // Did the user request non-default format-guessing strictness?
1561  switch (GetFlags() & (fStrictGuess | fLaxGuess)) {
1562  case fStrictGuess: strictness = CFormatGuess::eST_Strict; break;
1563  case fLaxGuess: strictness = CFormatGuess::eST_Lax; break;
1564  default: strictness = CFormatGuess::eST_Default; break;
1565  }
1566 
1567  auto& inst = m_CurrentSeq->SetInst();
1568 
1569  if (TestFlag(fForceType)) {
1570  _ASSERT(default_mol != CSeq_inst::eMol_not_set);
1571  inst.SetMol(default_mol);
1572  return;
1573  }
1574 
1575  if (inst.IsSetMol()) {
1576  if (inst.GetMol() == CSeq_inst::eMol_na && !m_SeqData.empty()) {
1577  sRefineNaMol(m_SeqData.data(),
1578  m_SeqData.data() + min(m_SeqData.length(), SIZE_TYPE(4096)),
1579  *m_CurrentSeq);
1580  }
1581  return;
1582  }
1583 
1584  if (m_SeqData.empty()) {
1585  // Nothing else to go on, but that's OK (no sequence to worry
1586  // about encoding); however, Seq-inst.mol is still mandatory.
1587  inst.SetMol(CSeq_inst::eMol_not_set);
1588  return;
1589  }
1590 
1591 
1592  // Do the residue frequencies suggest a specific type?
1593  SIZE_TYPE length = min(m_SeqData.length(), SIZE_TYPE(4096));
1594  const auto& data = m_SeqData.data();
1595  switch (CFormatGuess::SequenceType(data, length, strictness)) {
1597  {
1598  if (!sRefineNaMol(data, data+length, *m_CurrentSeq)) {
1599  inst.SetMol(CSeq_inst::eMol_na);
1600  }
1601  return;
1602  }
1603  case CFormatGuess::eProtein: inst.SetMol(CSeq_inst::eMol_aa); return;
1604  default:
1605  if (default_mol == CSeq_inst::eMol_not_set) {
1607  "CFastaReader: Unable to determine sequence type (is it nucleotide? protein?) around line " + NStr::NumericToString(LineNumber()),
1608  LineNumber());
1609  } else {
1610  inst.SetMol(default_mol);
1611  }
1612  }
1613 }
1614 
1615 
1616 bool
1618  const TStr& sLineText,
1619  TSeqPos iLineNum,
1620  ILineErrorListener * pMessageListener) const
1621 {
     // Heuristic check for sequence data that was pasted into a title line.
     // Looks only at the tail of sLineText:
     //   * unless fAssumeProt is set, warns when the last
     //     kWarnNumNucCharsAtEnd characters are all unambiguous nucleotides
     //     (per s_ASCII_IsUnAmbigNuc);
     //   * unless fAssumeNuc is set, warns when the last
     //     kWarnAminoAcidCharsAtEnd characters are all ASCII letters.
     // Returns true as soon as one warning has been issued, false if neither
     // check fires. iLineNum is the line number reported with the warning.
1622 
1623  // check for nuc or aa sequences at the end of the title
1624  const static size_t kWarnNumNucCharsAtEnd = 20;
1625  const static size_t kWarnAminoAcidCharsAtEnd = 50;
1626 
1627  const size_t length = sLineText.length();
     // For an empty title this wraps around, but pos_to_check is only
     // read inside the two length-guarded branches below.
1628  SIZE_TYPE pos_to_check = length-1;
1629 
1630  if((length > kWarnNumNucCharsAtEnd) && !TestFlag(fAssumeProt)) {
1631  // find last non-nuc character, within the last kWarnNumNucCharsAtEnd characters
     // length > threshold guarantees the lower bound is >= 1, so the
     // unsigned index stops at (bound - 1) >= 0 and never wraps.
1632  const SIZE_TYPE last_pos_to_check_for_nuc = (sLineText.length() - kWarnNumNucCharsAtEnd);
1633  for( ; pos_to_check >= last_pos_to_check_for_nuc; --pos_to_check ) {
1634  if( ! s_ASCII_IsUnAmbigNuc(sLineText[pos_to_check]) ) {
1635  // found a character which is not an unambiguous nucleotide
1636  break;
1637  }
1638  }
     // Falling below the bound means every checked character was a nucleotide.
1639  if( pos_to_check < last_pos_to_check_for_nuc ) {
1640  FASTA_WARNING(iLineNum,
1641  "FASTA-Reader: Title ends with at least " << kWarnNumNucCharsAtEnd
1642  << " valid nucleotide characters. Was the sequence "
1643  << "accidentally put in the title line?",
1645  "defline"
1646  );
1647  return true; // found problem
1648  }
1649  }
1650 
1651  if((length > kWarnAminoAcidCharsAtEnd) && !TestFlag(fAssumeNuc)) {
1652  // check for aa's at the end of the title
1653  // for efficiency, continue where the nuc search left off, since
1654  // we know that nucs can be amino acids, also
1655  const SIZE_TYPE last_pos_to_check_for_amino_acid =
1656  ( sLineText.length() - kWarnAminoAcidCharsAtEnd );
1657  for( ; pos_to_check >= last_pos_to_check_for_amino_acid; --pos_to_check ) {
1658  // can't just use "isalpha" in case it includes characters
1659  // with diacritics (an accent, tilde, umlaut, etc.)
1660  const char ch = sLineText[pos_to_check];
1661  if( ( ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') ) {
1662  // potential amino acid, so keep going
1663  } else {
1664  // non-amino-acid found
1665  break;
1666  }
1667  }
1668 
1669  if( pos_to_check < last_pos_to_check_for_amino_acid ) {
1670  FASTA_WARNING(iLineNum,
1671  "FASTA-Reader: Title ends with at least " << kWarnAminoAcidCharsAtEnd
1672  << " valid amino acid characters. Was the sequence "
1673  << "accidentally put in the title line?",
1675  "defline");
1676  return true; // found problem
1677  }
1678  }
1679 
1680  return false;
1681 }
1682 
1684  int reference_row, ILineErrorListener * pMessageListener)
1685 {
1686  TIds ids;
1687  CRef<CSeq_entry> entry = x_ReadSeqsToAlign(ids, pMessageListener);
1688  CRef<CSeq_annot> annot(new CSeq_annot);
1689 
1690  if ( !entry->IsSet()
1691  || entry->GetSet().GetSeq_set().size() <
1692  static_cast<unsigned int>(max(reference_row + 1, 2)))
1693  {
1695  "CFastaReader::ReadAlignedSet: not enough input sequences.",
1696  LineNumber());
1697  } else if (reference_row >= 0) {
1698  x_AddPairwiseAlignments(*annot, ids, reference_row);
1699  } else {
1700  x_AddMultiwayAlignment(*annot, ids);
1701  }
1702  entry->SetSet().SetAnnot().push_back(annot);
1703 
1704  entry->Parentize();
1705  return entry;
1706 }
1707 
1709  ILineErrorListener * pMessageListener)
1710 {
1711  CRef<CSeq_entry> entry(new CSeq_entry);
1712  vector<TSeqPos> lengths;
1713 
1715 
1716  for (m_Row = 0, m_Starts.clear(); !GetLineReader().AtEOF(); ++m_Row) {
1717  try {
1718  // must mark m_Starts prior to reading in case of leading gaps
1719  m_Starts[0][m_Row] = 0;
1720  CRef<CSeq_entry> entry2(ReadOneSeq(pMessageListener));
1721  entry->SetSet().SetSeq_set().push_back(entry2);
1722  CRef<CSeq_id> id(new CSeq_id);
1723  id->Assign(GetBestID());
1724  ids.push_back(id);
1725  lengths.push_back(GetCurrentPos(ePosWithGapsAndSegs) + m_Offset);
1726  _ASSERT(lengths.size() == size_t(m_Row) + 1);
1727  // redundant if there was a trailing gap, but that should be okay
1729  } catch (const CObjReaderParseException&) {
1730  if (GetLineReader().AtEOF()) {
1731  break;
1732  } else {
1733  throw;
1734  }
1735  }
1736  }
1737  // check whether lengths are all equal, and warn if they differ
1738  if (lengths.size() > 1 && TestFlag(fValidate)) {
1739  vector<TSeqPos>::const_iterator it(lengths.begin());
1740  const TSeqPos len = *it;
1741  for (++it; it != lengths.end(); ++it) {
1742  if (*it != len) {
1744  "CFastaReader::ReadAlignedSet: Rows have different "
1745  "lengths. For example, look around line " << LineNumber(),
1747  }
1748  }
1749  }
1750 
1751  return entry;
1752 }
1753 
1755  TRowNum reference_row)
1756 {
     // Builds one pairwise alignment per non-reference row and appends the
     // completed alignments to annot's align list. Each builder pairs
     // ids[reference_row] with ids[r]; the row count is taken from m_Row.
     // m_Starts is walked entry by entry: it->first is passed to the
     // builders as the alignment position, and it->second holds the rows
     // that have a recorded start value at that position.
1757  typedef CFastaAlignmentBuilder TBuilder;
1758  typedef CRef<TBuilder> TBuilderRef;
1759 
1760  TRowNum rows = m_Row;
1761  vector<TBuilderRef> builders(rows);
1762 
     // The reference row's own slot is left as an empty reference.
1763  for (TRowNum r = 0; r < rows; ++r) {
1764  if (r != reference_row) {
1765  builders[r].Reset(new TBuilder(ids[reference_row], ids[r]));
1766  }
1767  }
1768  ITERATE (TStartsMap, it, m_Starts) {
1769  const TSubMap& submap = it->second;
1770  TSubMap::const_iterator rr_it2 = submap.find(reference_row);
1771  if (rr_it2 == submap.end()) { // reference unchanged
     // Only the rows listed in submap get new data; the reference
     // side is passed as kContinued.
1772  ITERATE (TSubMap, it2, submap) {
1773  int r = it2->first;
1774  _ASSERT(r != reference_row);
1775  builders[r]->AddData(it->first, TBuilder::kContinued,
1776  it2->second);
1777  }
1778  } else { // reference changed; all rows need updating
     // Step through all rows in lockstep with submap (assumes submap
     // iterates in ascending row order -- TODO confirm TSubMap type).
     // Rows absent from submap are passed kContinued on their side.
1779  TSubMap::const_iterator it2 = submap.begin();
1780  for (TRowNum r = 0; r < rows; ++r) {
1781  if (it2 != submap.end() && r == it2->first) {
1782  if (r != reference_row) {
1783  builders[r]->AddData(it->first, rr_it2->second,
1784  it2->second);
1785  }
1786  ++it2;
1787  } else {
1788  _ASSERT(r != reference_row);
1789  builders[r]->AddData(it->first, rr_it2->second,
1790  TBuilder::kContinued);
1791  }
1792  }
1793  }
1794  }
1795 
1796  // finalize and store the alignments
1797  CSeq_annot::TData::TAlign& annot_align = annot.SetData().SetAlign();
1798  for (TRowNum r = 0; r < rows; ++r) {
1799  if (r != reference_row) {
1800  annot_align.push_back(builders[r]->GetCompletedAlignment());
1801  }
1802  }
1803 }
1804 
1806 {
1807  TRowNum rows = m_Row;
1808  CRef<CSeq_align> sa(new CSeq_align);
1809  CDense_seg& ds = sa->SetSegs().SetDenseg();
1810  CDense_seg::TStarts& dss = ds.SetStarts();
1811 
1813  sa->SetDim(rows);
1814  ds.SetDim(rows);
1815  ds.SetIds() = ids;
1816  dss.reserve((m_Starts.size() - 1) * rows);
1817 
1818  TSeqPos old_len = 0;
1820  next != m_Starts.end(); it = next++) {
1821  TSeqPos len = next->first - it->first;
1822  _ASSERT(len > 0);
1823  ds.SetLens().push_back(len);
1824 
1825  const TSubMap& submap = it->second;
1826  TSubMap::const_iterator it2 = submap.begin();
1827  for (TRowNum r = 0; r < rows; ++r) {
1828  if (it2 != submap.end() && r == it2->first) {
1829  dss.push_back(it2->second);
1830  ++it2;
1831  } else {
1832  _ASSERT(dss.size() >= size_t(rows) && old_len > 0);
1833  TSignedSeqPos last_pos = dss[dss.size() - rows];
1834  if (last_pos == CFastaAlignmentBuilder::kNoPos) {
1835  dss.push_back(last_pos);
1836  } else {
1837  dss.push_back(last_pos + old_len);
1838  }
1839  }
1840  }
1841 
1842  it = next;
1843  old_len = len;
1844  }
1845  ds.SetNumseg(ds.GetLens().size());
1846  annot.SetData().SetAlign().push_back(sa);
1847 }
1848 
1849 
1851  int* counter, CFastaReader::TMasks* lcv,
1852  ILineErrorListener* pMessageListener)
1853 {
     // Convenience wrapper around CFastaReader: constructs a reader from
     // (in, flags), reads a whole set with ReadSet(kMax_Int, ...) and returns
     // the result.
     //   counter - optional; when non-null its value seeds the reader's ID
     //             generator before reading and receives the generator's
     //             counter afterwards.
     //   lcv     - optional; when non-null it is handed to SaveMasks().
1854  CFastaReader reader(in, flags);
1855  if (counter) {
1856  reader.SetIDGenerator().SetCounter(*counter);
1857  }
1858  if (lcv) {
1859  reader.SaveMasks(lcv);
1860  }
1861 
1862  auto pEntry = reader.ReadSet(kMax_Int, pMessageListener);
1863 
     // Written back only on normal return; if ReadSet throws, *counter
     // keeps its original value.
1864  if (counter) {
1865  *counter = reader.GetIDGenerator().GetCounter();
1866  }
1867  return pEntry;
1868 }
1869 
1870 
1872 {
1873 }
1874 
1875 
1877 {
1878 public:
1880 
1881  CFastaMapper(ILineReader& reader,
1882  SFastaFileMap* fasta_map,
1883  TFlags flags,
1884  FIdCheck f_idcheck = CSeqIdCheck());
1885 
1886 protected:
1887  void ParseDefLine(const TStr& s,
1888  ILineErrorListener * pMessageListener);
1889  void ParseTitle(const SLineTextAndLoc & lineInfo,
1890  ILineErrorListener * pMessageListener = 0);
1891  void AssembleSeq(ILineErrorListener * pMessageListener);
1892 
1893 private:
1896 };
1897 
1899  SFastaFileMap* fasta_map,
1900  TFlags flags,
1901  FIdCheck f_idcheck)
1902  : TParent(reader, flags, f_idcheck), m_Map(fasta_map)
1903 {
1904  _ASSERT(fasta_map);
1905  fasta_map->file_map.resize(0);
1906 }
1907 
1908 void CFastaMapper::ParseDefLine(const TStr& s, ILineErrorListener * pMessageListener)
1909 {
1910  TParent::ParseDefLine(s, pMessageListener); // We still want the default behavior.
1911  m_MapEntry.seq_id = GetIDs().front()->AsFastaString(); // XXX -- GetBestID?
1912  m_MapEntry.all_seq_ids.resize(0);
1913  ITERATE (CBioseq::TId, it, GetIDs()) {
1914  m_MapEntry.all_seq_ids.push_back((*it)->AsFastaString());
1915  }
1917 }
1918 
1920  ILineErrorListener * pMessageListener)
1921 {
1922  TParent::ParseTitle(s, pMessageListener);
1924 }
1925 
1927 {
     // Run the base-class assembly first, then append the entry collected
     // for this sequence (m_MapEntry) to the caller-supplied file map.
1928  TParent::AssembleSeq(pMessageListener);
1929  m_Map->file_map.push_back(m_MapEntry);
1930 }
1931 
1932 
1934 {
1935  static const CFastaReader::TFlags kFlags
1937 
1938  if ( !input.is_open() ) {
1939  return;
1940  }
1941 
1943  CFastaMapper mapper(*lr, fasta_map, kFlags);
1944  mapper.ReadSet();
1945 }
1946 
1947 
1950  CFastaReader::TFlags fread_flags)
1951 {
1952  if ( !input.is_open() ) {
1953  return;
1954  }
1955 
1957  CFastaReader reader(*lr, fread_flags);
1958 
1959  while ( !lr->AtEOF() ) {
1960  try {
1961  CNcbiStreampos pos = lr->GetPosition();
1962  CRef<CSeq_entry> se = reader.ReadOneSeq();
1963  if (se->IsSeq()) {
1964  scanner->EntryFound(se, pos);
1965  }
1966  } catch (const CObjReaderParseException&) {
1967  if ( !lr->AtEOF() ) {
1968  throw;
1969  }
1970  }
1971  }
1972 }
1973 
1974 
1975 static void s_AppendMods(
1976  const CModHandler::TModList& mods,
1977  string& title
1978  )
1979 {
1980  for (const auto& mod : mods) {
1981 
1982  title.append(" ["
1983  + mod.GetName()
1984  + "="
1985  + mod.GetValue()
1986  + "]");
1987  }
1988 }
1989 
1990 
1991 void CFastaReader::SetExcludedMods(const vector<string>& excluded_mods)
1992 {
1993  m_ModHandler.SetExcludedMods(excluded_mods);
1994 }
1995 
1996 
1997 void CFastaReader::SetIgnoredMods(const list<string>& ignored_mods)
1998 {
1999  m_ModHandler.SetIgnoredMods(ignored_mods);
2000 }
2001 
2002 void CFastaReader::SetPostponedMods(const list<string>& postponed_mods)
2003 {
2004  m_PostponedMods.clear();
2005  transform(postponed_mods.begin(), postponed_mods.end(),
2006  inserter(m_PostponedMods, m_PostponedMods.end()),
2007  [](const string& mod_name) { return CModHandler::GetCanonicalName(mod_name); });
2008 }
2009 
2010 
2013 {
2014  return m_PostponedModMap;
2015 }
2016 
2017 
2019  const string& title,
2020  TSeqPos line_number,
2021  CBioseq& bioseq,
2022  ILineErrorListener* pMessageListener )
2023 {
     // Turns a defline title into a Title descriptor on bioseq.
     //   * With fAddMods set, x_AddMods() is given the chance to rewrite the
     //     working copy (processed_title) while applying modifiers.
     //   * Otherwise, unless fIgnoreMods is set, a warning is issued when
     //     CTitleParser::HasMods() reports modifiers in the title; the title
     //     text itself is left untouched in that case.
     // The working copy is then space-trimmed and, if anything remains,
     // appended to bioseq's descriptors as a title.
2024  string processed_title = title;
2025  if (TestFlag(fAddMods)) {
2026  x_AddMods(line_number, bioseq, processed_title, pMessageListener);
2027  }
2028  else
2029  if (!TestFlag(fIgnoreMods) &&
2030  CTitleParser::HasMods(title)) {
2031  FASTA_WARNING(line_number,
2032  "FASTA-Reader: Ignoring FASTA modifier(s) found because "
2033  "the input was not expected to have any.",
2035  "defline");
2036  }
2037 
     // No descriptor is created for an empty (or all-whitespace) title.
2038  NStr::TruncateSpacesInPlace(processed_title);
2039  if (!processed_title.empty()) {
2040  auto pDesc = Ref(new CSeqdesc());
2041  pDesc->SetTitle() = processed_title;
2042  bioseq.SetDescr().Set().push_back(std::move(pDesc));
2043  }
2044 }
2045 
2046 
2048  TSeqPos line_number,
2049  CBioseq& bioseq,
2050  string& processed_title,
2051  ILineErrorListener* pMessageListener)
2052 {
2053  string remainder;
2054  CModHandler::TModList mods;
2055  CTitleParser::Apply(processed_title, mods, remainder);
2056  if (mods.empty()) {
2057  return;
2058  }
2059 
2060  const auto* pFirstID = bioseq.GetFirstId();
2061  _ASSERT(pFirstID);
2062  const auto idString = pFirstID->AsFastaString();
2063 
2064  x_CheckForPostponedMods(idString, line_number, mods);
2065  if (mods.empty()) {
2066  return;
2067  }
2068 
2070  errorReporter(idString, line_number, pMessageListener);
2071 
2072  CModHandler::TModList rejected_mods;
2073  m_ModHandler.Clear();
2074  m_ModHandler.AddMods(mods, CModHandler::eReplace, rejected_mods, errorReporter);
2075  s_AppendMods(rejected_mods, remainder);
2076 
2077  CModHandler::TModList skipped_mods;
2078  const bool logInfo =
2079  pMessageListener ?
2080  pMessageListener->SevEnabled(eDiag_Info) :
2081  false;
2082  CModAdder::Apply(m_ModHandler, bioseq, skipped_mods, logInfo, errorReporter);
2083  s_AppendMods(skipped_mods, remainder);
2084 
2085  processed_title = remainder;
2086 }
2087 
2088 
2089 void CFastaReader::x_CheckForPostponedMods(const string& idString,
2090  TSeqPos line_number,
2091  CModHandler::TModList& mods)
2092 {
2093  if (mods.empty() || m_PostponedMods.empty()) {
2094  return;
2095  }
2096 
2097  auto it = mods.begin();
2098  while(it != mods.end()) {
2099  if (m_PostponedMods.find(CModHandler::GetCanonicalName(it->GetName()))
2100  != m_PostponedMods.end()) {
2101 
2102  if (auto mit = m_PostponedModMap.find(idString);
2103  mit != m_PostponedModMap.end()) {
2104  mit->second.second.push_back(*it);
2105  }
2106  else {
2107  m_PostponedModMap[idString] = {line_number, {*it}};
2108  }
2109  it = mods.erase(it);
2110  } else {
2111  ++it;
2112  }
2113  }
2114 }
2115 
2116 
2118 {
2119  if( m_CurrentSeq && m_CurrentSeq->IsSetInst() &&
2121  {
2122  return ( m_CurrentSeq->GetInst().IsAa() ? "protein " : "nucleotide " );
2123  } else {
2124  return kEmptyStr;
2125  }
2126 }
2127 
2128 // static
2130 {
     // Returns a normalized copy of sValue: characters for which isupper()
     // is true are replaced by their tolower() result, spaces and
     // underscores become '-', and every other character is copied as is.
     // The result always has the same length as the input.
     // NOTE(review): isupper()/tolower() receive a plain char here; if char
     // is signed and the input holds bytes >= 0x80, the argument is negative,
     // which the C library leaves undefined -- consider casting to
     // unsigned char first.
2131  string newString;
2132  newString.reserve(sValue.length());
2133 
2134  ITERATE_0_IDX(ii, sValue.length()) {
2135  const char ch = sValue[ii];
2136  if( isupper(ch) ) {
2137  newString.push_back(tolower(ch));
2138  } else if( ch == ' ' || ch == '_' ) {
2139  newString.push_back('-');
2140  } else {
2141  newString.push_back(ch);
2142  }
2143  }
2144 
2145  return newString;
2146 }
2147 
2148 
2150  TSeqPos uPos,
2151  TSignedSeqPos uLen,
2152  EKnownSize eKnownSize,
2153  Uint8 uLineNumber,
2154  TNullableGapType pGapType,
2155  const set<CLinkage_evidence::EType> & setOfLinkageEvidence ) :
2156  m_uPos(uPos),
2157  m_uLen(uLen),
2158  m_eKnownSize(eKnownSize),
2159  m_uLineNumber(uLineNumber),
2160  m_pGapType(pGapType),
2161  m_setOfLinkageEvidence(setOfLinkageEvidence)
2162 {
     // Nothing to do: each member is initialized one-to-one from the
     // like-named parameter in the initializer list above; the
     // linkage-evidence set is copied.
2163 }
2164 
2165 
2168  const set<int>& defaultEvidence,
2169  const map<TSeqPos, set<int>>& countToEvidenceMap
2170 )
2171 {
2172  SetGapLinkageEvidences(type, defaultEvidence);
2173 
2175  for (const auto& key_val : countToEvidenceMap) {
2176  const auto& input_evidence_set = key_val.second;
2177  auto& evidence_set = m_GapsizeToLinkageEvidence[key_val.first];
2178  for (const auto& evidence : input_evidence_set) {
2179  evidence_set.insert(static_cast<CLinkage_evidence::EType>(evidence));
2180  }
2181  }
2182 }
2183 
2184 
2186 {
2188 
2190  for (const auto& evidence : evidences) {
2192  }
2193 }
2194 
2195 
2197  ILineErrorListener * pMessageListener,
2198  EDiagSev _eSeverity, size_t _uLineNum, CTempString _MessageStrmOps, CObjReaderParseException::EErrCode _eErrCode, ILineError::EProblem _eProblem, CTempString _sFeature, CTempString _sQualName, CTempString _sQualValue) const
2199 {
2200  if (find(m_ignorable.begin(), m_ignorable.end(), _eProblem) != m_ignorable.end())
2201  // this is a problem that should be ignored
2202  return;
2203 
2204  string sSeqId = ( m_BestID ? m_BestID->AsFastaString() : kEmptyStr);
2207  (_eSeverity), static_cast<unsigned>(_uLineNum),
2208  _MessageStrmOps,
2209  (_eProblem),
2210  sSeqId, (_sFeature),
2211  (_sQualName), (_sQualValue),
2212  _eErrCode) );
2213  if ( ! pMessageListener && (_eSeverity) <= eDiag_Warning ) {
2214  LOG_POST_X(1, Warning << pLineExpt->Message());
2215  } else if ( ! pMessageListener || ! pMessageListener->PutError( *pLineExpt ) )
2216  {
2217  throw CObjReaderParseException(DIAG_COMPILE_INFO, 0, _eErrCode, _MessageStrmOps, _uLineNum, _eSeverity);
2218  }
2219 }
2220 
2222 {
     // Record `problem` in m_ignorable. The message-posting routine in this
     // file returns early for any problem found in that list. Duplicates
     // are not filtered here.
2223  m_ignorable.push_back(problem);
2224 }
2225 
2226 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
static const NStr::TNumToStringFlags kFlags
void transform(Container &c, UnaryFunction *op)
Definition: chainer.hpp:86
AutoPtr –.
Definition: ncbimisc.hpp:401
const SBadResiduePositions & GetBadResiduePositions(void) const THROWS_NONE
const CSeq_id * GetFirstId() const
Definition: Bioseq.cpp:271
bool IsNa(void) const
Definition: Bioseq.cpp:345
void AddAndSplit(const CTempString &src, CSeq_data::E_Choice format, TSeqPos length, bool gaps_ok=false, bool allow_packing=true)
add a chunk of sequence, splitting it as necessary for the sake of compactness (isolating ambiguous p...
Definition: Delta_ext.cpp:183
CDelta_seq –.
Definition: Delta_seq.hpp:66
Helper class to build pairwise alignments, with double gaps automatically spliced out.
static TSeqPos ParseRange(const CTempString &s, TSeqPos &start, TSeqPos &end, ILineErrorListener *pMessageListener)
static size_t s_MaxGeneralTagLength
static size_t s_MaxAccessionLength
static void ParseDefline(const CTempString &defline, const SDeflineParseInfo &info, const TIgnoredProblems &ignoredErrors, TIds &ids, bool &hasRange, TSeqPos &rangeStart, TSeqPos &rangeEnd, TSeqTitles &seqTitles, ILineErrorListener *pMessageListener)
static size_t s_MaxLocalIDLength
bool CacheIdHandle(CSeq_id_Handle idh)
virtual CRef< CSeq_id > GenerateID(bool unique_id)
void SetGenerator(CSeqIdGenerator &generator)
SFastaFileMap * m_Map
Definition: fasta.cpp:1894
CFastaReader TParent
Definition: fasta.cpp:1879
void ParseDefLine(const TStr &s, ILineErrorListener *pMessageListener)
Definition: fasta.cpp:1908
void ParseTitle(const SLineTextAndLoc &lineInfo, ILineErrorListener *pMessageListener=0)
Definition: fasta.cpp:1919
SFastaFileMap::SFastaEntry m_MapEntry
Definition: fasta.cpp:1895
void AssembleSeq(ILineErrorListener *pMessageListener)
Definition: fasta.cpp:1926
CFastaMapper(ILineReader &reader, SFastaFileMap *fasta_map, TFlags flags, FIdCheck f_idcheck=CSeqIdCheck())
Definition: fasta.cpp:1898
Base class for reading FASTA sequences.
Definition: fasta.hpp:80
@ eST_Lax
Implement historic behavior, risking false positives.
@ eST_Strict
Require 100% encodability of printable non-digits.
@ eST_Default
Be relatively strict, but still allow for typos.
static ESequenceType SequenceType(const char *str, unsigned length=0, ESTStrictness strictness=eST_Default)
Guess sequence type.
static void Apply(const CModHandler &mod_handler, CBioseq &bioseq, TSkippedMods &skipped_mods, FPostMessage fPostMessage=nullptr)
Definition: mod_reader.cpp:450
void SetExcludedMods(const vector< string > &excluded_mods)
Definition: mod_reader.cpp:198
static const string & GetCanonicalName(const TModEntry &mod_entry)
Definition: mod_reader.cpp:393
void SetIgnoredMods(const list< string > &ignored_mods)
Definition: mod_reader.cpp:206
list< CModData > TModList
Definition: mod_reader.hpp:94
void AddMods(const TModList &mods, EHandleExisting handle_existing, TModList &rejected_mods, FReportError fReportError=nullptr)
Definition: mod_reader.cpp:221
void Clear(void)
Definition: mod_reader.cpp:387
static CObjReaderLineException * Create(EDiagSev eSeverity, unsigned int uLine, const std::string &strMessage, EProblem eProblem=eProblem_GeneralParsingError, const std::string &strSeqId=string(""), const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), CObjReaderLineException::EErrCode eErrCode=eFormat, const TVecOfLines &vecOfOtherLines=TVecOfLines())
Please use this instead of the constructor because the ctor is protected.
Definition: line_error.cpp:194
@ eFormat
Some of these are pretty specialized.
CObjectFor –.
Definition: ncbiobj.hpp:2335
Defines and provides stubs for a general interface to a variety of file readers.
Definition: reader_base.hpp:63
long TReaderFlags
Definition: reader_base.hpp:84
TReaderFlags m_iFlags
static void xAddStringFlagsWithMap(const list< string > &stringFlags, const map< string, TReaderFlags > flagMap, TReaderFlags &baseFlags)
void SetCounter(TCount count)
TCount GetCounter(void) const
Definition: Seq_entry.hpp:56
void Parentize(void)
Definition: Seq_entry.cpp:71
static const TGapTypeMap & GetNameToGapTypeInfoMap(void)
Use this when the caller needs the mapping from gap-type string to SGapTypeInfo directly (for example,...
Definition: Seq_gap.cpp:176
ELinkEvid
indicates which linkage-evidences a given gap-type can accept, if any
Definition: Seq_gap.hpp:75
@ eLinkEvid_UnspecifiedOnly
only the "unspecified" linkage-evidence is allowed
Definition: Seq_gap.hpp:77
@ eLinkEvid_Forbidden
no linkage-evidence is allowed
Definition: Seq_gap.hpp:79
@ eLinkEvid_Required
any linkage-evidence is allowed, and at least one is required
Definition: Seq_gap.hpp:81
static const SGapTypeInfo * NameToGapTypeInfo(const CTempString &sName)
From a gap-type string, get the SGapTypeInfo, insensitive to case, etc.
Definition: Seq_gap.cpp:158
static bool IsAa(EMol mol)
Definition: Seq_inst.hpp:99
static void Validate(const CSeq_data &in_seq, vector< TSeqPos > *badIdx, TSeqPos uBeginIdx=0, TSeqPos uLength=0)
static TSeqPos Pack(CSeq_data *in_seq, TSeqPos uLength=ncbi::numeric_limits< TSeqPos >::max())
TStack & m_Stack
Definition: fasta.cpp:134
CTempPusher(TStack &s, const TValue &v)
Definition: fasta.cpp:130
~CTempPusher()
Definition: fasta.cpp:131
TStack::value_type TValue
Definition: fasta.cpp:129
CRef< TObject > & m_pObj1
Definition: fasta.cpp:158
CRef< TObject > & m_pObj2
Definition: fasta.cpp:159
CTempRefSwap(CRef< TObject > &pObj1, CRef< TObject > &pObj2)
Definition: fasta.cpp:142
~CTempRefSwap(void)
Definition: fasta.cpp:151
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
static bool HasMods(const CTempString &title)
Definition: mod_reader.cpp:793
static void Apply(const CTempString &title, TModList &mods, string &remainder)
Definition: mod_reader.cpp:754
CUser_object & AddField(const string &label, const string &value, EParseField parse=eParse_String)
add a data field to the user object that holds a given value
Callback interface to scan fasta file for entries.
Definition: fasta.hpp:468
virtual bool PutError(const ILineError &)=0
Store the error in the container; return true if the error was stored successfully, and return false if the calle...
@ eProblem_InvalidResidue
Definition: line_error.hpp:80
@ eProblem_IgnoredResidue
Definition: line_error.hpp:89
@ eProblem_TooLong
Definition: line_error.hpp:76
@ eProblem_ParsingModifiers
Definition: line_error.hpp:86
@ eProblem_UnexpectedAminoAcids
Definition: line_error.hpp:78
@ eProblem_ModifierFoundButNoneExpected
Definition: line_error.hpp:81
@ eProblem_UnexpectedNucResidues
Definition: line_error.hpp:77
@ eProblem_TooManyAmbiguousResidues
Definition: line_error.hpp:79
@ eProblem_UnrecognizedQualifierName
Definition: line_error.hpp:59
@ eProblem_NonPositiveLength
Definition: line_error.hpp:85
@ eProblem_ContradictoryModifiers
Definition: line_error.hpp:87
@ eProblem_ExpectedModifierMissing
Definition: line_error.hpp:83
@ eProblem_ExtraModifierFound
Definition: line_error.hpp:82
Abstract base class for lightweight line-by-line reading.
Definition: line_reader.hpp:54
virtual bool SevEnabled(EDiagSev severity) const
Definition: listener.cpp:43
void erase(iterator pos)
Definition: map.hpp:167
size_type size() const
Definition: map.hpp:148
container_type::const_iterator const_iterator
Definition: map.hpp:53
const_iterator begin() const
Definition: map.hpp:151
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
bool empty() const
Definition: map.hpp:149
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator begin() const
Definition: set.hpp:135
void clear()
Definition: set.hpp:153
size_type size() const
Definition: set.hpp:132
bool empty() const
Definition: set.hpp:133
static uch flags
ECharType
Porter's Stemming Algorithm.
static bool s_ASCII_IsUnAmbigNuc(unsigned char c)
Definition: fasta.cpp:210
bool s_ASCII_IsAmbigNuc(unsigned char c)
Definition: fasta.cpp:189
bool s_ASCII_IsUpper(unsigned char c)
Definition: fasta.cpp:168
unsigned char s_ASCII_MustBeLowerToUpper(unsigned char c)
Definition: fasta.cpp:184
CTempPusher< stack< CFastaReader::TFlags > > CFlagGuard
Definition: fasta.cpp:162
#define FASTA_WARNING(_uLineNum, _MessageStrmOps, _eProblem, _Feature)
Definition: fasta.cpp:111
#define FASTA_PROGRESS(_MessageStrmOps)
Definition: fasta.cpp:101
bool s_ASCII_IsLower(unsigned char c)
Definition: fasta.cpp:173
static void s_AppendMods(const CModHandler::TModList &mods, string &title)
Definition: fasta.cpp:1975
#define FASTA_WARNING_EX(_uLineNum, _MessageStrmOps, _eProblem, _Feature, _sQualName, _sQualValue)
Definition: fasta.cpp:114
static bool sRefineNaMol(const char *beginSeqData, const char *endSeqData, CBioseq &bioseq)
Definition: fasta.cpp:1525
bool s_ASCII_IsAlpha(unsigned char c)
Definition: fasta.cpp:178
#define FASTA_ERROR(_uLineNum, _MessageStrmOps, _eErrCode)
Definition: fasta.cpp:117
static void s_AddBiomol(CMolInfo::EBiomol biomol, CBioseq &bioseq)
Definition: fasta.cpp:1518
Operators to edit gaps in sequences.
Helper class to build pairwise alignments, with double gaps automatically spliced out.
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static char tmp[3200]
Definition: utf8.c:42
char data[12]
Definition: iconv.c:80
#define ITERATE_0_IDX(idx, up_to)
idx loops from 0 (inclusive) to up_to (exclusive)
Definition: ncbimisc.hpp:865
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:878
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
string
Definition: cgiapp.hpp:687
#define LOG_POST_X(err_subcode, message)
Definition: ncbidiag.hpp:553
#define DIAG_COMPILE_INFO
Make compile time diagnostic information object to use in CNcbiDiag and CException.
Definition: ncbidiag.hpp:170
EDiagSev
Severity level for the posted diagnostics.
Definition: ncbidiag.hpp:650
@ eDiag_Info
Informational message.
Definition: ncbidiag.hpp:651
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
#define NCBI_THROW2(exception_class, err_code, message, extra)
Throw exception with extra parameter.
Definition: ncbiexpt.hpp:1754
#define ENUM_METHOD_NAME(EnumName)
Definition: serialbase.hpp:994
virtual bool ParseGapLine(const TStr &s, ILineErrorListener *pMessageListener)
Definition: fasta.cpp:1094
TRowNum m_Row
Definition: fasta.hpp:408
virtual void ParseTitle(const SLineTextAndLoc &lineInfo, ILineErrorListener *pMessageListener)
Definition: fasta.cpp:670
TMasks * m_MaskVec
Definition: fasta.hpp:394
TFastaSeqIds all_seq_ids
List of all seq.ids.
Definition: fasta.hpp:458
TSeqTitles m_CurrentSeqTitles
Definition: fasta.hpp:422
bool TestFlag(EFlags flag) const
Definition: fasta.hpp:299
CModHandler m_ModHandler
Definition: fasta.hpp:318
TSeqPos m_CurrentGapLength
Definition: fasta.hpp:401
void SetIgnoredMods(const list< string > &ignored_mods)
Definition: fasta.cpp:1997
TSeqPos m_Offset
Definition: fasta.hpp:409
virtual bool IsValidLocalID(const TStr &s) const
Definition: fasta.cpp:688
const CSeqIdGenerator & GetIDGenerator(void) const
Definition: fasta.hpp:173
virtual void CheckDataLine(const TStr &s, ILineErrorListener *pMessageListener)
Definition: fasta.cpp:710
virtual CRef< CSeq_entry > ReadOneSeq(ILineErrorListener *pMessageListener=nullptr)
Read a single effective sequence, which may turn out to be a segmented set.
Definition: fasta.cpp:312
void SetGapLinkageEvidence(CSeq_gap::EType type, const set< int > &defaultEvidence, const map< TSeqPos, set< int >> &countToEvidenceMap)
Definition: fasta.cpp:2166
virtual ~CFastaReader(void)
Definition: fasta.cpp:285
static size_t ParseRange(const TStr &s, TSeqPos &start, TSeqPos &end, ILineErrorListener *pMessageListener)
Definition: fasta.cpp:658
SGap::TNullableGapType m_gap_type
Definition: fasta.hpp:420
vector< TMask > TMasks
Definition: fasta.hpp:160
virtual CRef< CSerialObject > ReadObject(ILineReader &lr, ILineErrorListener *pErrors)
CReaderBase overrides.
Definition: fasta.cpp:296
int TRowNum
Definition: fasta.hpp:272
std::string x_NucOrProt(void) const
Definition: fasta.cpp:2117
void x_ApplyMods(const string &title, TSeqPos line_number, CBioseq &bioseq, ILineErrorListener *pMessageListener)
Definition: fasta.cpp:2018
TSeqPos m_TotalGapLength
Definition: fasta.hpp:402
TFlags GetFlags(void) const
Definition: fasta.hpp:157
CRef< CSeq_entry > ReadSet(int max_seqs=kMax_Int, ILineErrorListener *pMessageListener=nullptr)
Read multiple sequences (by default, as many as are available.)
Definition: fasta.cpp:442
static string CanonicalizeString(const CTempString &sValue)
Definition: fasta.cpp:2129
CSeqIdGenerator & SetIDGenerator(void)
Definition: fasta.hpp:174
static CRef< ILineReader > New(const string &filename)
Return a new ILineReader object corresponding to the given filename, taking "-" (but not "....
Definition: line_reader.cpp:49
void SaveMasks(TMasks *masks)
Definition: fasta.hpp:168
long TFlags
binary OR of EFlags
Definition: fasta.hpp:117
virtual CRef< CSeq_entry > ReadSeqEntry(ILineReader &lr, ILineErrorListener *pErrors)
Read an object from a given line reader, render it as a single Seq-entry, if possible.
Definition: fasta.cpp:304
SGap::TLinkEvidSet m_DefaultLinkageEvidence
Definition: fasta.hpp:418
virtual void x_CloseMask(void)
Definition: fasta.cpp:1085
virtual void UngetLine(void)=0
Unget current line, which must be valid.
virtual bool CreateWarningsForSeqDataInTitle(const TStr &sLineText, TSeqPos iLineNum, ILineErrorListener *pMessageListener) const
Definition: fasta.cpp:1617
TSeqPos m_CurrentPos
Definition: fasta.hpp:398
void SetGapLinkageEvidences(CSeq_gap::EType type, const set< int > &evidences)
Definition: fasta.cpp:2185
CRef< CSeq_loc > SaveMask(void)
Directs the *following* call to ReadOneSeq to note the locations of lowercase letters.
Definition: fasta.cpp:474
TSeqPos m_gapNmin
Definition: fasta.hpp:403
CRef< CFastaIdHandler > m_IDHandler
Definition: fasta.hpp:395
SGap(TSeqPos pos, TSignedSeqPos len, EKnownSize eKnownSize, Uint8 uLineNumber, TNullableGapType pGapType=TNullableGapType(), const set< CLinkage_evidence::EType > &setOfLinkageEvidence=set< CLinkage_evidence::EType >())
Definition: fasta.cpp:2149
TSeqPos m_gap_Unknown_length
Definition: fasta.hpp:404
TSeqPos m_SegmentBase
Definition: fasta.hpp:400
CRef< CSeq_entry > x_ReadSeqsToAlign(TIds &ids, ILineErrorListener *pMessageListener)
Definition: fasta.cpp:1708
static void AddStringFlags(const list< string > &stringFlags, TFlags &baseFlags)
Definition: fasta.cpp:222
virtual void AssembleSeq(ILineErrorListener *pMessageListener)
Definition: fasta.cpp:1351
CRef< ILineReader > m_LineReader
Definition: fasta.hpp:389
void x_AddMultiwayAlignment(CSeq_annot &annot, const TIds &ids)
Definition: fasta.cpp:1805
CRef< CSeq_id > m_BestID
Definition: fasta.hpp:406
const CSeq_id & GetBestID(void) const
Definition: fasta.hpp:171
CBioseq::TId & SetIDs(void)
Definition: fasta.hpp:303
stack< TFlags > m_Flags
Definition: fasta.hpp:390
TSeqPos GetCurrentPos(EPosType pos_type)
Definition: fasta.hpp:503
void CloseMask(void)
Definition: fasta.hpp:291
const TPostponedModMap & GetPostponedModMap() const
Definition: fasta.cpp:2012
void x_CheckForPostponedMods(const string &idString, TSeqPos line_number, CModHandler::TModList &mods)
Definition: fasta.cpp:2089
TPostponedModMap m_PostponedModMap
Definition: fasta.hpp:427
virtual void PostProcessIDs(const CBioseq::TId &defline_ids, const string &defline, bool has_range=false, TSeqPos range_start=kInvalidSeqPos, TSeqPos range_end=kInvalidSeqPos)
Definition: fasta.cpp:619
TGaps m_Gaps
Definition: fasta.hpp:397
CRef< CBioseq > m_CurrentSeq
Definition: fasta.hpp:391
TMapVector file_map
Definition: fasta.hpp:463
virtual void EntryFound(CRef< CSeq_entry > se, CNcbiStreampos stream_position)=0
Callback function, called after reading the fasta entry.
void SetMinGaps(TSeqPos gapNmin, TSeqPos gap_Unknown_length)
Definition: fasta.cpp:290
TMask m_CurrentMask
Definition: fasta.hpp:392
vector< CRef< CSeq_id > > TIds
Definition: fasta.hpp:276
virtual void x_OpenMask(void)
Definition: fasta.cpp:1079
TStartsMap m_Starts
Definition: fasta.hpp:407
bool xSetSeqMol(const list< CRef< CSeq_id >> &ids, CSeq_inst_Base::EMol &mol)
Definition: fasta.cpp:548
void OpenMask(void)
Definition: fasta.hpp:495
CRef< CSeq_entry > ReadAlignedSet(int reference_row, ILineErrorListener *pMessageListener=nullptr)
Read as many sequences as are available, and interpret them as an alignment, with hyphens marking rel...
Definition: fasta.cpp:1683
void x_AddMods(TSeqPos line_number, CBioseq &bioseq, string &processed_title, ILineErrorListener *pMessageListener)
Definition: fasta.cpp:2047
string m_SeqData
Definition: fasta.hpp:396
virtual void ParseDataLine(const TStr &s, ILineErrorListener *pMessageListener)
Definition: fasta.cpp:774
std::vector< ILineError::EProblem > m_ignorable
Definition: fasta.hpp:423
vector< TGapRef > TGaps
Definition: fasta.hpp:386
virtual void x_CloseGap(TSeqPos len, bool atStartOfLine, ILineErrorListener *pMessageListener)
Definition: fasta.cpp:1018
CNcbiStreampos stream_offset
Molecule offset in file.
Definition: fasta.hpp:457
vector< ILineError::EProblem > TIgnoredProblems
Definition: fasta.hpp:220
void x_AddPairwiseAlignments(CSeq_annot &annot, const TIds &ids, TRowNum reference_row)
Definition: fasta.cpp:1754
CRef< CSeq_align > xCreateAlignment(CRef< CSeq_id > old_id, CRef< CSeq_id > new_id, TSeqPos range_start, TSeqPos range_end)
Definition: fasta.cpp:521
TSeqPos m_MaskRangeStart
Definition: fasta.hpp:399
void SetExcludedMods(const vector< string > &excluded_mods)
Definition: fasta.cpp:1991
static void ParseDefLine(const TStr &defLine, const SDefLineParseInfo &info, const TIgnoredProblems &ignoredErrors, list< CRef< CSeq_id >> &ids, bool &hasRange, TSeqPos &rangeStart, TSeqPos &rangeEnd, TSeqTitles &seqTitles, ILineErrorListener *pMessageListener)
Definition: fasta.cpp:496
bool AtEOF(void) const
Indicates (negatively) whether there is any more input.
Definition: fasta.hpp:141
char m_CurrentGapChar
Definition: fasta.hpp:405
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
void SetMaxIDLength(Uint4 max_len)
If this is set, an exception will be thrown if a Sequence ID exceeds the given length.
Definition: fasta.cpp:485
CFastaReader(ILineReader &reader, TFlags flags=0, FIdCheck f_idcheck=CSeqIdCheck())
Definition: fasta.cpp:259
void ScanFastaFile(IFastaEntryScan *scanner, CNcbiIfstream &input, CFastaReader::TFlags fread_flags)
Scan FASTA files, call IFastaEntryScan::EntryFound (payload function)
Definition: fasta.cpp:1948
Uint4 m_MaxIDLength
Definition: fasta.hpp:413
Uint8 LineNumber(void) const
Definition: fasta.hpp:295
Int8 StreamPosition(void) const
Definition: fasta.hpp:293
string seq_id
Primary sequence Id.
Definition: fasta.hpp:455
virtual char PeekChar(void) const =0
Returns the first character of the next string without consuming it.
CFastaDeflineReader::FIdCheck FIdCheck
Definition: fasta.hpp:123
void SetPostponedMods(const list< string > &postponed_mods)
Definition: fasta.cpp:2002
virtual ~IFastaEntryScan()
Definition: fasta.cpp:1871
void ReadFastaFileMap(SFastaFileMap *fasta_map, CNcbiIfstream &input)
Reads the input stream (assumed to be in FASTA format) one molecule entry after another, fillin...
Definition: fasta.cpp:1933
bool m_bModifiedMaxIdLength
Definition: fasta.hpp:336
virtual void AssignMolType(ILineErrorListener *pMessageListener)
Definition: fasta.cpp:1548
string description
Molecule description.
Definition: fasta.hpp:456
FIdCheck m_fIdCheck
Definition: fasta.hpp:424
void CloseGap(bool atStartOfLine=false, ILineErrorListener *pMessageListener=nullptr)
Definition: fasta.hpp:284
unordered_set< string > m_PostponedMods
Definition: fasta.hpp:426
TMask m_NextMask
Definition: fasta.hpp:393
ILineReader & GetLineReader(void)
Definition: fasta.hpp:298
void IgnoreProblem(ILineError::EProblem problem)
Definition: fasta.cpp:2221
const CBioseq::TId & GetIDs(void) const
Definition: fasta.hpp:170
void x_SetDeflineParseInfo(SDefLineParseInfo &info)
Definition: fasta.cpp:647
vector< SLineTextAndLoc > TSeqTitles
Definition: fasta.hpp:221
TCountToLinkEvidMap m_GapsizeToLinkageEvidence
Definition: fasta.hpp:416
virtual CT_POS_TYPE GetPosition(void) const =0
Return the current (absolute) position.
CRef< CSeq_entry > ReadFasta(CNcbiIstream &in, CFastaReader::TFlags flags, int *counter, CFastaReader::TMasks *lcv, ILineErrorListener *pMessageListener)
A const-correct replacement for the deprecated ReadFasta function.
Definition: fasta.cpp:1850
virtual void GenerateID(void)
Definition: fasta.cpp:703
virtual void PostWarning(ILineErrorListener *pMessageListener, EDiagSev _eSeverity, size_t _uLineNum, CTempString _MessageStrmOps, CObjReaderParseException::EErrCode _eErrCode, ILineError::EProblem _eProblem, CTempString _sFeature, CTempString _sQualName, CTempString _sQualValue) const
Definition: fasta.cpp:2196
@ fNoParseID
Generate an ID (whole defline -> title)
Definition: fasta.hpp:90
@ fQuickIDCheck
Just check local IDs' first characters.
Definition: fasta.hpp:110
@ fDLOptional
Don't require a leading defline.
Definition: fasta.hpp:96
@ fLaxGuess
Use legacy heuristic for guessing seq. type.
Definition: fasta.hpp:103
@ fHyphensIgnoreAndWarn
When a hyphen is encountered in seq data, ignore it but warn.
Definition: fasta.hpp:112
@ fRequireID
Reject deflines that lack IDs.
Definition: fasta.hpp:95
@ fSkipCheck
Skip (rudimentary) body content check.
Definition: fasta.hpp:98
@ fDisableNoResidues
If no residues found do not raise an error.
Definition: fasta.hpp:113
@ fUniqueIDs
Forbid duplicate IDs.
Definition: fasta.hpp:101
@ fLetterGaps
Parse runs of Ns when splitting data.
Definition: fasta.hpp:105
@ fIgnoreMods
Ignore mods entirely. Incompatible with fAddMods.
Definition: fasta.hpp:115
@ fAddMods
Parse defline mods and add to SeqEntry.
Definition: fasta.hpp:104
@ fNoUserObjs
Don't save raw deflines in User-objects.
Definition: fasta.hpp:106
@ fForceType
Force specified type regardless of accession.
Definition: fasta.hpp:89
@ fParseRawID
Try to identify raw accessions.
Definition: fasta.hpp:97
@ fNoSplit
Don't split out ambiguous sequence regions.
Definition: fasta.hpp:99
@ fUseIupacaa
If Prot, use iupacaa instead of the default ncbieaa.
Definition: fasta.hpp:111
@ fAssumeNuc
Assume nucs unless accns indicate otherwise.
Definition: fasta.hpp:87
@ fParseGaps
Make a delta sequence if gaps found.
Definition: fasta.hpp:91
@ fValidate
Check (alphabetic) residue validity.
Definition: fasta.hpp:100
@ fOneSeq
Just read the first sequence found.
Definition: fasta.hpp:92
@ fLeaveAsText
Don't reencode at all, just parse.
Definition: fasta.hpp:109
@ fAssumeProt
Assume prots unless accns indicate otherwise.
Definition: fasta.hpp:88
@ fNoSeqData
Parse the deflines but skip the data.
Definition: fasta.hpp:94
@ fStrictGuess
Assume no typos when guessing sequence type.
Definition: fasta.hpp:102
@ fDisableParseRange
Do not parse ranges out of seq-ids; any range text is kept as part of the seq-id instead.
Definition: fasta.hpp:114
@ ePosWithGapsAndSegs
Definition: fasta.hpp:311
@ ePosWithGaps
Definition: fasta.hpp:310
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
static bool IsValidLocalID(const CTempString &s)
Perform rudimentary validation on potential local IDs, whose contents should be pure ASCII and limite...
Definition: Seq_id.cpp:2576
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
string AsString(void) const
static int BestRank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:774
@ fAcc_prot
Definition: Seq_id.hpp:252
@ fAcc_nuc
Definition: Seq_id.hpp:251
void SetPacked_int(TPacked_int &v)
Definition: Seq_loc.hpp:984
void SetNull(void)
Override all setters to incorporate cache invalidation.
Definition: Seq_loc.hpp:960
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
void Swap(TThisType &ref)
Swaps the pointer with another reference.
Definition: ncbiobj.hpp:754
bool IsNull(void) const THROWS_NONE
Check if pointer is null – same effect as Empty().
Definition: ncbiobj.hpp:735
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define kMax_Int
Definition: ncbi_limits.h:184
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define kMax_UI4
Definition: ncbi_limits.h:219
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
IO_PREFIX::streampos CNcbiStreampos
Portable alias for streampos.
Definition: ncbistre.hpp:134
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
static string PrintableString(const CTempString str, TPrintableMode mode=fNewLine_Quote|fNonAscii_Passthru)
Get a printable version of the specified string.
Definition: ncbistr.cpp:3953
static CTempString TruncateSpaces_Unsafe(const CTempString str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3191
#define kEmptyStr
Definition: ncbistr.hpp:123
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3201
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
Definition: tempstr.hpp:334
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
size_t size_type
Definition: tempstr.hpp:70
size_type length(void) const
Return the length of the represented array.
Definition: tempstr.hpp:320
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
Definition: ncbistr.cpp:642
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
Definition: tempstr.hpp:776
TErrCode GetErrCode(void) const
Get error code.
Definition: ncbistr.hpp:4455
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
size_type find(const CTempString match, size_type pos=0) const
Find the first instance of the entire matching string within the current string, beginning at an opti...
Definition: tempstr.hpp:655
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:327
static const size_type npos
Definition: tempstr.hpp:72
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
@ fSplit_Truncate
Definition: ncbistr.hpp:2501
@ fSplit_MergeDelimiters
Merge adjacent delimiters.
Definition: ncbistr.hpp:2498
@ eTrunc_Begin
Truncate leading spaces only.
Definition: ncbistr.hpp:2240
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
Definition: ncbiutil.hpp:250
void SetType(TType &value)
Assign a value to Type data member.
@ eLim_unk
unknown
Definition: Int_fuzz_.hpp:210
TLens & SetLens(void)
Assign a value to Lens data member.
Definition: Dense_seg_.hpp:561
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
const TLens & GetLens(void) const
Get the Lens member data.
Definition: Dense_seg_.hpp:555
void SetDim(TDim value)
Assign a value to Dim data member.
Definition: Seq_align_.hpp:865
vector< TSignedSeqPos > TStarts
Definition: Dense_seg_.hpp:107
void SetDim(TDim value)
Assign a value to Dim data member.
Definition: Dense_seg_.hpp:427
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_align_.hpp:818
TStarts & SetStarts(void)
Assign a value to Starts data member.
Definition: Dense_seg_.hpp:536
TStrands & SetStrands(void)
Assign a value to Strands data member.
Definition: Dense_seg_.hpp:586
void SetNumseg(TNumseg value)
Assign a value to Numseg data member.
Definition: Dense_seg_.hpp:474
TIds & SetIds(void)
Assign a value to Ids data member.
Definition: Dense_seg_.hpp:511
@ eType_partial
mapping pieces together
Definition: Seq_align_.hpp:103
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
void SetExt(TExt &value)
Assign a value to Ext data member.
Definition: Seq_inst_.cpp:147
bool IsSetMol(void) const
Check if a value has been assigned to Mol data member.
Definition: Seq_inst_.hpp:593
E_Choice
Choice variants.
Definition: Seq_data_.hpp:102
list< CRef< CSeq_align > > TAlign
Definition: Seq_annot_.hpp:194
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_gap_.hpp:291
bool IsSetInst(void) const
The sequence data. Check if a value has been assigned to Inst data member.
Definition: Bioseq_.hpp:324
Tdata & Set(void)
Assign a value to data member.
Definition: Delta_ext_.hpp:170
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
EMol
molecule class in living organism
Definition: Seq_inst_.hpp:108
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
TUser & SetUser(void)
Select the variant.
Definition: Seqdesc_.cpp:390
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
const Tdata & Get(void) const
Get the member data.
Definition: Delta_ext_.hpp:164
void SetLength(TLength value)
Assign a value to Length data member.
Definition: Seq_inst_.hpp:668
TLinkage_evidence & SetLinkage_evidence(void)
Assign a value to Linkage_evidence data member.
Definition: Seq_gap_.hpp:375
void SetLinkage(TLinkage value)
Assign a value to Linkage data member.
Definition: Seq_gap_.hpp:338
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
Definition: Seq_inst_.cpp:130
void ResetExt(void)
Reset Ext data member.
Definition: Seq_inst_.cpp:142
list< CRef< CLinkage_evidence > > TLinkage_evidence
Definition: Seq_gap_.hpp:118
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eRepr_virtual
no seq data
Definition: Seq_inst_.hpp:93
@ e_Ncbieaa
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
@ e_Iupacna
IUPAC 1 letter nuc acid code.
Definition: Seq_data_.hpp:104
@ e_Iupacaa
IUPAC 1 letter amino acid code.
Definition: Seq_data_.hpp:105
@ eMol_not_set
Note: cdna = rna
Definition: Seq_inst_.hpp:109
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
Definition of all error codes used in objtools libraries.
static int input()
int i
yy_size_t n
int len
static MDB_envinfo info
Definition: mdb_load.c:37
constexpr auto front(list< Head, As... >, T=T()) noexcept -> Head
string_type::value_type char_type
The character type used by the parser.
Definition: muParserDef.h:246
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int tolower(Uchar c)
Definition: ncbictype.hpp:72
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
int toupper(Uchar c)
Definition: ncbictype.hpp:73
int isupper(Uchar c)
Definition: ncbictype.hpp:70
Useful/utility classes and methods.
T max(T x_, T y_)
T min(T x_, T y_)
Int mod(Int i, Int j)
Definition: njn_integer.hpp:67
static Format format
Definition: njn_ioutil.cpp:53
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
#define FIELD_CHAIN_OF_2_IS_SET(Var, Fld1, Fld2)
FIELD_CHAIN_OF_2_IS_SET.
void AddBadIndexMap(const TBadIndexMap &additionalBadIndexMap)
void ConvertBadIndexesToString(CNcbiOstream &out, unsigned int maxRanges=1000) const
Holds information about a given gap-type string.
Definition: Seq_gap.hpp:84
CSeq_gap::EType m_eType
The underlying type that the string corresponds to.
Definition: Seq_gap.hpp:86
ELinkEvid m_eLinkEvid
Indicates what linkage-evidences are compatible with this.
Definition: Seq_gap.hpp:88
Definition: type.c:6
#define _TROUBLE
#define _ASSERT
Modified on Wed Apr 17 13:10:02 2024 by modify_doxy.py rev. 669887