NCBI C++ ToolKit
readfeat.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1  /*
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jonathan Kans, Michael Kornbluh
27  *
28  * File Description:
29  * Feature table reader
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbithr.hpp>
36 
37 #include <util/static_map.hpp>
38 
39 #include <serial/iterator.hpp>
40 #include <serial/objistrasn.hpp>
41 
42 // Objects includes
48 
53 
57 #include <objects/pub/Pub.hpp>
59 #include <objects/seq/Pubdesc.hpp>
62 
81 
83 
86 
89 #include <objtools/error_codes.hpp>
90 
91 #include <algorithm>
92 #include <unordered_set>
93 
96 #include "best_feat_finder.hpp"
97 
98 #define NCBI_USE_ERRCODE_X Objtools_Rd_Feature
99 
100 
102 
103 BEGIN_objects_SCOPE // namespace ncbi::objects::
104 
105 
106 
107 namespace {
108  static const char * const kCdsFeatName = "CDS";
109  // priorities, inherited from C toolkit
110  static Uchar std_order[CSeq_id::e_MaxChoice] = {
111  83, /* 0 = not set */
112  80, /* 1 = local Object-id */
113  70, /* 2 = gibbsq */
114  70, /* 3 = gibbmt */
115  70, /* 4 = giim Giimport-id */
116  60, /* 5 = genbank */
117  60, /* 6 = embl */
118  60, /* 7 = pir */
119  60, /* 8 = swissprot */
120  81, /* 9 = patent */
121  65, /* 10 = other TextSeqId */
122  80, /* 11 = general Dbtag */
123  82, /* 12 = gi */
124  60, /* 13 = ddbj */
125  60, /* 14 = prf */
126  60, /* 15 = pdb */
127  60, /* 16 = tpg */
128  60, /* 17 = tpe */
129  60, /* 18 = tpd */
130  68, /* 19 = gpp */
131  69 /* 20 = nat */
132  };
133 
135 {
136  if (ids.size() == 1)
137  return ids.front();
138 
139  CRef<CSeq_id> id;
140  if (!ids.empty())
141  {
142  Uchar best_weight = UCHAR_MAX;
143  ITERATE(CBioseq::TId, it, ids)
144  {
145  Uchar new_weight = std_order[(*it)->Which()];
146  if (new_weight < best_weight)
147  {
148  id = *it;
149  best_weight = new_weight;
150  }
151  };
152  }
153 
154  return id;
155 }
156 
157 
// Maps each IUPAC nucleotide letter to the list of concrete bases it covers.
std::map<char, std::list<char>> s_IUPACmap
{
    // single, unambiguous bases
    {'A', {'A'}},
    {'G', {'G'}},
    {'C', {'C'}},
    {'T', {'T'}},
    {'U', {'U'}},
    // two-base ambiguity codes
    {'M', {'A', 'C'}},
    {'R', {'A', 'G'}},
    {'W', {'A', 'T'}},
    {'S', {'C', 'G'}},
    {'Y', {'C', 'T'}},
    {'K', {'G', 'T'}},
    // three-base ambiguity codes
    {'V', {'A', 'C', 'G'}},
    {'H', {'A', 'C', 'T'}},
    {'D', {'A', 'G', 'T'}},
    {'B', {'C', 'G', 'T'}},
    // any base
    {'N', {'A', 'C', 'G', 'T'}}
};
177 
178 }
179 
180 
181 class /* NCBI_XOBJREAD_EXPORT */ CFeatureTableReader_Imp
182 {
183 public:
184  enum EQual {
285  };
286 
287  enum EOrgRef {
294  };
295 
298 
299  // constructor
300  CFeatureTableReader_Imp(ILineReader* reader, unsigned int line_num, ILineErrorListener* pMessageListener);
301  // destructor
303 
304  // read 5-column feature table and return Seq-annot
306  const CTempString& annotname,
307  const TFlags flags,
308  ITableFilter *filter);
309 
310  // create single feature from key
311  CRef<CSeq_feat> CreateSeqFeat (const string& feat,
313  const TFlags flags,
314  const string &seq_id,
315  ITableFilter *filter);
316 
317  // add single qualifier to feature
318  void AddFeatQual (CRef<CSeq_feat> sfp,
319  const string& feat_name,
320  const string& qual,
321  const string& val,
322  const TFlags flags,
323  const string &seq_id );
324 
325  static bool ParseInitialFeatureLine (
326  const CTempString& line_arg,
327  CTempStringEx& out_seqid,
328  CTempStringEx& out_annotname );
329 
330  static void PutProgress(const CTempString& seq_id,
331  const unsigned int line_number,
332  ILineErrorListener* pListener);
333 
335  return m_reader;
336  }
337 
339  return m_pMessageListener;
340  }
341 
342 private:
343 
344  // Prohibit copy constructor and assignment operator
347 
348  void x_InitId(const CTempString& seq_id, const TFlags flags);
349  // returns true if parsed (otherwise, out_offset is left unchanged)
350  bool x_TryToParseOffset(const CTempString & sLine, Int4 & out_offset );
351 
352 
353  struct SFeatLocInfo {
358  bool is_point;
360  };
361 
362 
364  const CTempString& line,
365  SFeatLocInfo& loc_info,
366  string& feat,
367  string& qual,
368  string& val,
369  Int4 offset);
370 
371 
372  bool x_IsWebComment(CTempString line);
373 
375  CTempString strFeatureName,
376  CRef<CSeq_feat>& sfp,
377  const SFeatLocInfo& loc_info);
378 
380  const string &feat_name,
381  const string& qual, const string& val,
382  const TFlags flags);
383 
384  void x_ProcessQualifier(const string& qual_name,
385  const string& qual_val,
386  const string& feat_name,
387  CRef<CSeq_feat> feat,
388  TFlags flags);
389 
390  bool x_AddQualifierToGene (CSeqFeatData& sfdata,
391  EQual qtype, const string& val);
393  EQual qtype, const string& val);
395  EQual qtype, const string& val);
397  EQual qtype, const string& qual, const string& val);
398  bool x_AddQualifierToBioSrc (CSeqFeatData& sfdata,
399  const string &feat_name,
400  EOrgRef rtype, const string& val);
401  bool x_AddQualifierToBioSrc (CSeqFeatData& sfdata,
402  CSubSource::ESubtype stype, const string& val);
403  bool x_AddQualifierToBioSrc (CSeqFeatData& sfdata,
404  COrgMod::ESubtype mtype, const string& val);
405 
406  bool x_AddNoteToFeature(CRef<CSeq_feat> sfp, const string& note);
407 
409  const string& feat_name,
410  const string& qual,
411  const string& val);
412 
414  const string& qual, const string& val);
415 
416  bool x_AddCodons(const string& val, CTrna_ext& trna_ext) const;
417 
421  TFeatConstRef pFeat,
422  TSeqPos uLineNum ) :
423  m_pFeat(pFeat), m_uLineNum(uLineNum) {
424  _ASSERT(pFeat);
425  }
426 
427  bool operator==(const SFeatAndLineNum & rhs) const {
428  return Compare(rhs) == 0; }
429  bool operator!=(const SFeatAndLineNum & rhs) const {
430  return Compare(rhs) != 0; }
431  bool operator<(const SFeatAndLineNum & rhs) const {
432  return Compare(rhs) < 0; }
433 
434  int Compare(const SFeatAndLineNum & rhs) const {
435  if( m_uLineNum != rhs.m_uLineNum ) {
436  return ( m_uLineNum < rhs.m_uLineNum ? -1 : 1 );
437  }
438  return (m_pFeat.GetPointerOrNull() < rhs.m_pFeat.GetPointerOrNull() ? -1 : 1 );
439  }
440 
441  TFeatConstRef m_pFeat; // must be non-NULL
442  TSeqPos m_uLineNum; // the line where this feature was created (or zero if programmatically created)
443  };
446  CRef<CSeq_annot> sap,
447  TChoiceToFeatMap & choiceToFeatMap, // an input param, but might get more items added
448  const TFlags flags);
449 
450  bool x_StringIsJustQuotes (const string& str);
451 
452  string x_TrnaToAaString(const string& val);
453 
454  bool x_ParseTrnaExtString(CTrna_ext & ext_trna, const string & str);
455  SIZE_TYPE x_MatchingParenPos( const string &str, SIZE_TYPE open_paren_pos );
456 
457  long x_StringToLongNoThrow (
458  CTempString strToConvert,
459  CTempString strFeatureName,
460  CTempString strQualifierName,
461  // user can override the default problem types that are set on error
463  );
464 
465  bool x_SetupSeqFeat (CRef<CSeq_feat> sfp, const string& feat,
466  const TFlags flags,
467  ITableFilter *filter);
468 
470  ILineError::EProblem eProblem,
471  EDiagSev eSeverity,
472  const std::string & strFeatureName = kEmptyStr,
473  const std::string & strQualifierName = kEmptyStr,
474  const std::string & strQualifierValue = kEmptyStr,
475  const std::string & strErrorMessage = kEmptyStr,
476  const ILineError::TVecOfLines & vecOfOtherLines =
478 
480  int line_num,
481  ILineError::EProblem eProblem,
482  EDiagSev eSeverity,
483  const std::string & strFeatureName = kEmptyStr,
484  const std::string & strQualifierName = kEmptyStr,
485  const std::string & strQualifierValue = kEmptyStr,
486  const std::string & strErrorMessage = kEmptyStr,
487  const ILineError::TVecOfLines & vecOfOtherLines =
489 
490  void x_TokenizeStrict( const CTempString &line, vector<string> &out_tokens );
491  void x_TokenizeLenient( const CTempString &line, vector<string> &out_tokens );
493  void x_ResetFeat(CRef<CSeq_feat>& feat, bool & curr_feat_intervals_done);
494  void x_UpdatePointStrand(CSeq_feat& feat, CSeq_interval::TStrand strand) const;
495  void x_GetPointStrand(const CSeq_feat& feat, CSeq_interval::TStrand& strand) const;
496 
498  string m_real_seqid;
501  unsigned int m_LineNumber;
503  unordered_set<string> m_ProcessedTranscriptIds;
504  unordered_set<string> m_ProcessedProteinIds;
505 };
506 
507 
509 
510 static const TQualKey qual_key_to_subtype [] = {
512  { "PCR_conditions", CFeatureTableReader_Imp::eQual_PCR_conditions },
524  { "codon_recognized", CFeatureTableReader_Imp::eQual_codon_recognized },
526  { "codons_recognized", CFeatureTableReader_Imp::eQual_codon_recognized },
533  { "estimated_length", CFeatureTableReader_Imp::eQual_estimated_length },
544  { "gene_synonym", CFeatureTableReader_Imp::eQual_gene_syn },
552  { "linkage_evidence", CFeatureTableReader_Imp::eQual_linkage_evidence },
558  { "mobile_element_type", CFeatureTableReader_Imp::eQual_mobile_element_type },
577  { "regulatory_class", CFeatureTableReader_Imp::eQual_regulatory_class },
579  { "ribosomal_slippage", CFeatureTableReader_Imp::eQual_ribosomal_slippage },
583  { "rpt_unit_range", CFeatureTableReader_Imp::eQual_rpt_unit_range },
587  { "secondary_accession", CFeatureTableReader_Imp::eQual_secondary_accession },
588  { "secondary_accessions", CFeatureTableReader_Imp::eQual_secondary_accession },
602  { "trans_splicing", CFeatureTableReader_Imp::eQual_trans_splicing },
611 };
612 
615 
616 
618 
626 };
627 
630 
631 
633 
635  { "apicoplast", CBioSource::eGenome_apicoplast },
636  { "chloroplast", CBioSource::eGenome_chloroplast },
637  { "chromatophore", CBioSource::eGenome_chromatophore },
638  { "chromoplast", CBioSource::eGenome_chromoplast },
639  { "chromosome", CBioSource::eGenome_chromosome },
640  { "cyanelle", CBioSource::eGenome_cyanelle },
641  { "endogenous_virus", CBioSource::eGenome_endogenous_virus },
642  { "extrachrom", CBioSource::eGenome_extrachrom },
643  { "genomic", CBioSource::eGenome_genomic },
644  { "hydrogenosome", CBioSource::eGenome_hydrogenosome },
645  { "insertion_seq", CBioSource::eGenome_insertion_seq },
646  { "kinetoplast", CBioSource::eGenome_kinetoplast },
647  { "leucoplast", CBioSource::eGenome_leucoplast },
648  { "macronuclear", CBioSource::eGenome_macronuclear },
649  { "mitochondrion", CBioSource::eGenome_mitochondrion },
650  { "mitochondrion:kinetoplast", CBioSource::eGenome_kinetoplast },
651  { "nucleomorph", CBioSource::eGenome_nucleomorph },
652  { "plasmid", CBioSource::eGenome_plasmid },
653  { "plastid", CBioSource::eGenome_plastid },
654  { "plastid:apicoplast", CBioSource::eGenome_apicoplast },
655  { "plastid:chloroplast", CBioSource::eGenome_chloroplast },
656  { "plastid:chromoplast", CBioSource::eGenome_chromoplast },
657  { "plastid:cyanelle", CBioSource::eGenome_cyanelle },
658  { "plastid:leucoplast", CBioSource::eGenome_leucoplast },
659  { "plastid:proplastid", CBioSource::eGenome_proplastid },
660  { "proplastid", CBioSource::eGenome_proplastid },
661  { "proviral", CBioSource::eGenome_proviral },
662  { "transposon", CBioSource::eGenome_transposon },
663  { "unknown", CBioSource::eGenome_unknown },
664  { "virion", CBioSource::eGenome_virion }
665 };
666 
669 
670 
672 
674  { "altitude", CSubSource::eSubtype_altitude },
675  { "cell_line", CSubSource::eSubtype_cell_line },
676  { "cell_type", CSubSource::eSubtype_cell_type },
677  { "chromosome", CSubSource::eSubtype_chromosome },
678  { "clone", CSubSource::eSubtype_clone },
679  { "clone_lib", CSubSource::eSubtype_clone_lib },
680  { "collected_by", CSubSource::eSubtype_collected_by },
681  { "collection_date", CSubSource::eSubtype_collection_date },
682  { "country", CSubSource::eSubtype_country },
683  { "dev_stage", CSubSource::eSubtype_dev_stage },
684  { "endogenous_virus", CSubSource::eSubtype_endogenous_virus_name },
685  { "environmental_sample", CSubSource::eSubtype_environmental_sample },
686  { "frequency", CSubSource::eSubtype_frequency },
687  { "fwd_primer_name", CSubSource::eSubtype_fwd_primer_name },
688  { "fwd_primer_seq", CSubSource::eSubtype_fwd_primer_seq },
689  { "genotype", CSubSource::eSubtype_genotype },
690  { "geo_loc_name", CSubSource::eSubtype_country },
691  { "germline", CSubSource::eSubtype_germline },
692  { "haplotype", CSubSource::eSubtype_haplotype },
693  { "identified_by", CSubSource::eSubtype_identified_by },
694  { "insertion_seq", CSubSource::eSubtype_insertion_seq_name },
695  { "isolation_source", CSubSource::eSubtype_isolation_source },
696  { "lab_host", CSubSource::eSubtype_lab_host },
697  { "lat_lon", CSubSource::eSubtype_lat_lon },
698  { "map", CSubSource::eSubtype_map },
699  { "metagenomic", CSubSource::eSubtype_metagenomic },
700  { "plasmid", CSubSource::eSubtype_plasmid_name },
701  { "plastid", CSubSource::eSubtype_plastid_name },
702  { "pop_variant", CSubSource::eSubtype_pop_variant },
703  { "rearranged", CSubSource::eSubtype_rearranged },
704  { "rev_primer_name", CSubSource::eSubtype_rev_primer_name },
705  { "rev_primer_seq", CSubSource::eSubtype_rev_primer_seq },
706  { "segment", CSubSource::eSubtype_segment },
707  { "sex", CSubSource::eSubtype_sex },
708  { "subclone", CSubSource::eSubtype_subclone },
709  { "tissue_lib ", CSubSource::eSubtype_tissue_lib },
710  { "tissue_type", CSubSource::eSubtype_tissue_type },
711  { "transgenic", CSubSource::eSubtype_transgenic },
712  { "transposon", CSubSource::eSubtype_transposon_name }
713 };
714 
717 
718 // case-insensitive version of sm_SubSrcKeys
721  TSubSrcNoCaseMap, sm_SubSrcNoCaseKeys, subsrc_key_to_subtype);
722 
724 
726  { "acronym", COrgMod::eSubtype_acronym },
727  { "anamorph", COrgMod::eSubtype_anamorph },
728  { "authority", COrgMod::eSubtype_authority },
729  { "bio_material", COrgMod::eSubtype_bio_material },
730  { "biotype", COrgMod::eSubtype_biotype },
731  { "biovar", COrgMod::eSubtype_biovar },
732  { "breed", COrgMod::eSubtype_breed },
733  { "chemovar", COrgMod::eSubtype_chemovar },
734  { "common", COrgMod::eSubtype_common },
735  { "cultivar", COrgMod::eSubtype_cultivar },
736  { "culture_collection", COrgMod::eSubtype_culture_collection },
737  { "dosage", COrgMod::eSubtype_dosage },
738  { "ecotype", COrgMod::eSubtype_ecotype },
739  { "forma", COrgMod::eSubtype_forma },
740  { "forma_specialis", COrgMod::eSubtype_forma_specialis },
741  { "gb_acronym", COrgMod::eSubtype_gb_acronym },
742  { "gb_anamorph", COrgMod::eSubtype_gb_anamorph },
743  { "gb_synonym", COrgMod::eSubtype_gb_synonym },
744  { "group", COrgMod::eSubtype_group },
745  { "isolate", COrgMod::eSubtype_isolate },
746  { "metagenome_source", COrgMod::eSubtype_metagenome_source },
747  { "nat_host", COrgMod::eSubtype_nat_host },
748  { "natural_host", COrgMod::eSubtype_nat_host },
749  { "old_lineage", COrgMod::eSubtype_old_lineage },
750  { "old_name", COrgMod::eSubtype_old_name },
751  { "pathovar", COrgMod::eSubtype_pathovar },
752  { "serogroup", COrgMod::eSubtype_serogroup },
753  { "serotype", COrgMod::eSubtype_serotype },
754  { "serovar", COrgMod::eSubtype_serovar },
755  { "spec_host", COrgMod::eSubtype_nat_host },
756  { "specific_host", COrgMod::eSubtype_nat_host },
757  { "specimen_voucher", COrgMod::eSubtype_specimen_voucher },
758  { "strain", COrgMod::eSubtype_strain },
759  { "sub_species", COrgMod::eSubtype_sub_species },
760  { "subgroup", COrgMod::eSubtype_subgroup },
761  { "substrain", COrgMod::eSubtype_substrain },
762  { "subtype", COrgMod::eSubtype_subtype },
763  { "synonym", COrgMod::eSubtype_synonym },
764  { "teleomorph", COrgMod::eSubtype_teleomorph },
765  { "type", COrgMod::eSubtype_type },
766  { "type_material", COrgMod::eSubtype_type_material },
767  { "variety", COrgMod::eSubtype_variety }
768 };
769 
772 
774 {
775  { "Ala", 'A' },
776  { "Alanine", 'A' },
777  { "Arg", 'R' },
778  { "Arginine", 'R' },
779  { "Asn", 'N' },
780  { "Asp", 'D' },
781  { "Asp or Asn", 'B' },
782  { "Asparagine", 'N' },
783  { "Aspartate", 'D' },
784  { "Aspartic Acid", 'D' },
785  { "Asx", 'B' },
786  { "Cys", 'C' },
787  { "Cysteine", 'C' },
788  { "Gln", 'Q' },
789  { "Glu", 'E' },
790  { "Glu or Gln", 'Z' },
791  { "Glutamate", 'E' },
792  { "Glutamic Acid", 'E' },
793  { "Glutamine", 'Q' },
794  { "Glx", 'Z' },
795  { "Gly", 'G' },
796  { "Glycine", 'G' },
797  { "His", 'H' },
798  { "Histidine", 'H' },
799  { "Ile", 'I' },
800  { "Ile2", 'I' },
801  { "Isoleucine", 'I' },
802  { "Leu", 'L' },
803  { "Leu or Ile", 'J' },
804  { "Leucine", 'L' },
805  { "Lys", 'K' },
806  { "Lysine", 'K' },
807  { "Met", 'M' },
808  { "Methionine", 'M' },
809  { "OTHER", 'X' },
810  { "Phe", 'F' },
811  { "Phenylalanine", 'F' },
812  { "Pro", 'P' },
813  { "Proline", 'P' },
814  { "Pyl", 'O' },
815  { "Pyrrolysine", 'O' },
816  { "Sec", 'U' },
817  { "Selenocysteine", 'U' },
818  { "Ser", 'S' },
819  { "Serine", 'S' },
820  { "TERM", '*' },
821  { "Ter", '*' },
822  { "Termination", '*' },
823  { "Thr", 'T' },
824  { "Threonine", 'T' },
825  { "Trp", 'W' },
826  { "Tryptophan", 'W' },
827  { "Tyr", 'Y' },
828  { "Tyrosine", 'Y' },
829  { "Val", 'V' },
830  { "Valine", 'V' },
831  { "Xle", 'J' },
832  { "Xxx", 'X' },
833  { "Undet", 'X' },
834  { "fMet", 'M' },
835  { "iMet", 'M' }
836 };
837 
838 
839 static
842  "environmental_sample",
843  "germline",
844  "metagenomic",
845  "partial",
846  "pseudo",
847  "rearranged",
848  "ribosomal_slippage",
849  "trans_splicing",
850  "transgenic",
851  "replace" // RW-882
852 };
853 
854 // constructor
856  : m_reader(reader), m_LineNumber(line_num), m_pMessageListener(pMessageListener)
857 {
858 }
859 
860 // destructor
862 {
863 }
864 
865 
867  const CTempString & sLine, Int4 & out_offset )
868 {
869  // offset strings are of the form [offset=SOME_NUMBER], but here we try
870  // to be as forgiving of whitespace as possible.
871 
872  CTempString sKey;
873  CTempString sValue;
874  if( ! NStr::SplitInTwo(sLine, "=", sKey, sValue) ) {
875  // "=" not found
876  return false;
877  }
878 
879  // check key
881  if( NStr::StartsWith(sKey, "[") ) {
882  sKey = sKey.substr(1); // remove initial "["
883  }
885  if( ! NStr::EqualNocase(sKey, "offset") ) {
886  // key is not offset
887  return false;
888  }
889 
890  // check value
892  if( ! NStr::EndsWith(sValue, "]") ) {
893  // no closing bracket
894  return false;
895  }
896  // remove closing bracket
897  sValue = sValue.substr(0, (sValue.length() - 1) );
899  // is it a number?
900  try {
901  Int4 new_offset = NStr::StringToInt(sValue);
902  // if( new_offset < 0 ) {
903  // return false;
904  // }
905  out_offset = new_offset;
906  return true;
907  } catch ( CStringException & ) {
908  return false;
909  }
910 }
911 
913  const CTempString& line,
914  SFeatLocInfo& loc_info,
915  string& featP,
916  string& qualP,
917  string& valP,
918  Int4 offset
919 )
920 
921 {
922  SIZE_TYPE numtkns;
923  bool isminus = false;
924  bool ispoint = false;
925  size_t len;
926  bool partial5 = false;
927  bool partial3 = false;
928  Int4 startv = -1;
929  Int4 stopv = -1;
930  Int4 swp;
931  string start, stop, feat, qual, val, stnd;
932  vector<string> tkns;
933 
934 
935  if (line.empty ()) return false;
936 
937  /* offset and other instructions encoded in brackets */
938  if (NStr::StartsWith (line, '[')) return false;
939 
940  tkns.clear ();
941  x_TokenizeLenient(line, tkns);
942  numtkns = tkns.size ();
943 
944  if (numtkns > 0) {
945  start = NStr::TruncateSpaces(tkns[0]);
946  }
947  if (numtkns > 1) {
948  stop = NStr::TruncateSpaces(tkns[1]);
949  }
950  if (numtkns > 2) {
951  feat = NStr::TruncateSpaces(tkns[2]);
952  }
953  if (numtkns > 3) {
954  qual = NStr::TruncateSpaces(tkns[3]);
955  }
956  if (numtkns > 4) {
957  val = NStr::TruncateSpaces(tkns[4]);
958  // trim enclosing double-quotes
959  if( val.length() >= 2 && val[0] == '"' && val[val.length()-1] == '"' ) {
960  val = val.substr(1, val.length() - 2);
961  }
962  }
963  if (numtkns > 5) {
964  stnd = NStr::TruncateSpaces(tkns[5]);
965  }
966 
967  bool has_start = false;
968  if (! start.empty ()) {
969  if (start [0] == '<') {
970  partial5 = true;
971  start.erase (0, 1);
972  }
973  len = start.length ();
974  if (len > 1 && start [len - 1] == '^') {
975  ispoint = true;
976  start [len - 1] = '\0';
977  }
978  startv = x_StringToLongNoThrow(start, feat, qual,
980  has_start = true;
981  }
982 
983  bool has_stop = false;
984  if (! stop.empty ()) {
985  if (stop [0] == '>') {
986  partial3 = true;
987  stop.erase (0, 1);
988  }
989  stopv = x_StringToLongNoThrow (stop, feat, qual,
991  has_stop = true;
992  }
993 
994  if ( startv <= 0 || stopv <= 0 ) {
995  startv = -1;
996  stopv = -1;
997  } else {
998  startv--;
999  stopv--;
1000  if (! stnd.empty ()) {
1001  if (stnd == "minus" || stnd == "-" || stnd == "complement") {
1002  if (start < stop) {
1003  swp = startv;
1004  startv = stopv;
1005  stopv = swp;
1006  }
1007  isminus = true;
1008  }
1009  }
1010  }
1011 
1012  if (startv >= 0) {
1013  startv += offset;
1014  }
1015  if (stopv >= 0) {
1016  stopv += offset;
1017  }
1018 
1019  if ((has_start && startv < 0) || (has_stop && stopv < 0)) {
1020  x_ProcessMsg(
1022  eDiag_Error,
1023  feat);
1024  }
1025 
1026  loc_info.start_pos = ( startv < 0 ? -1 : startv);
1027  loc_info.stop_pos = ( stopv < 0 ? -1 : stopv);
1028 
1029  loc_info.is_5p_partial = partial5;
1030  loc_info.is_3p_partial = partial3;
1031  loc_info.is_point = ispoint;
1032  loc_info.is_minus_strand = isminus;
1033  featP = feat;
1034  qualP = qual;
1035  valP = val;
1036 
1037  return true;
1038 }
1039 
1040 /*
1041 bool CFeatureTableReader_Imp::x_ParseFeatureTableLine (
1042  const CTempString& line,
1043  Int4* startP,
1044  Int4* stopP,
1045  bool* partial5P,
1046  bool* partial3P,
1047  bool* ispointP,
1048  bool* isminusP,
1049  string& featP,
1050  string& qualP,
1051  string& valP,
1052  Int4 offset
1053 )
1054 
1055 {
1056  SIZE_TYPE numtkns;
1057  bool isminus = false;
1058  bool ispoint = false;
1059  size_t len;
1060  bool partial5 = false;
1061  bool partial3 = false;
1062  Int4 startv = -1;
1063  Int4 stopv = -1;
1064  Int4 swp;
1065  string start, stop, feat, qual, val, stnd;
1066  vector<string> tkns;
1067 
1068 
1069  if (line.empty ()) return false;
1070 
1071  if (NStr::StartsWith (line, '[')) return false;
1072 
1073  tkns.clear ();
1074  x_TokenizeLenient(line, tkns);
1075  numtkns = tkns.size ();
1076 
1077  if (numtkns > 0) {
1078  start = NStr::TruncateSpaces(tkns[0]);
1079  }
1080  if (numtkns > 1) {
1081  stop = NStr::TruncateSpaces(tkns[1]);
1082  }
1083  if (numtkns > 2) {
1084  feat = NStr::TruncateSpaces(tkns[2]);
1085  }
1086  if (numtkns > 3) {
1087  qual = NStr::TruncateSpaces(tkns[3]);
1088  }
1089  if (numtkns > 4) {
1090  val = NStr::TruncateSpaces(tkns[4]);
1091  // trim enclosing double-quotes
1092  if( val.length() >= 2 && val[0] == '"' && val[val.length()-1] == '"' ) {
1093  val = val.substr(1, val.length() - 2);
1094  }
1095  }
1096  if (numtkns > 5) {
1097  stnd = NStr::TruncateSpaces(tkns[5]);
1098  }
1099 
1100  bool has_start = false;
1101  if (! start.empty ()) {
1102  if (start [0] == '<') {
1103  partial5 = true;
1104  start.erase (0, 1);
1105  }
1106  len = start.length ();
1107  if (len > 1 && start [len - 1] == '^') {
1108  ispoint = true;
1109  start [len - 1] = '\0';
1110  }
1111  startv = x_StringToLongNoThrow(start, feat, qual,
1112  ILineError::eProblem_BadFeatureInterval);
1113  has_start = true;
1114  }
1115 
1116  bool has_stop = false;
1117  if (! stop.empty ()) {
1118  if (stop [0] == '>') {
1119  partial3 = true;
1120  stop.erase (0, 1);
1121  }
1122  stopv = x_StringToLongNoThrow (stop, feat, qual,
1123  ILineError::eProblem_BadFeatureInterval);
1124  has_stop = true;
1125  }
1126 
1127  if ( startv <= 0 || stopv <= 0 ) {
1128  startv = -1;
1129  stopv = -1;
1130  } else {
1131  startv--;
1132  stopv--;
1133  if (! stnd.empty ()) {
1134  if (stnd == "minus" || stnd == "-" || stnd == "complement") {
1135  if (start < stop) {
1136  swp = startv;
1137  startv = stopv;
1138  stopv = swp;
1139  }
1140  isminus = true;
1141  }
1142  }
1143  }
1144 
1145  if (startv >= 0) {
1146  startv += offset;
1147  }
1148  if (stopv >= 0) {
1149  stopv += offset;
1150  }
1151 
1152  if ((has_start && startv < 0) || (has_stop && stopv < 0)) {
1153  x_ProcessMsg(
1154  ILineError::eProblem_FeatureBadStartAndOrStop,
1155  eDiag_Error,
1156  feat);
1157  }
1158 
1159  *startP = ( startv < 0 ? -1 : startv);
1160  *stopP = ( stopv < 0 ? -1 : stopv);
1161 
1162  *partial5P = partial5;
1163  *partial3P = partial3;
1164  *ispointP = ispoint;
1165  *isminusP = isminus;
1166  featP = feat;
1167  qualP = qual;
1168  valP = val;
1169 
1170  return true;
1171 }
1172 */
1173 
1175  const CTempString &line,
1176  vector<string> &out_tokens )
1177 {
1178  out_tokens.clear();
1179 
1180  // each token has spaces before it and a tab or end-of-line after it
1181  string::size_type startPosOfNextRoundOfTokenization = 0;
1182  while ( startPosOfNextRoundOfTokenization < line.size() ) {
1183  auto posAfterSpaces = line.find_first_not_of( " ", startPosOfNextRoundOfTokenization );
1184  if( posAfterSpaces == string::npos ) {
1185  return;
1186  }
1187 
1188  string::size_type posOfTab = line.find( '\t', posAfterSpaces );
1189  if( posOfTab == string::npos ) {
1190  posOfTab = line.length();
1191  }
1192 
1193  // The next token is between the spaces and the tab (or end of string)
1194  out_tokens.push_back(kEmptyStr);
1195  string &new_token = out_tokens.back();
1196  copy( line.begin() + posAfterSpaces, line.begin() + posOfTab, back_inserter(new_token) );
1197  NStr::TruncateSpacesInPlace( new_token );
1198 
1199  startPosOfNextRoundOfTokenization = ( posOfTab + 1 );
1200  }
1201 }
1202 
// Predicate object for find_if, since some compilers won't let me use
// isspace directly there.
class CIsSpace {
public:
    /// @param c  character to classify
    /// @return true if isspace() reports c as whitespace
    bool operator()( char c ) const {
        // isspace() is only defined for EOF and values representable as
        // unsigned char; a plain char may be negative, so convert first.
        return isspace( static_cast<unsigned char>(c) ) != 0;
    }
};
1208 
1210 public:
1211  bool operator()( char c ) { return ! isspace(c); }
1212 };
1213 
1215  const CTempString &line,
1216  vector<string> &out_tokens )
1217 {
1218  out_tokens.clear();
1219 
1220  if( line.empty() ) {
1221  return;
1222  }
1223 
1224  // if it starts with whitespace, it must be a qual line, else it's a feature line
1225  if( isspace(line[0]) ) {
1226  // In regex form, we're doing something like this:
1227  // \s+(\S+)(\s+(\S.*))?
1228  // Where the first is the qual, and the rest is the val
1229  auto start_of_qual = find_if( line.begin(), line.end(), CIsNotSpace() );
1230  if( start_of_qual == line.end() ) {
1231  return;
1232  }
1233  auto start_of_whitespace_after_qual = find_if( start_of_qual, line.end(), CIsSpace() );
1234  auto start_of_val = find_if( start_of_whitespace_after_qual, line.end(), CIsNotSpace() );
1235 
1236  // first 3 are empty
1237  out_tokens.push_back(kEmptyStr);
1238  out_tokens.push_back(kEmptyStr);
1239  out_tokens.push_back(kEmptyStr);
1240 
1241  // then qual
1242  out_tokens.push_back(kEmptyStr);
1243  string &qual = out_tokens.back();
1244  copy( start_of_qual, start_of_whitespace_after_qual, back_inserter(qual) );
1245 
1246  // then val
1247  if( start_of_val != line.end() ) {
1248  out_tokens.push_back(kEmptyStr);
1249  string &val = out_tokens.back();
1250  copy( start_of_val, line.end(), back_inserter(val) );
1252  }
1253 
1254  } else {
1255  // parse a feature line
1256 
1257  // Since we're being lenient, we consider it to be 3 ( or 6 ) parts separated by whitespace
1258  auto first_column_start = line.begin();
1259  auto first_whitespace = find_if( first_column_start, line.end(), CIsSpace() );
1260  auto second_column_start = find_if( first_whitespace, line.end(), CIsNotSpace() );
1261  auto second_whitespace = find_if( second_column_start, line.end(), CIsSpace() );
1262  auto third_column_start = find_if( second_whitespace, line.end(), CIsNotSpace() );
1263  auto third_whitespace = find_if( third_column_start, line.end(), CIsSpace() );
1264  // columns 4 and 5 are unused on feature lines
1265  auto sixth_column_start = find_if( third_whitespace, line.end(), CIsNotSpace() );
1266  auto sixth_whitespace = find_if( sixth_column_start, line.end(), CIsSpace() );
1267 
1268  out_tokens.push_back(kEmptyStr);
1269  string &first = out_tokens.back();
1270  copy( first_column_start, first_whitespace, back_inserter(first) );
1271 
1272  out_tokens.push_back(kEmptyStr);
1273  string &second = out_tokens.back();
1274  copy( second_column_start, second_whitespace, back_inserter(second) );
1275 
1276  out_tokens.push_back(kEmptyStr);
1277  string &third = out_tokens.back();
1278  copy( third_column_start, third_whitespace, back_inserter(third) );
1279 
1280  if( sixth_column_start != line.end() ) {
1281  // columns 4 and 5 are unused
1282  out_tokens.push_back(kEmptyStr);
1283  out_tokens.push_back(kEmptyStr);
1284 
1285  out_tokens.push_back(kEmptyStr);
1286  string &sixth = out_tokens.back();
1287  copy( sixth_column_start, sixth_whitespace, back_inserter(sixth) );
1288  }
1289  }
1290 }
1291 
1292 
1294  CSeqFeatData& sfdata,
1295  EQual qtype,
1296  const string& val
1297 )
1298 
1299 {
1300  CGene_ref& grp = sfdata.SetGene ();
1301  switch (qtype) {
1302  case eQual_gene:
1303  grp.SetLocus (val);
1304  return true;
1305  case eQual_allele:
1306  grp.SetAllele (val);
1307  return true;
1308  case eQual_gene_desc:
1309  grp.SetDesc (val);
1310  return true;
1311  case eQual_gene_syn:
1312  {
1313  CGene_ref::TSyn& syn = grp.SetSyn ();
1314  syn.push_back (val);
1315  return true;
1316  }
1317  case eQual_map:
1318  grp.SetMaploc (val);
1319  return true;
1320  case eQual_locus_tag:
1321  grp.SetLocus_tag (val);
1322  return true;
1323  case eQual_nomenclature:
1324  /* !!! need to implement !!! */
1325  return true;
1326  default:
1327  break;
1328  }
1329  return false;
1330 }
1331 
1332 
// Coding-region qualifier handler. The signature line is absent from this
// listing; the parameter list matches the
// x_AddQualifierToCdregion(sfp, sfdata, qtype, val) call made later in this
// file -- TODO confirm.
// Returns true when `qtype` was consumed here, false when the caller should
// try its generic handling.
// NOTE(review): several statements of this body (listing lines 1347, 1350,
// 1353, 1356, 1404 and 1416) are missing from this extract.
1334  CRef<CSeq_feat> sfp,
1335  CSeqFeatData& sfdata,
1336  EQual qtype, const string& val
1337 )
1338 
1339 {
1340  CCdregion& crp = sfdata.SetCdregion ();
1341  switch (qtype) {
1342  case eQual_codon_start:
1343  {
      // non-throwing conversion; see x_StringToLongNoThrow for the fallback value
1344  int frame = x_StringToLongNoThrow (val, kCdsFeatName, "codon_start");
      // the statement of each case 0..3 is not shown in this extract;
      // any other value is ignored but the qualifier still counts as handled
1345  switch (frame) {
1346  case 0:
1348  break;
1349  case 1:
1351  break;
1352  case 2:
1354  break;
1355  case 3:
1357  break;
1358  default:
1359  break;
1360  }
1361  return true;
1362  }
      // EC_number, function, product and prot_desc are stored on the
      // feature's protein xref (SetProtXref), not on the Cdregion itself
1363  case eQual_EC_number:
1364  {
1365  CProt_ref& prp = sfp->SetProtXref ();
1366  CProt_ref::TEc& ec = prp.SetEc ();
1367  ec.push_back (val);
1368  return true;
1369  }
1370  case eQual_function:
1371  {
1372  CProt_ref& prp = sfp->SetProtXref ();
1373  CProt_ref::TActivity& fun = prp.SetActivity ();
1374  fun.push_back (val);
1375  return true;
1376  }
1377  case eQual_product:
1378  {
1379  CProt_ref& prp = sfp->SetProtXref ();
1380  CProt_ref::TName& prod = prp.SetName ();
1381  prod.push_back (val);
1382  return true;
1383  }
1384  case eQual_prot_desc:
1385  {
1386  CProt_ref& prp = sfp->SetProtXref ();
1387  prp.SetDesc (val);
1388  return true;
1389  }
1390  case eQual_prot_note:
1391  return x_AddGBQualToFeature(sfp, "prot_note", val);
1392  case eQual_transl_except:
1393  // add as GBQual, let cleanup convert to code_break
1394  return x_AddGBQualToFeature(sfp, "transl_except", val);
1395  case eQual_translation:
1396  // we should accept, but ignore this qual on CDSs.
1397  // so, do nothing but return success
1398  return true;
1399  case eQual_transl_table:
1400  // set genetic code directly, or add qualifier and let cleanup convert?
1401  try {
1402  int num = NStr::StringToLong(val);
1403  CGen_code_table::GetTransTable(num); // throws if bad num
      // `code` is declared on a line missing from this extract (listing line 1404)
1405  code->SetId(num);
1406  crp.SetCode().Set().push_back(code);
1407  return true;
1408  } catch( CStringException ) {
1409  // if val is not a number, add qualifier directly and
1410  // let cleanup convert?
1411  return x_AddGBQualToFeature(sfp, "transl_table", val);
1412  } catch( ... ) {
1413  // invalid genome code table so don't even try to make
1414  // the transl_table qual
      // (the leading arguments of this call are on a line missing from this extract)
1415  x_ProcessMsg(
1417  kCdsFeatName, "transl_table", val);
1418  return true;
1419  }
      // not reached: every path through the try/catch above returns
1420  break;
1421 
1422  default:
1423  break;
1424  }
1425  return false;
1426 }
1427 
1428 
// Returns true when `str` contains nothing but quote marks (" or ') and
// characters that compare <= ' ' (blank/control); an empty string therefore
// also yields true. The signature line is absent from this listing; the
// parameter matches the x_StringIsJustQuotes(val) call made later in this
// file -- TODO confirm.
1430  const string& str
1431 )
1432 
1433 {
1434  ITERATE (string, it, str) {
1435  char ch = *it;
      // the first character above ' ' that is not a quote is real content
      // NOTE(review): where plain char is signed, bytes >= 0x80 are negative
      // and pass this test -- confirm that is intended
1436  if (ch > ' ' && ch != '"' && ch != '\'') return false;
1437  }
1438 
1439  return true;
1440 }
1441 
// File-local predicate: after skipping leading whitespace, does `line` begin
// with "order" (compared case-insensitively)? All-whitespace and empty lines
// yield false. The name/parameter line is absent from this listing.
1442 static bool
1444 {
1445  // basically, this is true if the line starts with "order" (whitespaces disregarded)
1446 
1447  const static char* kOrder = "ORDER";
1448 
1449  // find first non-whitespace character
      // NOTE(review): isspace() is handed a plain char; a negative value
      // (signed char, byte >= 0x80) is undefined behavior per <cctype> -- confirm inputs
1450  string::size_type pos = 0;
1451  for( ; pos < line.length() && isspace(line[pos]); ++pos) {
1452  // nothing to do here
1453  }
1454 
1455  // line is all whitespace
1456  if( pos >= line.length() ) {
1457  return false;
1458  }
1459 
1460  // check if starts with "order" after whitespace
      // compares strlen(kOrder) characters of line, starting at pos, against kOrder
1461  return ( 0 == NStr::CompareNocase( line, pos, strlen(kOrder), kOrder ) );
1462 }
1463 
1464 // Turns a "join" location into an "order" by inserting a NULL piece between
     // each pair of consecutive pieces of the input location.
1465 // Returns an unset CRef if the loc doesn't need nulls (e.g. if it's just an interval)
     // NOTE(review): the name/parameter line and the declaration of `result`
     // (listing lines 1467 and 1470) are missing from this extract.
1466 static CRef<CSeq_loc>
1468 {
1469  // create result we're returning
1471  CSeq_loc_mix::Tdata & mix_pieces = result->SetMix().Set();
1472 
1473  // keep this around for whenever we need a "null" piece
      // (the same CRef is pushed for every separator, so all separators
      // refer to one shared CSeq_loc object)
1474  CRef<CSeq_loc> loc_piece_null( new CSeq_loc );
1475  loc_piece_null->SetNull();
1476 
1477  // push pieces of source, with NULLs between
1478  CSeq_loc_CI loc_iter( loc );
1479  for( ; loc_iter; ++loc_iter ) {
      // a separator goes in front of every piece except the first
1480  if( ! mix_pieces.empty() ) {
1481  mix_pieces.push_back( loc_piece_null );
1482  }
      // each piece is copied (Assign) rather than shared with the input
1483  CRef<CSeq_loc> new_piece( new CSeq_loc );
1484  new_piece->Assign( loc_iter.GetEmbeddingSeq_loc() );
1485  mix_pieces.push_back( new_piece );
1486  }
1487 
1488  // Only wrap in "mix" if there was more than one piece
1489  if( mix_pieces.size() > 1 ) {
1490  return result;
1491  } else {
1492  return CRef<CSeq_loc>();
1493  }
1494 }
1495 
1496 
// Reduces a tRNA product string to the token used for amino-acid lookup:
// a leading "tRNA-" is dropped and the text is cut at the first of the
// characters -,;:()='_~ . The result is returned as a new string.
// The signature line is absent from this listing; the parameter matches the
// x_TrnaToAaString(val) call made later in this file -- TODO confirm.
// NOTE(review): the declaration of `value` (listing line 1501) and listing
// line 1510 are missing from this extract.
1498  const string& val
1499 )
1500 {
1502 
1503  if (NStr::StartsWith(value, "tRNA-")) {
      // keep only what follows the "tRNA-" prefix
1504  value.assign(value, strlen("tRNA-"), CTempString::npos);
1505  }
1506 
1507  CTempString::size_type pos = value.find_first_of("-,;:()=\'_~");
1508  if (pos != CTempString::npos) {
      // drop the delimiter and everything after it
1509  value.erase(pos);
1511  }
1512 
1513  return string(value);
1514 }
1515 
1516 
// Parses an anticodon qualifier value that starts with "(pos:" into
// `ext_trna`. All whitespace is stripped first. Inside the outer parentheses
// an optional "aa:<abbrev>" part (optionally followed by ",seq:...") names the
// amino acid; the text before it is parsed as the anticodon location.
// Returns true only when the anticodon location was parsed and stored.
// The name/parameter line is absent from this listing; the body uses `str`
// and `ext_trna`, matching the x_ParseTrnaExtString(ext_trna, val) call made
// later in this file -- TODO confirm.
// NOTE(review): the declarations of `aa` and `helper` (listing lines 1553 and
// 1562) are missing from this extract.
1517 bool
1519 {
1520  if (NStr::IsBlank (str)) return false;
1521 
      // work on a copy with every whitespace character removed
      // NOTE(review): isspace() receives a plain char; negative values are
      // undefined behavior per <cctype> -- confirm inputs
1522  string normalized_string = str;
1523  normalized_string.erase(
1524  remove_if(begin(normalized_string),
1525  end(normalized_string),
1526  [](char c) { return isspace(c);}),
1527  end(normalized_string));
1528 
1529  if ( NStr::StartsWith(normalized_string, "(pos:") ) {
1530  // find position of closing paren
1531  string::size_type pos_end = x_MatchingParenPos( normalized_string, 0 );
1532  if (pos_end != string::npos) {
      // text between "(pos:" (5 characters) and the matching ')'
1533  string pos_str = normalized_string.substr (5, pos_end - 5);
1534  string::size_type aa_start = NStr::FindNoCase(pos_str, "aa:");
1535  if (aa_start != string::npos) {
1536  auto seq_start = NStr::FindNoCase(pos_str, ",seq:");
      // a ",seq:" that begins before the amino-acid text is rejected
1537  if (seq_start != string::npos &&
1538  seq_start < aa_start+3) {
1539  return false;
1540  }
1541 
      // abbreviation runs from just after "aa:" to ",seq:" or to the end
1542  size_t aa_length = (seq_start == NPOS) ?
1543  pos_str.size() - (aa_start+3) :
1544  seq_start - (aa_start+3);
1545 
1546  string abbrev = pos_str.substr (aa_start + 3, aa_length);
1547  //TTrnaMap::const_iterator
1548  auto t_iter = sm_TrnaKeys.find (abbrev.c_str ());
1549  if (t_iter == sm_TrnaKeys.end ()) {
1550  // unable to parse
1551  return false;
1552  }
1554  aa->SetNcbieaa (t_iter->second);
1555  ext_trna.SetAa(*aa);
      // keep only the location text that precedes "aa:", minus a trailing comma
1556  pos_str = pos_str.substr (0, aa_start);
1557  NStr::TruncateSpacesInPlace (pos_str);
1558  if (NStr::EndsWith (pos_str, ",")) {
1559  pos_str = pos_str.substr (0, pos_str.length() - 1);
1560  }
1561  }
1563  CRef<CSeq_loc> anticodon = GetSeqLocFromString (pos_str, m_seq_id, & helper);
1564  if (! anticodon) {
      // undo the amino acid set above so a failed parse leaves no partial result
1565  ext_trna.ResetAa();
1566  return false;
1567  } else {
      // only locations whose strand is unknown, plus or minus are accepted
1568  switch( anticodon->GetStrand() ) {
1569  case eNa_strand_unknown:
1570  case eNa_strand_plus:
1571  case eNa_strand_minus:
1572  ext_trna.SetAnticodon(*anticodon);
1573  return true;
1574  default:
1575  ext_trna.ResetAa();
1576  return false;
1577  }
1578  }
1579  }
1580  }
1581 
1582  return false;
1583 }
1584 
1585 
// Returns the index of the ')' that closes the '(' at `open_paren_pos`,
// honoring nested parentheses, or NPOS when no closing parenthesis balances
// it. The name line is absent from this listing; the parameters match the
// x_MatchingParenPos(normalized_string, 0) call made elsewhere in this file
// -- TODO confirm.
1587  const string &str, SIZE_TYPE open_paren_pos )
1588 {
      // NOTE(review): the first assertion indexes str before the second one
      // checks that the index is in range
1589  _ASSERT( str[open_paren_pos] == '(' );
1590  _ASSERT( open_paren_pos < str.length() );
1591 
1592  // nesting level. start at 1 since we know there's an open paren
1593  int level = 1;
1594 
1595  SIZE_TYPE pos = open_paren_pos + 1;
1596  for( ; pos < str.length(); ++pos ) {
1597  switch( str[pos] ) {
1598  case '(':
1599  // nesting deeper
1600  ++level;
1601  break;
1602  case ')':
1603  // closed a level of nesting
1604  --level;
1605  if( 0 == level ) {
1606  // reached the top: we're closing the initial paren,
1607  // so we return our position
1608  return pos;
1609  }
1610  break;
1611  default:
1612  // ignore other characters.
1613  // maybe in the future we'll handle ignoring parens in quotes or
1614  // things like that.
1615  break;
1616  }
1617  }
      // ran off the end of the string without balancing the opening paren
1618  return NPOS;
1619 }
1620 
// Non-throwing string-to-integer conversion for qualifier values.
// Returns NStr::StringToLong(strToConvert) when that succeeds. Otherwise a
// warning is reported through x_ProcessMsg and either a fallback `result`
// (string starts with a digit; see note) or 0 is returned.
// strFeatureName / strQualifierName are passed through to the warning.
// eProblem, when not eProblem_Unset, replaces the problem code that would
// otherwise be reported.
// The name line is absent from this listing; the parameters are consistent
// with the x_StringToLongNoThrow(...) calls elsewhere in this file -- TODO confirm.
// NOTE(review): the computation of `result` (listing line 1634) and the
// initial values of `problem` (listing lines 1637 and 1651) are missing from
// this extract.
1622  CTempString strToConvert,
1623  CTempString strFeatureName,
1624  CTempString strQualifierName,
1625  ILineError::EProblem eProblem
1626 )
1627 {
1628  try {
1629  return NStr::StringToLong(strToConvert);
1630  } catch( ... ) {
1631  // See if we start with a number, but there's extra junk after it, try again
      // NOTE(review): isdigit() receives a plain char; negative values are
      // undefined behavior per <cctype> -- confirm inputs
1632  if( ! strToConvert.empty() && isdigit(strToConvert[0]) ) {
1633  try {
1635 
1636  ILineError::EProblem problem =
1638  if( eProblem != ILineError::eProblem_Unset ) {
1639  problem = eProblem;
1640  }
1641 
1642  x_ProcessMsg(
1643  problem,
1644  eDiag_Warning,
1645  strFeatureName, strQualifierName, strToConvert );
1646  return result;
1647  } catch( ... ) { } // fall-thru to usual handling
1648  }
1649 
1650  ILineError::EProblem problem =
1652  if( eProblem != ILineError::eProblem_Unset ) {
1653  problem = eProblem;
1654  }
1655 
1656  x_ProcessMsg(
1657  problem,
1658  eDiag_Warning,
1659  strFeatureName, strQualifierName, strToConvert );
1660  // we have no idea, so just return zero
1661  return 0;
1662  }
1663 }
1664 
1665 
// RNA qualifier handler: dispatches first on the RNA type of the feature and
// then on `qtype`. Returns true when the qualifier was consumed here, false
// when the caller should fall back to its generic handling.
// The signature line is absent from this listing; the parameters match the
// x_AddQualifierToRna(sfp, qtype, val) call made later in this file -- TODO confirm.
// NOTE(review): listing lines 1676, 1726, 1751, 1763 and 1769 (case labels and
// leading x_ProcessMsg arguments) are missing from this extract.
1667  CRef<CSeq_feat> sfp,
1668  EQual qtype,
1669  const string& val
1670 )
1671 {
1672  CSeqFeatData& sfdata = sfp->SetData();
1673  CRNA_ref& rrp = sfdata.SetRna ();
1674  CRNA_ref::EType rnatyp = rrp.GetType ();
1675  switch (rnatyp) {
      // (one more case label precedes these two in the original; not shown here)
1677  case CRNA_ref::eType_mRNA:
1678  case CRNA_ref::eType_rRNA:
1679  switch (qtype) {
1680  case eQual_product:
1681  {
      // product becomes the ext name, unless the ext already holds tRNA data
1682  CRNA_ref::TExt& tex = rrp.SetExt ();
1683  CRNA_ref::C_Ext::E_Choice exttype = tex.Which ();
1684  if (exttype == CRNA_ref::C_Ext::e_TRNA) return false;
1685  tex.SetName (val);
1686  return true;
1687  }
1688  default:
1689  break;
1690  }
1691  break;
1692  case CRNA_ref::eType_ncRNA:
      // ncRNA: product and class are stored in the "gen" extension
1693  switch (qtype) {
1694  case eQual_product:
1695  rrp.SetExt().SetGen().SetProduct(val);
1696  return true;
1697  break;
1698  case eQual_ncRNA_class:
1699  rrp.SetExt().SetGen().SetClass(val);
1700  return true;
1701  break;
1702  default:
1703  break;
1704  }
1705  break;
1706  case CRNA_ref::eType_tmRNA:
      // tmRNA: product goes to the "gen" extension, tag_peptide to its qual list
1707  switch (qtype) {
1708  case eQual_product:
1709  rrp.SetExt().SetGen().SetProduct(val);
1710  return true;
1711  case eQual_tag_peptide:
1712  {
1713  CRef<CRNA_qual> q(new CRNA_qual());
1714  q->SetQual("tag_peptide");
1715  q->SetVal(val);
1716  rrp.SetExt().SetGen().SetQuals().Set().push_back(q);
1717  return true;
1718  }
1719  break;
1720  default:
1721  break;
1722  }
1723  break;
      // these RNA types have no type-specific qualifier handling here
1724  case CRNA_ref::eType_snRNA:
1725  case CRNA_ref::eType_scRNA:
1727  case CRNA_ref::eType_other:
1728  return false;
1729  case CRNA_ref::eType_tRNA:
1730  switch (qtype) {
1731  case eQual_product: {
      // a tRNA whose ext is already a plain name is left for the caller
1732  if (rrp.IsSetExt() && rrp.GetExt().Which() == CRNA_ref::C_Ext::e_Name)
1733  return false;
1734 
1735  const string& aa_string = x_TrnaToAaString(val);
1736  const auto aaval_it = sm_TrnaKeys.find(aa_string.c_str());
1737 
1738  if (aaval_it != sm_TrnaKeys.end()) {
1739  CRNA_ref::TExt& tex = rrp.SetExt ();
1740  CTrna_ext& trx = tex.SetTRNA();
1741  CTrna_ext::TAa& taa = trx.SetAa();
1742  taa.SetNcbieaa(aaval_it->second);
      // for these three names the original product text is additionally
      // kept as a GBQual
1743  if (aa_string == "fMet" ||
1744  aa_string == "iMet" ||
1745  aa_string == "Ile2") {
1746  x_AddGBQualToFeature(sfp, "product", val);
1747  }
1748  }
1749  else {
      // unknown amino-acid token: report it, but still count the qualifier as handled
1750  x_ProcessMsg(
1752  "tRNA", "product", val);
1753  }
1754  return true;
1755  }
1756  break;
1757  case eQual_anticodon:
1758  {
1759  CRNA_ref::TExt& tex = rrp.SetExt ();
1760  CRNA_ref::C_Ext::TTRNA & ext_trna = tex.SetTRNA();
      // a parse failure is reported, yet the qualifier still counts as handled
1761  if( ! x_ParseTrnaExtString(ext_trna, val) ) {
1762  x_ProcessMsg(
1764  "tRNA", "anticodon", val );
1765  }
1766  return true;
1767  }
1768  break;
      // (the case label for this block, listing line 1769, is not shown here)
1770  {
1771  //const auto codon_index = CGen_code_table::CodonToIndex(val);
1772  //if (codon_index >= 0) {
1773  CRNA_ref::TExt& tex = rrp.SetExt ();
1774  CRNA_ref::C_Ext::TTRNA & ext_trna = tex.SetTRNA();
1775  if (!x_AddCodons(val, ext_trna)) {
1776  return false;
1777  }
1778  //}
1779  return true;
1780  }
1781  break;
1782  default:
1783  break;
1784  }
1785  break;
1786  default:
1787  break;
1788  }
1789  return false;
1790 }
1791 
1792 
// Expands a three-character codon string into concrete codons and appends
// their indices to `trna_ext`. Each character is looked up in s_IUPACmap and
// every combination of the three expansions is converted with
// CGen_code_table::CodonToIndex. Returns false when `val` is not exactly
// three characters long or when anything in the expansion throws (for
// example s_IUPACmap.at() on a character it does not contain); true otherwise.
// The name line is absent from this listing; the parameters match the
// x_AddCodons(val, ext_trna) call made elsewhere in this file -- TODO confirm.
1794  const string& val,
1795  CTrna_ext& trna_ext
1796  ) const
1797 {
1798  if (val.size() != 3) {
1799  return false;
1800  }
1801 
      // a set removes duplicate indices and yields them in ascending order
1802  set<int> codons;
1803  try {
1804  for (char char1 : s_IUPACmap.at(val[0])) {
1805  for (char char2 : s_IUPACmap.at(val[1])) {
1806  for (char char3 : s_IUPACmap.at(val[2])) {
1807  const auto codon_index = CGen_code_table::CodonToIndex(char1, char2, char3);
1808  codons.insert(codon_index);
1809  }
1810  }
1811  }
1812 
1813  if (!codons.empty()) {
      // SetNcbieaa() is called without a value here
      // NOTE(review): presumably only to mark the aa choice -- confirm intent
1814  trna_ext.SetAa().SetNcbieaa();
1815  for (const auto codon_index : codons) {
1816  trna_ext.SetCodon().push_back(codon_index);
1817  }
1818  }
1819  return true;
1820  }
      // any exception from the expansion is swallowed and reported as failure
1821  catch(...) {}
1822 
1823  return false;
1824 }
1825 
1826 
// Import-feature qualifier handler. Two jobs are visible in this body:
//  1. for regulatory subtypes, validate and store regulatory_class as a GBQual;
//  2. for three further subtypes (their case labels are not shown in this
//     extract), recognize qualifiers that belong in a user object and pick
//     the user-object type name for them.
// Returns true when the qualifier was consumed here, false otherwise.
// The signature line is absent from this listing; the parameters match the
// x_AddQualifierToImp(sfp, sfdata, qtype, qual, val) call made later in this
// file -- TODO confirm.
// NOTE(review): listing lines 1849, 1874, 1895, 1908 and 1934 are missing from
// this extract.
1828  CRef<CSeq_feat> sfp,
1829  CSeqFeatData& sfdata,
1830  EQual qtype,
1831  const string& qual,
1832  const string& val
1833 )
1834 
1835 {
      // user-object type name; stays null unless one of the switches below sets it
1836  const char* str = nullptr;
1837 
1838  CSeqFeatData::ESubtype subtype = sfdata.GetSubtype ();
1839 
1840  // used if-statement because CSeqFeatData::IsRegulatory won't work in a
1841  // switch statement.
1842  if( (subtype == CSeqFeatData::eSubtype_regulatory) ||
1843  CSeqFeatData::IsRegulatory(subtype) )
1844  {
1845  if (qtype == eQual_regulatory_class) {
1846  if (val != "other") { // RW-374 "other" is a special case
1847 
      // (the initializer of allowed_values is on a line missing from this extract)
1848  const vector<string>& allowed_values =
1850  if (find(allowed_values.cbegin(), allowed_values.cend(), val)
1851  == allowed_values.cend()) {
1852  return false;
1853  }
1854 
1855 /*
1856  const CSeqFeatData::ESubtype regulatory_class_subtype =
1857  CSeqFeatData::GetRegulatoryClass(val);
1858  if( regulatory_class_subtype == CSeqFeatData::eSubtype_bad ) {
1859  // msg will be sent in caller x_AddQualifierToFeature
1860  return false;
1861  }
1862  */
1863  }
1864  // okay
1865  // (Note that at this time we don't validate
1866  // if the regulatory_class actually matches the
1867  // subtype)
1868  x_AddGBQualToFeature(sfp, qual, val);
1869  return true;
1870  }
1871  }
1872 
1873  switch (subtype) {
      // (case label missing from this extract)
1875  {
1876  switch (qtype) {
1877  case eQual_chrcnt:
1878  case eQual_ctgcnt:
1879  case eQual_loccnt:
1880  case eQual_snp_class:
1881  case eQual_snp_gtype:
1882  case eQual_snp_het:
1883  case eQual_snp_het_se:
1884  case eQual_snp_linkout:
1885  case eQual_snp_maxrate:
1886  case eQual_snp_valid:
1887  case eQual_weight:
1888  str = "dbSnpSynonymyData";
1889  break;
1890  default:
1891  break;
1892  }
1893  }
1894  break;
      // (case label missing from this extract)
1896  {
1897  switch (qtype) {
1898  case eQual_sts_aliases:
1899  case eQual_sts_dsegs:
1900  case eQual_weight:
1901  str = "stsUserObject";
1902  break;
1903  default:
1904  break;
1905  }
1906  }
1907  break;
      // (case label missing from this extract)
1909  {
1910  switch (qtype) {
1911  case eQual_bac_ends:
1912  case eQual_clone_id:
1913  case eQual_method:
1914  case eQual_sequence:
1915  case eQual_STS:
1916  case eQual_weight:
1917  str = "cloneUserObject";
1918  break;
1919  default:
1920  break;
1921  }
1922  }
1923  break;
1924  default:
1925  break;
1926  }
1927 
1928  if (str) {
1929  CSeq_feat::TExt& ext = sfp->SetExt ();
1930  CObject_id& obj = ext.SetType ();
      // make sure the ext type is in its string form when it is not already
      // a non-empty string
1931  if ((! obj.IsStr ()) || obj.GetStr ().empty ()) {
1932  obj.SetStr ();
1933  }
      // (the statement on listing line 1934 is missing from this extract)
1935  return true;
1936  }
1937 
1938  return false;
1939 }
1940 
1941 
// BioSource qualifier handler, org-ref flavor: stores `val` in the
// CBioSource / COrg_ref / COrgName field selected by `rtype`.
// `feat_name` is used only for warning messages.
// Returns true for every EOrgRef value handled by the switch, false otherwise.
// The name line is absent from this listing; the parameters match the
// x_AddQualifierToBioSrc(sfdata, feat_name, rtype, val) call made later in
// this file -- TODO confirm.
1943  CSeqFeatData& sfdata,
1944  const string &feat_name,
1945  EOrgRef rtype,
1946  const string& val
1947 )
1948 {
1949  CBioSource& bsp = sfdata.SetBiosrc ();
1950 
1951  switch (rtype) {
1952  case eOrgRef_organism:
1953  {
1954  CBioSource::TOrg& orp = bsp.SetOrg ();
1955  orp.SetTaxname (val);
1956  return true;
1957  }
1958  case eOrgRef_organelle:
1959  {
      // map the organelle name to a genome code; unknown names are reported
      // but the qualifier still counts as handled
1960  TGenomeMap::const_iterator g_iter = sm_GenomeKeys.find (val.c_str ());
1961  if (g_iter != sm_GenomeKeys.end ()) {
1962  CBioSource::EGenome gtype = g_iter->second;
1963  bsp.SetGenome (gtype);
1964  } else {
      // (the leading arguments of this call are on a line missing from this extract)
1965  x_ProcessMsg(
1967  feat_name, "organelle", val );
1968  }
1969  return true;
1970  }
1971  case eOrgRef_div:
1972  {
1973  CBioSource::TOrg& orp = bsp.SetOrg ();
1974  COrg_ref::TOrgname& onp = orp.SetOrgname ();
1975  onp.SetDiv (val);
1976  return true;
1977  }
1978  case eOrgRef_lineage:
1979  {
1980  CBioSource::TOrg& orp = bsp.SetOrg ();
1981  COrg_ref::TOrgname& onp = orp.SetOrgname ();
1982  onp.SetLineage (val);
1983  return true;
1984  }
1985  case eOrgRef_gcode:
1986  {
1987  CBioSource::TOrg& orp = bsp.SetOrg ();
1988  COrg_ref::TOrgname& onp = orp.SetOrgname ();
      // non-throwing conversion; see x_StringToLongNoThrow for the fallback value
1989  int code = x_StringToLongNoThrow (val, feat_name, "gcode");
1990  onp.SetGcode (code);
1991  return true;
1992  }
1993  case eOrgRef_mgcode:
1994  {
1995  CBioSource::TOrg& orp = bsp.SetOrg ();
1996  COrg_ref::TOrgname& onp = orp.SetOrgname ();
1997  int code = x_StringToLongNoThrow (val, feat_name, "mgcode");
1998  onp.SetMgcode (code);
1999  return true;
2000  }
2001  default:
2002  break;
2003  }
2004  return false;
2005 }
2006 
2007 
// BioSource qualifier handler, SubSource flavor: appends a new CSubSource
// with subtype `stype` and name `val` to the BioSource's subtype list.
// Always returns true.
// The name line is absent from this listing; the parameters match the
// x_AddQualifierToBioSrc(sfdata, stype, val) call made later in this file
// -- TODO confirm.
2009  CSeqFeatData& sfdata,
2010  CSubSource::ESubtype stype,
2011  const string& val
2012 )
2013 
2014 {
2015  CBioSource& bsp = sfdata.SetBiosrc ();
2016  CBioSource::TSubtype& slist = bsp.SetSubtype ();
2017  CRef<CSubSource> ssp (new CSubSource);
2018  ssp->SetSubtype (stype);
2019  ssp->SetName (val);
      // existing entries are kept; repeated qualifiers accumulate
2020  slist.push_back (ssp);
2021  return true;
2022 }
2023 
2024 
// BioSource qualifier handler, OrgMod flavor: appends a new COrgMod with
// subtype `mtype` and subname `val` to the modifier list of the BioSource's
// org name. Always returns true.
// The name line is absent from this listing; the parameters match the
// x_AddQualifierToBioSrc(sfdata, mtype, val) call made later in this file
// -- TODO confirm.
2026  CSeqFeatData& sfdata,
2027  COrgMod::ESubtype mtype,
2028  const string& val
2029 )
2030 
2031 {
2032  CBioSource& bsp = sfdata.SetBiosrc ();
2033  CBioSource::TOrg& orp = bsp.SetOrg ();
2034  COrg_ref::TOrgname& onp = orp.SetOrgname ();
2035  COrgName::TMod& mlist = onp.SetMod ();
2036  CRef<COrgMod> omp (new COrgMod);
2037  omp->SetSubtype (mtype);
2038  omp->SetSubname (val);
      // existing entries are kept; repeated qualifiers accumulate
2039  mlist.push_back (omp);
2040  return true;
2041 }
2042 
2043 
// Appends a generic GenBank qualifier (CGb_qual) named `qual` with value `val`
// to the feature's qualifier list. The qualifier name is replaced by the
// canonical spelling CSeqFeatData reports for it when one is available; a
// value consisting only of quotes/blanks is stored as an empty string.
// Returns false only when `qual` is empty, true otherwise.
// The name line is absent from this listing; the parameters match the
// x_AddGBQualToFeature(sfp, qual, val) calls elsewhere in this file -- TODO confirm.
2045  CRef<CSeq_feat> sfp,
2046  const string& qual,
2047  const string& val
2048 )
2049 
2050 {
2051  if (qual.empty ()) return false;
2052 
2053  // non-owning view of the name to store; starts as the caller's spelling
2054  CTempString normalized_qual = qual;
2055 
2056  // normalize qual if needed, especially regarding case, and
2057  // use as-is if no normalization applies
2058  auto qual_type = CSeqFeatData::GetQualifierType(qual);
2059  if( qual_type != CSeqFeatData::eQual_bad ) {
2060  // ask for the canonical spelling of the recognized qualifier type
2061  CTempString potential_normalized_qual = CSeqFeatData::GetQualifierAsString(qual_type);
2062  if( ! potential_normalized_qual.empty() ) {
2063  normalized_qual = potential_normalized_qual;
2064  }
2065  }
2066 
2067  auto& qlist = sfp->SetQual ();
2068  CRef<CGb_qual> gbq (new CGb_qual);
2069  gbq->SetQual() = normalized_qual;
      // a value made only of quotes/blanks carries no content: store it as empty
2070  if (x_StringIsJustQuotes (val)) {
2071  gbq->SetVal() = kEmptyStr;
2072  } else {
2073  gbq->SetVal() = val;
2074  }
2075  qlist.push_back (gbq);
2076 
2077  return true;
2078 }
2079 
2080 
// Post-processing pass over the CDS features recorded in `choiceToFeatMap`.
// For every CDS carrying a gene xref it looks up the already-known genes whose
// locus and/or locus-tag match that xref, checks the CDS location against
// each matching gene (reporting through x_ProcessMsg), and may create a new
// gene feature in `sap` when no gene matches. Newly created genes are added
// to the lookup tables so that later CDSs can find them.
// NOTE(review): the name line and every line that tests `flags` (listing
// lines 2081, 2220 and 2246), plus listing lines 2111, 2116, 2236, 2238, 2263,
// 2267 and 2272, are missing from this extract; the exact conditions under
// which the check and the gene creation run cannot be confirmed from here.
2082  CRef<CSeq_annot> sap,
2083  TChoiceToFeatMap & choiceToFeatMap,
2084  const TFlags flags)
2085 {
2086  // load cds_equal_range to hold the CDSs
2087  typedef TChoiceToFeatMap::iterator TChoiceCI;
2088  typedef pair<TChoiceCI, TChoiceCI> TChoiceEqualRange;
2089  TChoiceEqualRange cds_equal_range =
2090  choiceToFeatMap.equal_range(CSeqFeatData::e_Cdregion);
2091  if( cds_equal_range.first == cds_equal_range.second )
2092  {
2093  // nothing to do if there are no CDSs
2094  return;
2095  }
2096 
2097  // load mappings from locus or locus-tag to gene
2098  typedef multimap<string, SFeatAndLineNum> TStringToGeneAndLineMap;
2099  TStringToGeneAndLineMap locusToGeneAndLineMap;
2100  TStringToGeneAndLineMap locusTagToGeneAndLineMap;
2101  const TChoiceEqualRange gene_equal_range =
2102  choiceToFeatMap.equal_range(CSeqFeatData::e_Gene);
2103  for( TChoiceCI gene_choice_ci = gene_equal_range.first;
2104  gene_choice_ci != gene_equal_range.second;
2105  ++gene_choice_ci )
2106  {
2107  SFeatAndLineNum gene_feat_ref_and_line = gene_choice_ci->second;
2108  const CGene_ref & gene_ref = gene_feat_ref_and_line.m_pFeat->GetData().GetGene();
      // genes with an empty or unset locus / locus-tag are not indexed under that key
2109  if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(gene_ref, Locus) ) {
2110  locusToGeneAndLineMap.insert(
2112  gene_ref.GetLocus(), gene_feat_ref_and_line));
2113  }
2114  if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(gene_ref, Locus_tag) ) {
2115  locusTagToGeneAndLineMap.insert(
2117  gene_ref.GetLocus_tag(), gene_feat_ref_and_line));
2118  }
2119  }
2120 
2121  // for each CDS, check for gene conflicts or create genes,
2122  // depending on various flags
2123  for( TChoiceCI cds_choice_ci = cds_equal_range.first;
2124  cds_choice_ci != cds_equal_range.second ; ++cds_choice_ci)
2125  {
2126  TFeatConstRef cds_feat_ref = cds_choice_ci->second.m_pFeat;
2127  const TSeqPos cds_line_num = cds_choice_ci->second.m_uLineNum;
2128 
2129  const CSeq_loc & cds_loc = cds_feat_ref->GetLocation();
2130 
2131  const CGene_ref * pGeneXrefOnCDS = cds_feat_ref->GetGeneXref();
2132  if( ! pGeneXrefOnCDS ) {
2133  // no xref, so can't do anything for this CDS
2134  // (this is NOT an error)
2135  continue;
2136  }
2137 
2138  // get all the already-existing genes that
2139  // this CDS xrefs. It should be somewhat uncommon for there
2140  // to be more than one matching gene.
2141  set<SFeatAndLineNum> matchingGenes;
2142 
      // locus / locus_tag of the xref; empty string when the field is unset
2143  const string locus =
2144  pGeneXrefOnCDS->IsSetLocus() ?
2145  pGeneXrefOnCDS->GetLocus() :
2146  "";
2147 
2148  const string locus_tag =
2149  pGeneXrefOnCDS->IsSetLocus_tag() ?
2150  pGeneXrefOnCDS->GetLocus_tag() :
2151  "";
2152 
2153 
2154  {{
2155  // all the code in this scope is all just for setting up matchingGenes
2156 
2157  typedef TStringToGeneAndLineMap::iterator TStrToGeneCI;
2158  typedef pair<TStrToGeneCI, TStrToGeneCI> TStrToGeneEqualRange;
2159  set<SFeatAndLineNum> locusGeneMatches;
2160  // collect the genes whose locus matches (if any) into locusGeneMatches
2161  if( !NStr::IsBlank(locus) ) {
2162  TStrToGeneEqualRange locus_equal_range =
2163  locusToGeneAndLineMap.equal_range(locus);
2164  for( TStrToGeneCI locus_gene_ci = locus_equal_range.first;
2165  locus_gene_ci != locus_equal_range.second;
2166  ++locus_gene_ci )
2167  {
      // when the xref also names a locus-tag, skip genes that have a
      // different locus-tag set
2168  if (!NStr::IsBlank(locus_tag)) {
2169  auto gene_feat = locus_gene_ci->second.m_pFeat;
2170  if (gene_feat->GetData().GetGene().IsSetLocus_tag() &&
2171  gene_feat->GetData().GetGene().GetLocus_tag() != locus_tag) {
2172  continue;
2173  }
2174  }
2175  locusGeneMatches.insert(locus_gene_ci->second);
2176  }
2177  }
2178  // collect the genes whose locus-tag matches (if any) into locusTagGeneMatches
2179  set<SFeatAndLineNum> locusTagGeneMatches;
2180  if( !NStr::IsBlank(locus_tag) ) {
2181  TStrToGeneEqualRange locus_tag_equal_range =
2182  locusTagToGeneAndLineMap.equal_range(locus_tag);
2183  for( TStrToGeneCI locus_tag_gene_ci = locus_tag_equal_range.first;
2184  locus_tag_gene_ci != locus_tag_equal_range.second;
2185  ++locus_tag_gene_ci )
2186  {
      // when the xref also names a locus, skip genes that have a
      // different locus set
2187  if (!NStr::IsBlank(locus)) {
2188  auto gene_feat = locus_tag_gene_ci->second.m_pFeat;
2189  if (gene_feat->GetData().GetGene().IsSetLocus() &&
2190  gene_feat->GetData().GetGene().GetLocus() != locus) {
2191  continue;
2192  }
2193  }
2194  locusTagGeneMatches.insert(locus_tag_gene_ci->second);
2195  }
2196  }
2197  // analyze locusGeneMatches and locusTagGeneMatches to find matchingGenes.
2198  if( locusGeneMatches.empty() ) {
2199  // swap is faster than assignment
2200  matchingGenes.swap(locusTagGeneMatches);
2201  } else if( locusTagGeneMatches.empty() ) {
2202  // swap is faster than assignment
2203  matchingGenes.swap(locusGeneMatches);
2204  } else {
2205  // get only the genes that match both (that is, the intersection)
2206  set_intersection(
2207  locusGeneMatches.begin(), locusGeneMatches.end(),
2208  locusTagGeneMatches.begin(), locusTagGeneMatches.end(),
2209  inserter(matchingGenes, matchingGenes.begin()));
2210  }
2211  }}
2212 
2213  // if requested, check that the genes really do contain the CDS
2214  // (also check if we're trying to create a gene that already exists)
2215 
2216  ITERATE(set<SFeatAndLineNum>, gene_feat_and_line_ci, matchingGenes) {
2217  const CSeq_loc & gene_loc = gene_feat_and_line_ci->m_pFeat->GetLocation();
2218  const TSeqPos gene_line_num = gene_feat_and_line_ci->m_uLineNum;
2219 
      // (the condition opening this check, listing line 2220, is not shown here)
2221 
2222  // CDS's loc minus gene's loc should be an empty location
2223  // because the CDS should be entirely on the gene
2224  CRef<CSeq_loc> pCdsMinusGeneLoc = cds_loc.Subtract(
2225  gene_loc, CSeq_loc::fSortAndMerge_All, nullptr, nullptr);
2226  if( pCdsMinusGeneLoc &&
2227  ! pCdsMinusGeneLoc->IsNull() &&
2228  ! pCdsMinusGeneLoc->IsEmpty() )
2229  {
      // a gene line number of 0 is not reported as a related line
2230  ILineError::TVecOfLines gene_lines;
2231  if( gene_line_num > 0 ) {
2232  gene_lines.push_back(gene_line_num);
2233  }
2234  x_ProcessMsg(
2235  cds_line_num,
2237  kCdsFeatName,
2239  gene_lines );
2240  }
2241  }
2242  }
2243 
2244  // if requested, create genes for the CDS if there isn't already one
2245  // (it is NOT an error if the gene is already created)
      // (the first half of this condition, listing line 2246, is not shown here)
2247  matchingGenes.empty() )
2248  {
2249  // create the gene
      // the new gene copies the xref's gene data, the CDS's location and
      // the CDS's partial flag
2250  CRef<CSeq_feat> pNewGene( new CSeq_feat );
2251  pNewGene->SetData().SetGene().Assign( *pGeneXrefOnCDS );
2252  if( FIELD_EQUALS(*cds_feat_ref, Partial, true) ) pNewGene->SetPartial(true);
2253  pNewGene->SetLocation().Assign( cds_feat_ref->GetLocation() );
2254 
2255  // add gene to the annot
2256  _ASSERT( sap->IsFtable() );
2257  TFtable & the_ftable = sap->SetData().SetFtable();
2258  the_ftable.push_back(pNewGene);
2259 
2260  // add it to our local information for later CDSs
      // (created genes are recorded with line number 0)
2261  SFeatAndLineNum gene_feat_and_line(pNewGene, 0);
2262  choiceToFeatMap.insert(
2264  pNewGene->GetData().Which(), gene_feat_and_line ) );
2265  if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(*pGeneXrefOnCDS, Locus) ) {
2266  locusToGeneAndLineMap.insert(
2268  pGeneXrefOnCDS->GetLocus(), gene_feat_and_line));
2269  }
2270  if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(*pGeneXrefOnCDS, Locus_tag) ) {
2271  locusTagToGeneAndLineMap.insert(
2273  pGeneXrefOnCDS->GetLocus_tag(), gene_feat_and_line));
2274  }
2275  }
2276  } // end of iteration through the CDS's
2277 }
2278 
// Qualifier names whose canonical spelling contains upper-case letters.
// s_FixQualCapitalization() restores these spellings after lower-casing.
2279 static const string s_QualsWithCaps[] = {
2280  "EC_number",
2281  "PCR_conditions",
2282  "PubMed",
2283  "STS",
2284  "ncRNA_class"
2285 };
2286 
     // Number of entries in s_QualsWithCaps, computed from the array size.
2287 static const int s_NumQualsWithCaps = sizeof (s_QualsWithCaps) / sizeof (string);
2288 
// Returns `qual` in canonical capitalization: lower-cased, except that a name
// matching an entry of s_QualsWithCaps (ignoring case) is returned with that
// entry's exact spelling.
2289 static string s_FixQualCapitalization (const string& qual)
2290 {
2291  string lqual = qual;
2292  lqual = NStr::ToLower(lqual);
      // restore the mixed-case spelling for the few qualifiers that have one
2293  for (int j = 0; j < s_NumQualsWithCaps; j++) {
2294  if (NStr::EqualNocase(lqual, s_QualsWithCaps[j])) {
2295  lqual = s_QualsWithCaps[j];
2296  break;
2297  }
2298  }
2299  return lqual;
2300 }
2301 
2302 
// Appends `note` to the feature's comment; when a comment is already
// available the two are joined with "; ".
// Returns false only when `sfp` is null; a blank note is a successful no-op.
// The name line is absent from this listing; the parameters match the
// x_AddNoteToFeature(sfp, val) calls elsewhere in this file -- TODO confirm.
2304  CRef<CSeq_feat> sfp,
2305  const string& note)
2306 {
2307  if (sfp.IsNull()) {
2308  return false;
2309  }
2310 
2311  if (NStr::IsBlank(note)) { // Nothing to do
2312  return true;
2313  }
2314 
      // build the combined text first, then store it in a single SetComment call
2315  string comment = (sfp->CanGetComment()) ?
2316  sfp->GetComment() + "; " + note :
2317  note;
2318  sfp->SetComment(comment);
2319  return true;
2320 }
2321 
2322 
// Fallback for a qualifier that cannot be stored as such on this feature: the
// value is added to the feature's comment and, unless the qualifier is
// literally "note", a message saying so is reported through x_ProcessMsg.
// Returns false when the note could not be added, true otherwise.
// The name line is absent from this listing; the parameters match the
// x_AddNoteToFeature(sfp, feat_name, qual, val) calls later in this file
// -- TODO confirm.
2324  CRef<CSeq_feat> sfp,
2325  const string& feat_name,
2326  const string& qual,
2327  const string& val) {
2328 
2329  if (!x_AddNoteToFeature(sfp, val)) {
2330  return false;
2331  }
2332  // The value was stored as a note; tell the user unless it was a note to begin with
2333  if (qual != "note") {
2334  string error_message =
2335  qual + " is not a valid qualifier for this feature. Converting to note.";
      // (the leading arguments of this call are on a line missing from this extract)
2336  x_ProcessMsg(
2338  feat_name, qual, kEmptyStr, error_message);
2339  }
2340  return true;
2341 }
2342 
2344  CRef<CSeq_feat> sfp,
2345  const string &feat_name,
2346  const string& qual,
2347  const string& val,
2348  const TFlags flags
2349 )
2350 
2351 {
2352  CSeqFeatData& sfdata = sfp->SetData ();
2353  CSeqFeatData::E_Choice featType = sfdata.Which ();
2354 
2355  const CSeqFeatData::EQualifier qual_type =
2358  if( CSeqFeatData::IsDiscouragedQual(qual_type) ) {
2359  x_ProcessMsg(
2361  eDiag_Warning, feat_name, qual);
2362  }
2363  }
2364 
2365  if (featType == CSeqFeatData::e_Biosrc) {
2366 
2367  TOrgRefMap::const_iterator o_iter = sm_OrgRefKeys.find (qual.c_str ());
2368  if (o_iter != sm_OrgRefKeys.end ()) {
2369  EOrgRef rtype = o_iter->second;
2370  if (x_AddQualifierToBioSrc (sfdata, feat_name, rtype, val)) return true;
2371  } else {
2372 
2373  TSubSrcMap::const_iterator s_iter = sm_SubSrcKeys.find (qual.c_str ());
2374  if (s_iter != sm_SubSrcKeys.end ()) {
2375 
2376  CSubSource::ESubtype stype = s_iter->second;
2377  if (x_AddQualifierToBioSrc (sfdata, stype, val)) return true;
2378 
2379  } else {
2380 
2381  TOrgModMap::const_iterator m_iter = sm_OrgModKeys.find (qual.c_str ());
2382  if (m_iter != sm_OrgModKeys.end ()) {
2383 
2384  COrgMod::ESubtype mtype = m_iter->second;
2385  if (x_AddQualifierToBioSrc (sfdata, mtype, val)) return true;
2386  }
2387  }
2388  }
2389  return false;
2390  }
2391 
2392 
2393  // else type != CSeqFeatData::e_Biosrc
2394  string lqual = s_FixQualCapitalization(qual);
2395  TQualMap::const_iterator q_iter = sm_QualKeys.find (lqual.c_str ());
2396  if (q_iter != sm_QualKeys.end ()) {
2397  EQual qtype = q_iter->second;
2398  switch (featType) {
2399  case CSeqFeatData::e_Gene:
2400  if (x_AddQualifierToGene (sfdata, qtype, val)) return true;
2401  break;
2403  if (x_AddQualifierToCdregion (sfp, sfdata, qtype, val)) return true;
2404  break;
2405  case CSeqFeatData::e_Rna:
2406  if (x_AddQualifierToRna (sfp, qtype, val)) return true;
2407  break;
2408  case CSeqFeatData::e_Imp:
2409  if (x_AddQualifierToImp (sfp, sfdata, qtype, qual, val)) return true;
2410  break;
2412  if (qtype == eQual_region_name) {
2413  sfdata.SetRegion (val);
2414  return true;
2415  }
2416  break;
2417  case CSeqFeatData::e_Bond:
2418  if (qtype == eQual_bond_type) {
2420  if (CSeqFeatData::GetBondList()->IsBondName(val.c_str(), btyp)) {
2421  sfdata.SetBond (btyp);
2422  return true;
2423  }
2424  }
2425  break;
2426  case CSeqFeatData::e_Site:
2427  if (qtype == eQual_site_type) {
2429  if (CSeqFeatData::GetSiteList()->IsSiteName( val.c_str(), styp)) {
2430  sfdata.SetSite (styp);
2431  return true;
2432  }
2433  }
2434  break;
2435  case CSeqFeatData::e_Pub:
2436  if( qtype == eQual_PubMed ) {
2437  CRef<CPub> new_pub( new CPub );
2438  new_pub->SetPmid( CPubMedId( ENTREZ_ID_FROM(long, x_StringToLongNoThrow(val, feat_name, qual)) ) );
2439  sfdata.SetPub().SetPub().Set().push_back( new_pub );
2440  return true;
2441  }
2442  break;
2443  case CSeqFeatData::e_Prot:
2444  switch( qtype ) {
2445  case eQual_product:
2446  sfdata.SetProt().SetName().push_back( val );
2447  return true;
2448  case eQual_function:
2449  sfdata.SetProt().SetActivity().push_back( val );
2450  return true;
2451  case eQual_EC_number:
2452  sfdata.SetProt().SetEc().push_back( val );
2453  return true;
2454  default:
2455  break;
2456  }
2457  break;
2458  default:
2459  break;
2460  }
2461 
2462  switch (qtype) {
2463  case eQual_pseudo:
2464  sfp->SetPseudo (true);
2465  return true;
2466  case eQual_partial:
2467  sfp->SetPartial (true);
2468  return true;
2469  case eQual_exception:
2470  sfp->SetExcept (true);
2471  sfp->SetExcept_text (val);
2472  return true;
2474  sfp->SetExcept (true);
2475  sfp->SetExcept_text (qual);
2476  return true;
2477  case eQual_trans_splicing:
2478  sfp->SetExcept (true);
2479  sfp->SetExcept_text (qual);
2480  return true;
2481  case eQual_evidence:
2482  if (val == "experimental") {
2484  } else if (val == "not_experimental" || val == "non_experimental" ||
2485  val == "not-experimental" || val == "non-experimental") {
2487  }
2488  return true;
2489  case eQual_note:
2490  return x_AddNoteToFeature(sfp, val);
2491  case eQual_inference:
2492  {
2493  string prefix, remainder;
2495  if (!NStr::IsBlank(prefix)) {
2496  x_AddGBQualToFeature(sfp, qual, val);
2497  }
2498  else {
2499  x_ProcessMsg(
2501  feat_name, qual, val);
2502  }
2503  return true;
2504  }
2505  case eQual_replace:
2506  {
2507  string val_copy = val;
2508  NStr::ToLower( val_copy );
2509  x_AddGBQualToFeature (sfp, qual, val_copy );
2510  return true;
2511  }
2512  case eQual_allele:
2513  case eQual_bound_moiety:
2514  case eQual_clone:
2515  case eQual_compare:
2516  case eQual_cons_splice:
2517  case eQual_direction:
2518  case eQual_EC_number:
2520  case eQual_experiment:
2521  case eQual_frequency:
2522  case eQual_function:
2523  case eQual_gap_type:
2524  case eQual_insertion_seq:
2525  case eQual_label:
2527  case eQual_map:
2528  case eQual_ncRNA_class:
2529  case eQual_number:
2530  case eQual_old_locus_tag:
2531  case eQual_operon:
2532  case eQual_organism:
2533  case eQual_PCR_conditions:
2534  case eQual_phenotype:
2535  case eQual_product:
2536  case eQual_pseudogene:
2537  case eQual_satellite:
2538  case eQual_rpt_family:
2539  case eQual_rpt_type:
2540  case eQual_rpt_unit:
2541  case eQual_rpt_unit_range:
2542  case eQual_rpt_unit_seq:
2543  case eQual_standard_name:
2544  case eQual_tag_peptide:
2545  case eQual_transposon:
2546  case eQual_usedin:
2547  case eQual_cyt_map:
2548  case eQual_gen_map:
2549  case eQual_rad_map:
2551  {
2552  x_AddGBQualToFeature (sfp, qual, val);
2553  return true;
2554  }
2555  case eQual_gene:
2556  {
2557  if (CSeqFeatData::CanHaveGene(sfdata.GetSubtype())) {
2558  CGene_ref& grp = sfp->SetGeneXref ();
2559  if (val != "-") {
2560  grp.SetLocus (val);
2561  }
2562  return true;
2563  }
2564  // else:
2565  return x_AddNoteToFeature(sfp, feat_name, qual, val);
2566  }
2567  case eQual_gene_desc:
2568  {
2569  if (CSeqFeatData::CanHaveGene(sfdata.GetSubtype())) {
2570  CGene_ref& grp = sfp->SetGeneXref ();
2571  grp.SetDesc (val);
2572  return true;
2573  }
2574  // else:
2575  return x_AddNoteToFeature(sfp, feat_name, qual, val);
2576  }
2577  case eQual_gene_syn:
2578  {
2579  if (CSeqFeatData::CanHaveGene(sfdata.GetSubtype())) {
2580  CGene_ref& grp = sfp->SetGeneXref ();
2581  CGene_ref::TSyn& syn = grp.SetSyn ();
2582  syn.push_back (val);
2583  return true;
2584  }
2585  // else:
2586  return x_AddNoteToFeature(sfp, feat_name, qual, val);
2587  }
2588  case eQual_locus_tag:
2589  {
2590  if (CSeqFeatData::CanHaveGene(sfdata.GetSubtype())) {
2591  CGene_ref& grp = sfp->SetGeneXref ();
2592  grp.SetLocus_tag (val);
2593  return true;
2594  }
2595  // else:
2596  return x_AddNoteToFeature(sfp, feat_name, qual, val);
2597  }
2598  case eQual_db_xref:
2599  {
2600  CTempString db, tag;
2601  if (NStr::SplitInTwo (val, ":", db, tag)) {
2602  CSeq_feat::TDbxref& dblist = sfp->SetDbxref ();
2603  CRef<CDbtag> dbt (new CDbtag);
2604  dbt->SetDb (db);
2605  CRef<CObject_id> oid (new CObject_id);
2606  static const char* digits = "0123456789";
2607  if (tag.find_first_not_of(digits) == string::npos && !NStr::IsBlank(tag))
2608  oid->SetId(NStr::StringToLong(tag));
2609  else
2610  oid->SetStr(tag);
2611  dbt->SetTag (*oid);
2612  dblist.push_back (dbt);
2613  return true;
2614  }
2615  return true;
2616  }
2617  case eQual_nomenclature:
2618  {
2619  /* !!! need to implement !!! */
2620  return true;
2621  }
2622  case eQual_go_component:
2623  case eQual_go_function:
2624  case eQual_go_process:
2625  if (featType == CSeqFeatData::e_Gene ||
2626  featType == CSeqFeatData::e_Cdregion ||
2627  featType == CSeqFeatData::e_Rna) {
2628  try {
2629  CReadUtil::AddGeneOntologyTerm(*sfp, qual, val);
2630  }
2631  catch( ILineError& err) {
2632  x_ProcessMsg(
2633  err.Problem(),
2634  err.Severity(),
2635  feat_name, qual, val,
2636  err.ErrorMessage());
2637  }
2638  //rw-621: throw out the faulty qualifier but retain the rest of the feature.
2639  return true;
2640  }
2641  return false;
2642  case eQual_transcript_id:
2643  {
2644  if (featType == CSeqFeatData::e_Rna &&
2645  sfdata.GetRna().GetType() == CRNA_ref::eType_mRNA) {
2646  CBioseq::TId ids;
2647  try {
2648  CSeq_id::ParseIDs(ids, val,
2651  }
2652  catch (CSeqIdException&)
2653  {
2654  x_ProcessMsg(
2656  feat_name, qual, val,
2657  "Invalid transcript_id : " + val);
2658  return true;
2659  }
2660 
2661  for (const auto& id : ids) {
2662  auto id_string = id->GetSeqIdString(true);
2663  auto res = m_ProcessedTranscriptIds.insert(id_string);
2664  if (res.second == false) { // Insertion failed because Seq-id already encountered
2665  x_ProcessMsg(
2667  feat_name, qual, val,
2668  "Transcript ID " + id_string + " appears on multiple mRNA features"
2669  );
2670  }
2671  }
2672  }
2673  x_AddGBQualToFeature(sfp, qual, val);
2674  return true;
2675  }
2676  case eQual_protein_id:
2677  // see SQD-1535 and SQD-3496
2678  if (featType == CSeqFeatData::e_Cdregion ||
2679  (featType == CSeqFeatData::e_Rna &&
2680  sfdata.GetRna().GetType() == CRNA_ref::eType_mRNA) ||
2681  (featType == CSeqFeatData::e_Prot &&
2682  sfdata.GetProt().IsSetProcessed() &&
2684  {
2685  CBioseq::TId ids;
2686  try {
2687  CSeq_id::ParseIDs(ids, val,
2690  }
2691  catch (CSeqIdException&)
2692  {
2693  x_ProcessMsg(
2695  feat_name, qual, val,
2696  "Invalid protein_id : " + val);
2697  return true;
2698  }
2699 
2700  if (featType == CSeqFeatData::e_Cdregion) {
2701  for (const auto& id : ids) {
2702  auto id_string = id->GetSeqIdString(true);
2703  auto res = m_ProcessedProteinIds.insert(id_string);
2704  if (res.second == false) { // Insertion failed because Seq-id already encountered
2705  x_ProcessMsg(
2707  feat_name, qual, val,
2708  "Protein ID " + id_string + " appears on multiple CDS features"
2709  );
2710  }
2711  }
2712  }
2713 
2714  if (featType != CSeqFeatData::e_Rna) { // mRNA only has a protein_id qualifier
2715  auto pBestId = GetBestId(ids);
2716  if (pBestId) {
2717  sfp->SetProduct().SetWhole(*pBestId);
2718  }
2719  }
2720  }
2721 
2722  if (featType != CSeqFeatData::e_Prot) { // Mat-peptide has an instantiated product, but no qualifier
2723  x_AddGBQualToFeature(sfp, qual, val);
2724  }
2725  return true;
2727  // This should've been handled up in x_AddQualifierToImp
2728  // so it's always a bad value to be here
2729  x_ProcessMsg(
2731  feat_name, qual, val );
2732  return true;
2733  default:
2734  break;
2735  }
2736  }
2737  return false;
2738 }
2739 
2741 {
2742  // This function is testing for a match against the following regular
2743  // expression, but we avoid actual regexps for max speed:
2744  // "^(===================================================================| INFO:| WARNING:| ERROR:).*"
2745 
2746  // (that magic number is the size of the smallest possible match)
2747  if( line.length() < 6 ) {
2748  return false;
2749  }
2750 
2751  if( line[0] == '=' ) {
2752  static const CTempString kAllEqualsMatch =
2753  "===================================================================";
2754  if( NStr::StartsWith(line, kAllEqualsMatch) ) {
2755  return true;
2756  }
2757  } else if( line[0] == ' ') {
2758  switch(line[1]) {
2759  case 'I':
2760  {
2761  static const CTempString kInfo = " INFO:";
2762  if( NStr::StartsWith(line, kInfo) ) {
2763  return true;
2764  }
2765  }
2766  break;
2767  case 'W':
2768  {
2769  static const CTempString kWarning = " WARNING:";
2770  if( NStr::StartsWith(line, kWarning) ) {
2771  return true;
2772  }
2773  }
2774  break;
2775  case 'E':
2776  {
2777  static const CTempString kError = " ERROR:";
2778  if( NStr::StartsWith(line, kError) ) {
2779  return true;
2780  }
2781  }
2782  break;
2783  default:
2784  // no match
2785  break;
2786  }
2787  }
2788 
2789  // no match
2790  return false;
2791 }
2792 
2794  CTempString strFeatureName,
2795  CRef<CSeq_feat>& sfp,
2796  const SFeatLocInfo& loc_info
2797 )
2798 
2799 {
2800 
2801  auto start = loc_info.start_pos;
2802  auto stop = loc_info.stop_pos;
2803 
2804  const Int4 orig_start = start;
2806 
2807  if (start > stop) {
2808  swap(start, stop);
2809  strand = eNa_strand_minus;
2810  }
2811  if (loc_info.is_minus_strand) {
2812  strand = eNa_strand_minus;
2813  }
2814 
2815  // construct loc, which will be added to the mix
2816  CSeq_loc_mix::Tdata & mix_set = sfp->SetLocation().SetMix();
2817  CRef<CSeq_loc> loc(new CSeq_loc);
2818  if (loc_info.is_point || start == stop ) {
2819  // a point of some kind
2820  if (mix_set.empty())
2821  m_need_check_strand = true;
2822  else
2823  x_GetPointStrand(*sfp, strand);
2824 
2825  // note usage of orig_start instead of start
2826  // because we want the first part of the point
2827  // specified in the file, not the smallest because SetRightOf
2828  // works differently for plus vs. minus strand
2829  CRef<CSeq_point> pPoint(
2830  new CSeq_point(*m_seq_id, orig_start, strand) );
2831  if( loc_info.is_point ) {
2832  // between two bases
2833  pPoint->SetRightOf (true);
2834  // warning if stop is not start plus one
2835  if( stop != (start+1) ) {
2836  x_ProcessMsg(
2838  strFeatureName );
2839  }
2840  } else {
2841  // just a point. do nothing
2842  }
2843 
2844  if (loc_info.is_5p_partial) {
2845  pPoint->SetPartialStart (true, eExtreme_Biological);
2846  }
2847  if (loc_info.is_3p_partial) {
2848  pPoint->SetPartialStop (true, eExtreme_Biological);
2849  }
2850 
2851  loc->SetPnt( *pPoint );
2852  } else {
2853  // interval
2854  CRef<CSeq_interval> pIval( new CSeq_interval(*m_seq_id, start, stop, strand) );
2855  if (loc_info.is_5p_partial) {
2856  pIval->SetPartialStart (true, eExtreme_Biological);
2857  }
2858  if (loc_info.is_3p_partial) {
2859  pIval->SetPartialStop (true, eExtreme_Biological);
2860  }
2861  loc->SetInt(*pIval);
2862  if (m_need_check_strand)
2863  {
2864  x_UpdatePointStrand(*sfp, strand);
2865  m_need_check_strand = false;
2866  }
2867  }
2868 
2869  // check for internal partials
2870  if( ! mix_set.empty() ) {
2871  const CSeq_loc & last_loc = *mix_set.back();
2872  if( last_loc.IsPartialStop(eExtreme_Biological) ||
2874  {
2875  // internal partials
2877  eDiag_Warning, strFeatureName );
2878  }
2879  }
2880 
2881  mix_set.push_back(loc);
2882 
2883 
2884  if (loc_info.is_5p_partial || loc_info.is_3p_partial) {
2885  sfp->SetPartial (true);
2886  }
2887 
2888  return true;
2889 }
2890 
2891 
2892 
2894  CRef<CSeq_feat> sfp,
2895  const string& feat,
2896  const TFlags flags,
2897  ITableFilter *filter
2898 )
2899 
2900 {
2901  if (feat.empty ()) return false;
2902 
2903  // check filter, if any
2904  if (filter) {
2905  ITableFilter::EAction action = filter->GetFeatAction(feat);
2906  if( action != ITableFilter::eAction_Okay ) {
2907  x_ProcessMsg(
2909  eDiag_Warning, feat );
2910  if( action == ITableFilter::eAction_Disallowed ) {
2911  return false;
2912  }
2913  }
2914  }
2915 
2917  if (sbtyp != CSeqFeatData::eSubtype_bad) {
2918 
2919  // populate *sfp here...
2920 
2922  sfp->SetData ().Select (typ);
2923  CSeqFeatData& sfdata = sfp->SetData ();
2924 
2925  if (typ == CSeqFeatData::e_Rna) {
2926  CRNA_ref& rrp = sfdata.SetRna ();
2928  switch (sbtyp) {
2930  rnatyp = CRNA_ref::eType_premsg;
2931  break;
2933  rnatyp = CRNA_ref::eType_mRNA;
2934  break;
2936  rnatyp = CRNA_ref::eType_tRNA;
2937  break;
2939  rnatyp = CRNA_ref::eType_rRNA;
2940  break;
2942  rnatyp = CRNA_ref::eType_ncRNA;
2943  rrp.SetExt().SetGen().SetClass("snRNA");
2944  break;
2946  rnatyp = CRNA_ref::eType_ncRNA;
2947  rrp.SetExt().SetGen().SetClass("scRNA");
2948  break;
2950  rnatyp = CRNA_ref::eType_ncRNA;
2951  rrp.SetExt().SetGen().SetClass("snoRNA");
2952  break;
2954  rnatyp = CRNA_ref::eType_ncRNA;
2955  rrp.SetExt().SetGen();
2956  break;
2958  rnatyp = CRNA_ref::eType_tmRNA;
2959  rrp.SetExt().SetGen();
2960  break;
2962  rrp.SetExt().SetName("misc_RNA");
2963  rnatyp = CRNA_ref::eType_other;
2964  break;
2965  default :
2966  break;
2967  }
2968  rrp.SetType (rnatyp);
2969 
2970  } else if (typ == CSeqFeatData::e_Imp) {
2971  CImp_feat_Base& imp = sfdata.SetImp ();
2972  imp.SetKey (feat);
2973 
2974  } else if (typ == CSeqFeatData::e_Bond) {
2976 
2977  } else if (typ == CSeqFeatData::e_Site) {
2979  } else if (typ == CSeqFeatData::e_Prot ) {
2980  CProt_ref &prot_ref = sfdata.SetProt();
2981  switch (sbtyp) {
2982  default:
2983  break;
2986  break;
2989  break;
2992  break;
2995  break;
2998  break;
2999  }
3000  }
3001 
3002  // check for discouraged feature name
3004  if( CSeqFeatData::IsDiscouragedSubtype(sbtyp) ) {
3005  x_ProcessMsg(
3007  eDiag_Warning, feat);
3008  }
3009  }
3010 
3011  return true;
3012  }
3013 
3014  // unrecognized feature key
3015 
3018  }
3019 
3021 
3022  sfp->SetData ().Select (CSeqFeatData::e_Imp);
3023  CSeqFeatData& sfdata = sfp->SetData ();
3024  CImp_feat_Base& imp = sfdata.SetImp ();
3025  imp.SetKey ("misc_feature");
3026  x_AddQualifierToFeature (sfp, kEmptyStr, "standard_name", feat, flags);
3027 
3028  return true;
3029 
3030  } else if ((flags & CFeature_table_reader::fKeepBadKey) != 0) {
3031 
3032  sfp->SetData ().Select (CSeqFeatData::e_Imp);
3033  CSeqFeatData& sfdata = sfp->SetData ();
3034  CImp_feat_Base& imp = sfdata.SetImp ();
3035  imp.SetKey (feat);
3036 
3037  return true;
3038  }
3039 
3040  return false;
3041 }
3042 
3044  ILineError::EProblem eProblem,
3045  EDiagSev eSeverity,
3046  const string& strFeatureName,
3047  const string& strQualifierName,
3048  const string& strQualifierValue,
3049  const string& strErrorMessage,
3050  const ILineError::TVecOfLines & vecOfOtherLines)
3051 {
3052  x_ProcessMsg(m_reader ? static_cast<unsigned>(m_reader->GetLineNumber()) : m_LineNumber,
3053  eProblem,
3054  eSeverity,
3055  strFeatureName,
3056  strQualifierName,
3057  strQualifierValue,
3058  strErrorMessage,
3059  vecOfOtherLines);
3060 }
3061 
3062 
3064  int line_num,
3065  ILineError::EProblem eProblem,
3066  EDiagSev eSeverity,
3067  const string & strFeatureName,
3068  const string & strQualifierName,
3069  const string & strQualifierValue,
3070  const string& strErrorMessage,
3071  const ILineError::TVecOfLines & vecOfOtherLines )
3072 {
3073 
3074  if (!m_pMessageListener) {
3075  return;
3076  }
3077 
3080  eSeverity, line_num, strErrorMessage, eProblem, m_real_seqid, strFeatureName,
3081  strQualifierName, strQualifierValue));
3082  ITERATE( ILineError::TVecOfLines, line_it, vecOfOtherLines ) {
3083  pErr->AddOtherLine(*line_it);
3084  }
3085 
3086  if (!m_pMessageListener->PutError(*pErr)) {
3087  pErr->Throw();
3088  }
3089 }
3090 
3091 
3093  const CTempString& seq_id,
3094  const unsigned int line_number,
3095  ILineErrorListener* pListener)
3096 {
3097  if (!pListener) {
3098  return;
3099  }
3100 
3101  string msg = "Seq-id " + seq_id + ", line " + NStr::IntToString(line_number);
3102  pListener->PutProgress(msg);
3103 }
3104 
3105 
3106 // Helper for CFeatureTableReader_Imp::ReadSequinFeatureTable: puts all of the
3107 // per-feature parsing state back to its starting values in one place,
3108 // just so we don't forget a step when we reset the feature.
//
// sfp                       in/out: re-pointed at a newly allocated, empty
//                           CSeq_feat.
// curr_feat_intervals_done  out: set back to false.
//
// Side effect: also clears the member flag m_need_check_strand.
3109 void CFeatureTableReader_Imp::x_ResetFeat(CRef<CSeq_feat> & sfp, bool & curr_feat_intervals_done)
3110 {
3111  m_need_check_strand = false;
3112  sfp.Reset(new CSeq_feat);
3113  //sfp->ResetLocation();   (disabled call; the whole feature object is replaced on the line above)
3114  curr_feat_intervals_done = false;
3115 }
3116 
3118 {
3119  if (feat.IsSetLocation() && feat.GetLocation().IsMix())
3120  {
3121  const CSeq_loc& last = *feat.GetLocation().GetMix().Get().back();
3122  if (last.IsInt() && last.GetInt().IsSetStrand())
3123  {
3124  strand = last.GetInt().GetStrand();
3125  }
3126  else
3127  if (last.IsPnt() && last.GetPnt().IsSetStrand())
3128  {
3129  strand = last.GetPnt().GetStrand();
3130  }
3131  }
3132 }
3133 
3135 {
3136  if (feat.IsSetLocation() && feat.GetLocation().IsMix())
3137  {
3138 
3139  for (auto pSeqLoc : feat.SetLocation().SetMix().Set()) {
3140  if (pSeqLoc->IsPnt()) {
3141  auto& seq_point = pSeqLoc->SetPnt();
3142  const auto old_strand =
3143  seq_point.IsSetStrand() ?
3144  seq_point.GetStrand() :
3146 
3147  seq_point.SetStrand(strand);
3148  if (old_strand != strand) {
3149  const bool is_5p_partial = seq_point.IsPartialStop(eExtreme_Biological);
3150  const bool is_3p_partial = seq_point.IsPartialStart(eExtreme_Biological);
3151  seq_point.SetPartialStart(is_5p_partial, eExtreme_Biological);
3152  seq_point.SetPartialStop(is_3p_partial, eExtreme_Biological);
3153  }
3154  }
3155  }
3156  }
3157 }
3158 
3159 
3161  TFtable& ftable)
3162 {
3163  if ( !feat ||
3164  feat.Empty() ||
3165  !feat->IsSetData() ||
3166  (feat->GetData().Which() == CSeqFeatData::e_not_set) )
3167  {
3168  return;
3169  }
3170 
3171  // Check for missing publication - RW-626
3172  if (feat->GetData().GetSubtype() == CSeqFeatData::eSubtype_pub &&
3173  (!feat->SetData().SetPub().IsSetPub() ||
3174  feat->SetData().SetPub().GetPub().Get().empty())) {
3175  const int line_number = m_reader->AtEOF() ?
3176  static_cast<unsigned>(m_reader->GetLineNumber()) :
3177  static_cast<unsigned>(m_reader->GetLineNumber())-1;
3178 
3179  string msg = "Reference feature is empty. Skipping feature.";
3180 
3181  x_ProcessMsg(line_number,
3183  eDiag_Warning,
3184  "Reference",
3185  kEmptyStr,
3186  kEmptyStr,
3187  msg);
3188  return;
3189  }
3190 
3191  if (feat->IsSetLocation() && feat->GetLocation().IsMix())
3192  {
3193  if (feat->GetLocation().GetMix().Get().empty()) {
3194  // turn empty seqlocmix into a null seq-loc
3195  feat->SetLocation().SetNull();
3196  }
3197  else
3198  if (feat->GetLocation().GetMix().Get().size() == 1) {
3199  // demote 1-part seqlocmixes to seq-loc with just that part
3200  CRef<CSeq_loc> keep_loc = *feat->SetLocation().SetMix().Set().begin();
3201  feat->SetLocation(*keep_loc);
3202  }
3203  }
3204  ftable.push_back(feat);
3205 }
3206 
3207 
3208 
// Handles one qualifier (name/value pair) read for the feature `feat`.
//
// Visible control flow:
//   1. A blank qual_name is ignored.
//   2. A null `feat` ends processing; the visible argument tail
//      (eDiag_Warning, kEmptyStr, qual_name, qual_val) suggests a warning is
//      emitted first.
//   3. A blank qual_val is accepted only when qual_name is found in
//      sc_SingleKeys; it is then forwarded to x_AddQualifierToFeature.
//      Otherwise the visible argument tail suggests a warning.
//   4. Otherwise the pair is forwarded to x_AddQualifierToFeature; when that
//      returns false the fallback path below runs and may store the pair
//      through x_AddGBQualToFeature.
//
// qual_name  qualifier key
// qual_val   qualifier value (may be blank, see step 3)
// feat_name  name of the owning feature, forwarded to callees/messages
// feat       feature to receive the qualifier (may be null, see step 2)
// flags      reader flags, forwarded to x_AddQualifierToFeature
//
// NOTE(review): this listing is missing source lines 3220-3221, 3232,
// 3240-3241 and 3245. The calls whose argument tails remain below are
// presumably x_ProcessMsg warnings, and the missing conditions presumably
// test `flags` - confirm against the complete file.
3209 void CFeatureTableReader_Imp::x_ProcessQualifier(const string& qual_name,
3210  const string& qual_val,
3211  const string& feat_name,
3212  CRef<CSeq_feat> feat,
3213  TFlags flags)
3214 {
      // nothing to do without a qualifier key
3215  if (NStr::IsBlank(qual_name)) {
3216  return;
3217  }
3218 
      // qualifier arrived with no feature to attach it to
3219  if (!feat) {
3222  eDiag_Warning, kEmptyStr, qual_name, qual_val);
3223  }
3224  return;
3225  }
3226 
      // value-less qualifier: only keys listed in sc_SingleKeys are accepted
3227  if (NStr::IsBlank(qual_val)) {
3228  if (sc_SingleKeys.find(qual_name.c_str()) != sc_SingleKeys.end()) {
3229  x_AddQualifierToFeature(feat, feat_name, qual_name, qual_val, flags);
3230  }
3231  else {
3233  eDiag_Warning, feat_name, qual_name);
3234  }
3235  return;
3236  }
3237 
3238  // else qual_name and qual_val are not blank
3239  if (!x_AddQualifierToFeature(feat, feat_name, qual_name, qual_val, flags)) {
3242  eDiag_Warning, feat_name, qual_name, qual_val);
3243  }
3244 
      // fallback: keep the pair as a GBQual (guarding condition not visible here)
3246  x_AddGBQualToFeature(feat, qual_name, qual_val);
3247  }
3248  }
3249 }
3250 
3251 
3252 
3254  const CTempString& in_seqid,
3255  const CTempString& in_annotname,
3256  const TFlags flags,
3257  ITableFilter *filter
3258 )
3259 {
3260  string feat, qual, qual_value;
3261  string curr_feat_name;
3262  // Int4 start, stop;
3263  //bool partial5, partial3, ispoint, isminus,
3264 
3265  bool ignore_until_next_feature_key = false;
3266  Int4 offset = 0;
3267  SFeatLocInfo loc_info;
3268 
3269  CRef<CSeq_annot> sap(new CSeq_annot);
3270 
3271  TFtable& ftable = sap->SetData().SetFtable();
3272  const bool bIgnoreWebComments =
3274 
3275  // if sequence ID is a list, use just one sequence ID string
3276  x_InitId(in_seqid, flags);
3277 
3278  // Use this to efficiently find the best CDS for a prot feature
3279  // (only add CDS's for it to work right)
3280  CBestFeatFinder best_CDS_finder;
3281 
3282  // map feature types to features
3283  TChoiceToFeatMap choiceToFeatMap;
3284 
3285  CRef<CSeq_feat> sfp;
3286  // This is true once this feature should not
3287  // have any more intervals.
3288  // This allows us to catch errors like the following:
3289  //
3290  //
3291  //>Feature lcl|Seq1
3292  //1 1008 CDS
3293  // gene THE_GENE_NAME
3294  //50 200
3295  // product THE_GENE_PRODUCT
3296  bool curr_feat_intervals_done = false;
3297 
3298  if (! in_annotname.empty ()) {
3299  CAnnot_descr& descr = sap->SetDesc ();
3300  CRef<CAnnotdesc> annot(new CAnnotdesc);
3301  annot->SetName (in_annotname);
3302  descr.Set().push_back (annot);
3303  }
3304 
3305  while ( !m_reader->AtEOF() ) {
3306 
3307  CTempString line = *++(*m_reader);
3308 
3309  if( m_reader->GetLineNumber() % 10000 == 0 &&
3310  m_reader->GetLineNumber() > 0 )
3311  {
3312  PutProgress(m_real_seqid, static_cast<unsigned>(m_reader->GetLineNumber()), m_pMessageListener);
3313  }
3314 
3315  // skip empty lines.
3316  // if requested, also skip webcomment lines
3317  if( line.empty () || (bIgnoreWebComments && x_IsWebComment(line) ) ) {
3318  continue;
3319  }
3320 
3321  // if next line is a new feature table, return current sap
3322  CTempStringEx dummy1, dummy2;
3323  if( ParseInitialFeatureLine(line, dummy1, dummy2) ) {
3324  m_reader->UngetLine(); // we'll get this feature line the next time around
3325  break;
3326  }
3327 
3328  if (line [0] == '[') {
3329 
3330  // try to parse it as an offset
3331  if( x_TryToParseOffset(line, offset) ) {
3332  // okay, known command
3333  } else {
3334  // warn for unknown square-bracket commands
3335  x_ProcessMsg(
3337  eDiag_Warning);
3338  }
3339 
3340  } else if ( s_LineIndicatesOrder(line) ) {
3341 
3342  // put nulls between feature intervals
3343  CRef<CSeq_loc> loc_with_nulls = s_LocationJoinToOrder( sfp->GetLocation() );
3344  // loc_with_nulls is unset if no change was needed
3345  if( loc_with_nulls ) {
3346  sfp->SetLocation( *loc_with_nulls );
3347  }
3348 
3349  } else if (x_ParseFeatureTableLine (line, loc_info, feat, qual, qual_value, offset)) {
3350  // process line in feature table
3351 
3352  replace( qual_value.begin(), qual_value.end(), '\"', '\'' );
3353 
3354  if ((! feat.empty ()) && loc_info.start_pos >= 0 && loc_info.stop_pos >= 0) {
3355 
3356  // process start - stop - feature line
3357 
3358  x_FinishFeature(sfp, ftable);
3359  x_ResetFeat( sfp, curr_feat_intervals_done );
3360 
3361  if (x_SetupSeqFeat (sfp, feat, flags, filter)) {
3362 
3363  // figure out type of feat, and store in map for later use
3365  if( sfp->CanGetData() ) {
3366  eChoice = sfp->GetData().Which();
3367  }
3368  choiceToFeatMap.insert(
3370  eChoice,
3371  SFeatAndLineNum(sfp, static_cast<unsigned>(m_reader->GetLineNumber()))));
3372 
3373  // if new feature is a CDS, remember it for later lookups
3374  if( eChoice == CSeqFeatData::e_Cdregion ) {
3375  best_CDS_finder.AddFeat( *sfp );
3376  }
3377 
3378  // and add first interval
3379  x_AddIntervalToFeature (curr_feat_name, sfp, loc_info);
3380 
3381  ignore_until_next_feature_key = false;
3382 
3383  curr_feat_name = feat;
3384 
3385  } else {
3386 
3387  // bad feature, set ignore flag
3388 
3389  ignore_until_next_feature_key = true;
3390  }
3391 
3392  } else if (ignore_until_next_feature_key) {
3393 
3394  // bad feature was found before, so ignore
3395  // qualifiers until next feature key
3396 
3397  }
3398  else
3399  if (loc_info.start_pos >= 0 &&
3400  loc_info.stop_pos >= 0 &&
3401  feat.empty () &&
3402  qual.empty () &&
3403  qual_value.empty ()) {
3404 
3405  if( curr_feat_intervals_done ) {
3406  // the feat intervals were done, so it's an error for there to be more intervals
3408  // this feature is in bad shape, so we ignore the rest of it
3409  ignore_until_next_feature_key = true;
3410  x_ResetFeat(sfp, curr_feat_intervals_done);
3411  } else if (sfp && sfp->IsSetLocation() && sfp->GetLocation().IsMix()) {
3412  // process start - stop multiple interval line
3413  x_AddIntervalToFeature (curr_feat_name, sfp, loc_info);
3414  // start, stop, partial5, partial3, ispoint, isminus);
3415  } else {
3418  eDiag_Warning);
3419  }
3420  }
3421 
3422  } else if (!NStr::IsBlank(qual)) {
3423  curr_feat_intervals_done = true;
3424  x_ProcessQualifier(qual, qual_value, curr_feat_name, sfp, flags);
3425  }
3426  else if (!feat.empty()) {
3427 
3428  // unrecognized location
3429 
3430  // there should be no more ranges for this feature
3431  // (although there still can be ranges for quals, of course).
3432  curr_feat_intervals_done = true;
3433 
3435  x_ProcessMsg(
3437  feat );
3438  }
3439  }
3440  }
3441  }
3442 
3443  // make sure last feature is finished
3444  x_FinishFeature(sfp, ftable);
3445  x_ResetFeat( sfp, curr_feat_intervals_done );
3446 
3449  {
3450  x_CreateGenesFromCDSs(sap, choiceToFeatMap, flags);
3451  }
3452  return sap;
3453 }
3454 
3455 
3457  const string& feat,
3458  CSeq_loc& location,
3459  const TFlags flags,
3460  const string &seq_id,
3461  ITableFilter *filter
3462 )
3463 
3464 {
3465  CRef<CSeq_feat> sfp (new CSeq_feat);
3466 
3467  sfp->ResetLocation ();
3468 
3469  if ( ! x_SetupSeqFeat (sfp, feat, flags, filter) ) {
3470 
3471  // bad feature, make dummy
3472  sfp->SetData ().Select (CSeqFeatData::e_not_set);
3473  }
3474  sfp->SetLocation (location);
3475 
3476  return sfp;
3477 }
3478 
3480 {
3481  if (!NStr::IsBlank(seq_id)) {
3482  CBioseq::TId ids;
3483  CSeq_id::ParseIDs(ids, seq_id,
3485 
3486  m_seq_id.Reset();
3488  {
3489  for (auto id : ids)
3490  {
3491  if (id->IsGenbank())
3492  m_seq_id = id;
3493  }
3494  };
3495 
3496  if (m_seq_id.Empty())
3497  m_seq_id = ids.front();
3498 
3499  m_real_seqid.clear();
3501  }
3502 }
3503 
3505  CRef<CSeq_feat> sfp,
3506  const string& feat_name,
3507  const string& qual,
3508  const string& val,
3509  const TFlags flags,
3510  const string &seq_id1 )
3511 
3512 {
3513  x_InitId(seq_id1, flags);
3514 
3515  if (NStr::IsBlank(qual)) {
3516  return;
3517  }
3518 
3519  if (!val.empty ()) { // Should probably use NStr::IsBlank()
3520  if (! x_AddQualifierToFeature (sfp, feat_name, qual, val, flags)) {
3521  // unrecognized qualifier key
3523  ERR_POST_X (5, Warning << "Unrecognized qualifier '" << qual << "'");
3524  }
3526  x_AddGBQualToFeature (sfp, qual, val);
3527  }
3528  }
3529  }
3530  else { // empty val
3531  // check for the few qualifiers that do not need a value
3532  auto s_iter = sc_SingleKeys.find (qual.c_str ());
3533  if (s_iter != sc_SingleKeys.end ()) {
3534  x_AddQualifierToFeature (sfp, feat_name, qual, val, flags);
3535  }
3536  }
3537 }
3538 
3539 // static
3541  const CTempString& line_arg,
3542  CTempStringEx& out_seqid,
3543  CTempStringEx& out_annotname )
3544 {
3545  out_seqid.clear();
3546  out_annotname.clear();
3547 
3548  // copy the line_arg because we can't edit line_arg itself
3549  CTempString line = line_arg;
3550 
3551  // handle ">"
3553  if( ! NStr::StartsWith(line, ">") ) {
3554  return false;
3555  }
3556  line = line.substr(1); // remove '>'
3557 
3558  // handle "Feature"
3560  const CTempString kFeatureStr("Feature");
3561  if( ! NStr::StartsWith(line, kFeatureStr, NStr::eNocase) ) {
3562  return false;
3563  }
3564  line = line.substr( kFeatureStr.length() ); // remove "Feature"
3565 
3566  // throw out any non-space characters at the beginning,
3567  // so we can, for example, handle ">Features" (note the "s")
3568  while( !line.empty() && !isspace(line[0]) ) {
3569  line = line.substr(1);
3570  }
3571 
3572  // extract seqid and annotname
3574  NStr::SplitInTwo(line, " \t", out_seqid, out_annotname, NStr::fSplit_Tokenize);
3575 
3576  return true;
3577 }
3578 
3579 
3580 // public access functions
3581 
3583  TReaderFlags fReaderFlags)
3584  : CReaderBase(fReaderFlags)
3585 {
3586 }
3587 
3589  ILineReader& lr,
3590  ILineErrorListener* pErrors) :
3591  CReaderBase(0),
3592  m_pImpl(new CFeatureTableReader_Imp(&lr, 0, pErrors))
3593  {}
3594 
3597  ILineReader &lr, ILineErrorListener *pMessageListener)
3598 {
3599  CRef<CSerialObject> object(
3600  ReadSeqAnnot( lr, pMessageListener ).ReleaseOrNull() );
3601  return object;
3602 }
3603 
3604 
3607  ILineReader &lr, ILineErrorListener *pMessageListener)
3608 {
3609  return ReadSequinFeatureTable(lr, m_iFlags, pMessageListener);
3610 }
3611 
3612 
3614  CNcbiIstream& ifs,
3615  const string& seqid,
3616  const string& annotname,
3617  const TFlags flags,
3618  ILineErrorListener* pMessageListener,
3619  ITableFilter *filter
3620 )
3621 {
3622  CStreamLineReader reader(ifs);
3623  return ReadSequinFeatureTable(reader, seqid, annotname, flags, pMessageListener, filter);
3624 }
3625 
3627  ILineReader& reader,
3628  const string& seqid,
3629  const string& annotname,
3630  const TFlags flags,
3631  ILineErrorListener* pMessageListener,
3632  ITableFilter *filter
3633 )
3634 {
3635  // just read features from 5-column table
3636  CFeatureTableReader_Imp impl(&reader, 0, pMessageListener);
3637  return impl.ReadSequinFeatureTable(seqid, annotname, flags, filter);
3638 }
3639 
3641  CFeatureTableReader_Imp& reader,
3642  const CTempString& seqid,
3643  const CTempString& annot_name,
3644  TFlags flags,
3645  ITableFilter* filter) {
3646  return reader.ReadSequinFeatureTable(seqid, annot_name, flags, filter);
3647 }
3648 
3649 
3651  CNcbiIstream& ifs,
3652  const TFlags flags,
3653  ILineErrorListener* pMessageListener,
3654  ITableFilter *filter
3655 )
3656 {
3657  CStreamLineReader reader(ifs);
3658  return ReadSequinFeatureTable(reader, flags, pMessageListener, filter);
3659 }
3660 
3661 
3663  CFeatureTableReader_Imp& reader,
3664  const TFlags flags,
3665  ITableFilter* filter,
3666  const string& seqid_prefix)
3667 {
3668  auto pLineReader = reader.GetLineReaderPtr();
3669  if (!pLineReader) {
3670  return CRef<CSeq_annot>();
3671  }
3672 
3673 
3674  CTempStringEx orig_seqid, annotname;
3675  // first look for >Feature line, extract seqid and optional annotname
3676  while (orig_seqid.empty () && !pLineReader->AtEOF() ) {
3677  CTempString line = *++(*pLineReader);
3678  if( ParseInitialFeatureLine(line, orig_seqid, annotname) ) {
3680  static_cast<unsigned>(pLineReader->GetLineNumber()),
3681  reader.GetErrorListenerPtr());
3682  }
3683  }
3684 
3685  string temp_seqid;
3686  if (seqid_prefix.empty()) {
3687  //seqid = orig_seqid;
3688  } else {
3689  if (orig_seqid.find('|') == string::npos)
3690  temp_seqid = seqid_prefix + orig_seqid;
3691  else
3692  if (NStr::StartsWith(orig_seqid, "lcl|"))
3693  {
3694  temp_seqid = seqid_prefix + orig_seqid.substr(4);
3695  }
3696  orig_seqid = temp_seqid;
3697  }
3698  return x_ReadFeatureTable(reader, orig_seqid, annotname, flags, filter);
3699 }
3700 
3701 
3703  ILineReader& reader,
3704  const TFlags flags,
3705  ILineErrorListener* pMessageListener,
3706  ITableFilter* pFilter,
3707  const string& seqid_prefix
3708 )
3709 {
3710  CFeatureTableReader_Imp ftable_reader(&reader, 0, pMessageListener);
3711  return x_ReadFeatureTable(ftable_reader, flags, pFilter, seqid_prefix);
3712 }
3713 
3714 
3716  const TFlags flags,
3717  ITableFilter* pFilter,
3718  const string& seqid_prefix
3719 )
3720 {
3721  return x_ReadFeatureTable(*m_pImpl, flags, pFilter, seqid_prefix);
3722 }
3723 
3724 
3726  CNcbiIstream& ifs,
3727  CSeq_entry& entry,
3728  const TFlags flags,
3729  ILineErrorListener* pMessageListener,
3730  ITableFilter *filter
3731 )
3732 {
3733  CStreamLineReader reader(ifs);
3734  return ReadSequinFeatureTables(reader, entry, flags, pMessageListener, filter);
3735 }
3736 
3737 void
3739  const list<string>& stringFlags,
3740  TFlags& baseFlags)
3741 {
3742  static const map<string, CFeature_table_reader::TReaderFlags> flagsMap = {
3743  { "KeepBadKey", CFeature_table_reader::fKeepBadKey},
3744  { "TranslateBadKey", CFeature_table_reader::fTranslateBadKey},
3745  { "IgnoreWebComments", CFeature_table_reader::fIgnoreWebComments},
3746  { "CreateGenesFromCDSs", CFeature_table_reader::fCreateGenesFromCDSs},
3747  { "CDSsMustBeInTheirGenes", CFeature_table_reader::fCDSsMustBeInTheirGenes},
3748  { "ReportDiscouragedKey", CFeature_table_reader::fReportDiscouragedKey},
3749  { "LeaveProteinIds", CFeature_table_reader::fLeaveProteinIds},
3750  { "AllIdsAsLocal", CFeature_table_reader::fAllIdsAsLocal},
3751  { "PreferGenbankId", CFeature_table_reader::fPreferGenbankId},
3752  { "SuppressBadKeyWarning", CFeature_table_reader::fSuppressBadKeyWarning},
3753  };
3754 
3755  return CReaderBase::xAddStringFlagsWithMap(stringFlags, flagsMap, baseFlags);
3756 };
3757 
3758 
// Comparison functor for containers keyed by `const CSeq_id*`.
// It orders keys by the CSeq_id objects they point to (using CSeq_id's
// operator<), not by pointer address, so two distinct pointers to
// equivalent ids compare as the same key.
// Neither pointer is checked for null before being dereferenced.
// NOTE(review): suitability as a map comparator relies on CSeq_id's
// operator< being a strict weak ordering - confirm in CSeq_id.
3759 struct SCSeqidCompare
3760 {
3761  inline
3762  bool operator()(const CSeq_id* left, const CSeq_id* right) const
3763  {
3764  return *left < *right;   // compare the referenced ids, not the addresses
3765  };
3766 };
3767 
3769  ILineReader& reader,
3770  CSeq_entry& entry,
3771  const TFlags flags,
3772  ILineErrorListener* pMessageListener,
3773  ITableFilter *filter
3774 )
3775 {
3776  // let's use map to speedup matching on very large files, see SQD-1847
3777  map<const CSeq_id*, CRef<CBioseq>, SCSeqidCompare> seq_map;
3778 
3779  for (CTypeIterator<CBioseq> seqit(entry); seqit; ++seqit) {
3780  ITERATE (CBioseq::TId, seq_id, seqit->GetId()) {
3781  seq_map[seq_id->GetPointer()].Reset(&*seqit);
3782  }
3783  }
3784 
3785  CFeatureTableReader_Imp ftable_reader(&reader, 0, pMessageListener);
3786  while ( !reader.AtEOF() ) {
3787  auto annot = x_ReadFeatureTable(ftable_reader, flags, filter);
3788  //CRef<CSeq_annot> annot = ReadSequinFeatureTable(reader, flags, pMessageListener, filter);
3789  if (entry.IsSeq()) { // only one place to go
3790  entry.SetSeq().SetAnnot().push_back(annot);
3791  continue;
3792  }
3793  _ASSERT(annot->GetData().IsFtable());
3794  if (annot->GetData().GetFtable().empty()) {
3795  continue;
3796  }
3797  // otherwise, take the first feature, which should be representative
3798  const CSeq_feat& feat = *annot->GetData().GetFtable().front();
3799  const CSeq_id* feat_id = feat.GetLocation().GetId();
3800  CBioseq* seq = nullptr;
3801  _ASSERT(feat_id); // we expect a uniform sequence ID
3802  seq = seq_map[feat_id].GetPointer();
3803  if (seq) { // found a match
3804  seq->SetAnnot().push_back(annot);
3805  } else { // just package on the set
3806  ERR_POST_X(6, Warning
3807  << "ReadSequinFeatureTables: unable to find match for "
3808  << feat_id->AsFastaString());
3809  entry.SetSet().SetAnnot().push_back(annot);
3810  }
3811  }
3812 }
3813 
3814 
3816  const string& feat,
3817  CSeq_loc& location,
3818  const TFlags flags,
3819  ILineErrorListener* pMessageListener,
3820  unsigned int line_number,
3821  string *seq_id,
3822  ITableFilter *filter
3823 )
3824 {
3825  CFeatureTableReader_Imp impl(nullptr, line_number, pMessageListener);
3826  return impl.CreateSeqFeat (feat, location, flags, (seq_id ? *seq_id : string() ), filter);
3827 }
3828 
3829 
3831  CRef<CSeq_feat> sfp,
3832  const string& feat_name,
3833  const string& qual,
3834  const string& val,
3836  ILineErrorListener* pMessageListener,
3837  int line_number,
3838  const string &seq_id
3839 )
3840 
3841 {
3842  CFeatureTableReader_Imp impl(nullptr, line_number, pMessageListener);
3843  impl.AddFeatQual (sfp, feat_name, qual, val, flags, seq_id) ;
3844 }
3845 
3846 bool
3848  const CTempString& line_arg,
3849  CTempStringEx& out_seqid,
3850  CTempStringEx& out_annotname )
3851 {
3852  return CFeatureTableReader_Imp::ParseInitialFeatureLine(line_arg, out_seqid, out_annotname);
3853 }
3854 
3855 
3857 
3858 END_objects_SCOPE
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:69
AutoPtr –.
Definition: ncbimisc.hpp:401
CAnnot_descr –.
Definition: Annot_descr.hpp:66
CAnnotdesc –.
Definition: Annotdesc.hpp:66
bool AddFeat(const CSeq_feat &new_cds)
CCdregion –.
Definition: Cdregion.hpp:66
Definition: Dbtag.hpp:53
bool x_AddQualifierToFeature(CRef< CSeq_feat > sfp, const string &feat_name, const string &qual, const string &val, const TFlags flags)
Definition: readfeat.cpp:2343
static void PutProgress(const CTempString &seq_id, const unsigned int line_number, ILineErrorListener *pListener)
Definition: readfeat.cpp:3092
CSeq_annot::C_Data::TFtable TFtable
Definition: readfeat.cpp:297
bool x_TryToParseOffset(const CTempString &sLine, Int4 &out_offset)
Definition: readfeat.cpp:866
void AddFeatQual(CRef< CSeq_feat > sfp, const string &feat_name, const string &qual, const string &val, const TFlags flags, const string &seq_id)
Definition: readfeat.cpp:3504
bool x_AddQualifierToImp(CRef< CSeq_feat > sfp, CSeqFeatData &sfdata, EQual qtype, const string &qual, const string &val)
Definition: readfeat.cpp:1827
static bool ParseInitialFeatureLine(const CTempString &line_arg, CTempStringEx &out_seqid, CTempStringEx &out_annotname)
Definition: readfeat.cpp:3540
bool x_AddNoteToFeature(CRef< CSeq_feat > sfp, const string &note)
Definition: readfeat.cpp:2303
CConstRef< CSeq_feat > TFeatConstRef
Definition: readfeat.cpp:418
void x_TokenizeStrict(const CTempString &line, vector< string > &out_tokens)
Definition: readfeat.cpp:1174
bool x_AddQualifierToGene(CSeqFeatData &sfdata, EQual qtype, const string &val)
Definition: readfeat.cpp:1293
ILineReader *const GetLineReaderPtr(void)
Definition: readfeat.cpp:334
string x_TrnaToAaString(const string &val)
Definition: readfeat.cpp:1497
bool x_AddIntervalToFeature(CTempString strFeatureName, CRef< CSeq_feat > &sfp, const SFeatLocInfo &loc_info)
Definition: readfeat.cpp:2793
unsigned int m_LineNumber
Definition: readfeat.cpp:501
void x_ProcessQualifier(const string &qual_name, const string &qual_val, const string &feat_name, CRef< CSeq_feat > feat, TFlags flags)
Definition: readfeat.cpp:3209
void x_InitId(const CTempString &seq_id, const TFlags flags)
Definition: readfeat.cpp:3479
multimap< CSeqFeatData::E_Choice, SFeatAndLineNum > TChoiceToFeatMap
Definition: readfeat.cpp:444
ILineReader * m_reader
Definition: readfeat.cpp:500
bool x_AddQualifierToRna(CRef< CSeq_feat > sfp, EQual qtype, const string &val)
Definition: readfeat.cpp:1666
void x_ResetFeat(CRef< CSeq_feat > &feat, bool &curr_feat_intervals_done)
Definition: readfeat.cpp:3109
CRef< CSeq_annot > ReadSequinFeatureTable(const CTempString &seqid, const CTempString &annotname, const TFlags flags, ITableFilter *filter)
Definition: readfeat.cpp:3253
bool x_ParseTrnaExtString(CTrna_ext &ext_trna, const string &str)
Definition: readfeat.cpp:1518
void x_CreateGenesFromCDSs(CRef< CSeq_annot > sap, TChoiceToFeatMap &choiceToFeatMap, const TFlags flags)
Definition: readfeat.cpp:2081
long x_StringToLongNoThrow(CTempString strToConvert, CTempString strFeatureName, CTempString strQualifierName, ILineError::EProblem eProblem=ILineError::eProblem_Unset)
Definition: readfeat.cpp:1621
ILineErrorListener * m_pMessageListener
Definition: readfeat.cpp:502
void x_FinishFeature(CRef< CSeq_feat > &feat, TFtable &ftable)
Definition: readfeat.cpp:3160
CFeatureTableReader_Imp(ILineReader *reader, unsigned int line_num, ILineErrorListener *pMessageListener)
Definition: readfeat.cpp:855
bool x_AddGBQualToFeature(CRef< CSeq_feat > sfp, const string &qual, const string &val)
Definition: readfeat.cpp:2044
bool x_IsWebComment(CTempString line)
Definition: readfeat.cpp:2740
CRef< CSeq_id > m_seq_id
Definition: readfeat.cpp:499
CFeature_table_reader::TFlags TFlags
Definition: readfeat.cpp:296
ILineErrorListener *const GetErrorListenerPtr(void)
Definition: readfeat.cpp:338
bool x_StringIsJustQuotes(const string &str)
Definition: readfeat.cpp:1429
void x_GetPointStrand(const CSeq_feat &feat, CSeq_interval::TStrand &strand) const
Definition: readfeat.cpp:3117
CRef< CSeq_feat > CreateSeqFeat(const string &feat, CSeq_loc &location, const TFlags flags, const string &seq_id, ITableFilter *filter)
Definition: readfeat.cpp:3456
SIZE_TYPE x_MatchingParenPos(const string &str, SIZE_TYPE open_paren_pos)
Definition: readfeat.cpp:1586
bool x_AddQualifierToCdregion(CRef< CSeq_feat > sfp, CSeqFeatData &sfdata, EQual qtype, const string &val)
Definition: readfeat.cpp:1333
void x_TokenizeLenient(const CTempString &line, vector< string > &out_tokens)
Definition: readfeat.cpp:1214
void x_UpdatePointStrand(CSeq_feat &feat, CSeq_interval::TStrand strand) const
Definition: readfeat.cpp:3134
unordered_set< string > m_ProcessedProteinIds
Definition: readfeat.cpp:504
bool x_ParseFeatureTableLine(const CTempString &line, SFeatLocInfo &loc_info, string &feat, string &qual, string &val, Int4 offset)
Definition: readfeat.cpp:912
bool x_AddQualifierToBioSrc(CSeqFeatData &sfdata, const string &feat_name, EOrgRef rtype, const string &val)
Definition: readfeat.cpp:1942
CFeatureTableReader_Imp & operator=(const CFeatureTableReader_Imp &value)
bool x_AddCodons(const string &val, CTrna_ext &trna_ext) const
Definition: readfeat.cpp:1793
bool x_SetupSeqFeat(CRef< CSeq_feat > sfp, const string &feat, const TFlags flags, ITableFilter *filter)
Definition: readfeat.cpp:2893
unordered_set< string > m_ProcessedTranscriptIds
Definition: readfeat.cpp:503
void x_ProcessMsg(int line_num, ILineError::EProblem eProblem, EDiagSev eSeverity, const std::string &strFeatureName=kEmptyStr, const std::string &strQualifierName=kEmptyStr, const std::string &strQualifierValue=kEmptyStr, const std::string &strErrorMessage=kEmptyStr, const ILineError::TVecOfLines &vecOfOtherLines=ILineError::TVecOfLines())
void x_ProcessMsg(ILineError::EProblem eProblem, EDiagSev eSeverity, const std::string &strFeatureName=kEmptyStr, const std::string &strQualifierName=kEmptyStr, const std::string &strQualifierValue=kEmptyStr, const std::string &strErrorMessage=kEmptyStr, const ILineError::TVecOfLines &vecOfOtherLines=ILineError::TVecOfLines())
CFeatureTableReader_Imp(const CFeatureTableReader_Imp &value)
CFeature_table_reader(TReaderFlags fReaderFlags=0)
Definition: readfeat.cpp:3582
CRef< CSerialObject > ReadObject(ILineReader &lr, ILineErrorListener *pErrors) override
Read an object from a given line reader, render it as the most appropriate Genbank object.
Definition: readfeat.cpp:3596
long TFlags
binary OR of EFlags
Definition: readfeat.hpp:79
static bool ParseInitialFeatureLine(const CTempString &line_arg, CTempStringEx &out_seqid, CTempStringEx &out_annotname)
If line_arg is a feature line (e.g.
Definition: readfeat.cpp:3847
static void ReadSequinFeatureTables(ILineReader &reader, CSeq_entry &entry, const TFlags flags=0, ILineErrorListener *pMessageListener=nullptr, ITableFilter *filter=nullptr)
Definition: readfeat.cpp:3768
@ fSuppressBadKeyWarning
= 0x400 (Suppress 'bad key' errors; Not recommended.)
Definition: readfeat.hpp:77
@ fReportDiscouragedKey
= 0x40 (Report discouraged keys into the error container)
Definition: readfeat.hpp:73
@ fKeepBadKey
= 0x02 (As much as possible, try to use bad keys as if they were acceptable)
Definition: readfeat.hpp:68
@ fIgnoreWebComments
= 0x08 (ignore web comment lines such as lines that start with " INFO:", or consist of many equals si...
Definition: readfeat.hpp:70
@ fAllIdsAsLocal
= 0x100 (Do not attempt to parse accessions)
Definition: readfeat.hpp:75
@ fLeaveProteinIds
= 0x80 (Leave all protein_id as a qualifiers)
Definition: readfeat.hpp:74
@ fCreateGenesFromCDSs
= 0x10 (If a CDS has a gene xref, create a gene with the same intervals if one doesn't already exist....
Definition: readfeat.hpp:71
@ fPreferGenbankId
= 0x200 (Prefer Genbank accession ids)
Definition: readfeat.hpp:76
@ fTranslateBadKey
= 0x04 (yields misc_feature /standard_name="...")
Definition: readfeat.hpp:69
@ fCDSsMustBeInTheirGenes
= 0x20 (If a CDS has a gene xref, it *must* be inside of that gene)
Definition: readfeat.hpp:72
CRef< CSeq_annot > ReadSeqAnnot(ILineReader &lr, ILineErrorListener *pErrors) override
Read an object from a given line reader, render it as a single Seq-annot, if possible.
Definition: readfeat.cpp:3606
unique_ptr< CFeatureTableReader_Imp > m_pImpl
Definition: readfeat.hpp:191
static CRef< CSeq_annot > x_ReadFeatureTable(CFeatureTableReader_Imp &reader, const CTempString &seqid, const CTempString &annot_name, const TFlags flags, ITableFilter *filter)
Definition: readfeat.cpp:3640
static CRef< CSeq_feat > CreateSeqFeat(const string &feat, CSeq_loc &location, const TFlags flags=0, ILineErrorListener *pMessageListener=nullptr, unsigned int line=0, std::string *seq_id=nullptr, ITableFilter *filter=nullptr)
Definition: readfeat.cpp:3815
static void AddFeatQual(CRef< CSeq_feat > sfp, const string &feat_name, const string &qual, const string &val, const TFlags flags=0, ILineErrorListener *pMessageListener=nullptr, int line=0, const string &seq_id=std::string())
Definition: readfeat.cpp:3830
static void AddStringFlags(const list< string > &stringFlags, TFlags &baseFlags)
Definition: readfeat.cpp:3738
CRef< CSeq_annot > ReadSequinFeatureTable(const TFlags flags=0, ITableFilter *filter=nullptr, const string &seqid_prefix=kEmptyStr)
Definition: readfeat.cpp:3715
@Gb_qual.hpp User-defined methods of the data storage class.
Definition: Gb_qual.hpp:61
static const CTrans_table & GetTransTable(int id)
static int CodonToIndex(char base1, char base2, char base3)
*** Import *********************************************** * * Features imported from other databases...
Definition: Imp_feat_.hpp:77
static void GetPrefixAndRemainder(const string &inference, string &prefix, string &remainder)
Definition: Gb_qual.cpp:381
bool operator()(char c)
Definition: readfeat.cpp:1211
bool operator()(char c)
Definition: readfeat.cpp:1206
static CObjReaderLineException * Create(EDiagSev eSeverity, unsigned int uLine, const std::string &strMessage, EProblem eProblem=eProblem_GeneralParsingError, const std::string &strSeqId=string(""), const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), CObjReaderLineException::EErrCode eErrCode=eFormat, const TVecOfLines &vecOfOtherLines=TVecOfLines())
Please use this instead of the constructor because the ctor is protected.
Definition: line_error.cpp:194
@OrgMod.hpp User-defined methods of the data storage class.
Definition: OrgMod.hpp:54
Definition: Pub.hpp:56
CRNA_qual –.
Definition: RNA_qual.hpp:66
@RNA_ref.hpp User-defined methods of the data storage class.
Definition: RNA_ref.hpp:54
static void AddGeneOntologyTerm(CSeq_feat &feature, const CTempString &qual, const CTempString &val)
Definition: read_util.cpp:296
Defines and provides stubs for a general interface to a variety of file readers.
Definition: reader_base.hpp:63
long TReaderFlags
Definition: reader_base.hpp:84
TReaderFlags m_iFlags
static void xAddStringFlagsWithMap(const list< string > &stringFlags, const map< string, TReaderFlags > flagMap, TReaderFlags &baseFlags)
void SetProt(TProt &v)
void SetRegion(const TRegion &v)
void SetBiosrc(TBiosrc &v)
static bool IsDiscouragedQual(EQualifier qual)
EQualifier
List of available qualifiers for feature keys.
void SetBond(const TBond &v)
static bool CanHaveGene(ESubtype subtype)
void SetSite(const TSite &v)
static const CSiteList * GetSiteList()
void SetPub(TPub &v)
ESubtype GetSubtype(void) const
void SetImp(TImp &v)
static bool IsDiscouragedSubtype(ESubtype subtype)
static E_Choice GetTypeFromSubtype(ESubtype subtype)
void SetRna(TRna &v)
void SetCdregion(TCdregion &v)
@ eSubtype_bad
These no longer need to match the FEATDEF values in the C toolkit's objfdef.h.
@ eSubtype_transit_peptide_aa
static EQualifier GetQualifierType(CTempString qual)
convert qual string to enumerated value
static const CBondList * GetBondList()
static CTempString GetQualifierAsString(EQualifier qual)
Convert a qualifier from an enumerated value to a string representation or empty if not found.
static ESubtype SubtypeNameToValue(CTempString sName)
Turn a string into its ESubtype which is NOT necessarily related to the identifier of the enum.
static bool IsRegulatory(ESubtype subtype)
void SetGene(TGene &v)
static const vector< string > & GetRegulatoryClassList()
CSeqIdException –.
Definition: Seq_id.hpp:1001
bool IsFtable(void) const
Definition: Seq_annot.cpp:177
Definition: Seq_entry.hpp:56
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
const CGene_ref * GetGeneXref(void) const
See related function in util/feature.hpp.
Definition: Seq_feat.cpp:181
void SetGeneXref(CGene_ref &value)
Definition: Seq_feat.cpp:192
void SetProtXref(CProt_ref &value)
Definition: Seq_feat.cpp:233
void SetPartialStart(bool val, ESeqLocExtremes ext)
void SetPartialStop(bool val, ESeqLocExtremes ext)
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
void SetRightOf(bool val)
Definition: Seq_point.cpp:193
void SetPartialStart(bool val, ESeqLocExtremes ext)
Definition: Seq_point.cpp:100
void SetPartialStop(bool val, ESeqLocExtremes ext)
Definition: Seq_point.cpp:116
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
Definition: static_map.hpp:105
TBase::const_iterator const_iterator
Definition: static_map.hpp:109
Simple implementation of ILineReader for i(o)streams.
CStringException –.
Definition: ncbistr.hpp:4506
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
aa this carries
Definition: Trna_ext_.hpp:96
CUser_object & AddField(const string &label, const string &value, EParseField parse=eParse_String)
add a data field to the user object that holds a given value
@ eParse_Number
Parse a real or integer number, otherwise string.
Definition: User_object.hpp:62
virtual bool PutError(const ILineError &)=0
Store error in the container, and return true if error was stored fine, and return false if the calle...
virtual void PutProgress(const string &sMessage, const Uint8 iNumDone=0, const Uint8 iNumTotal=0)=0
This is used for processing progress messages.
virtual EDiagSev Severity(void) const
Definition: line_error.hpp:370
@ eProblem_InvalidQualifier
Definition: line_error.hpp:92
@ eProblem_QualifierBadValue
Definition: line_error.hpp:68
@ eProblem_NumericQualifierValueIsNotANumber
Definition: line_error.hpp:61
@ eProblem_InternalPartialsInFeatLocation
Definition: line_error.hpp:72
@ eProblem_FeatMustBeInXrefdGene
Definition: line_error.hpp:73
@ eProblem_UnrecognizedFeatureName
Definition: line_error.hpp:58
@ eProblem_FeatureNameNotAllowed
Definition: line_error.hpp:62
@ eProblem_DuplicateIDs
Definition: line_error.hpp:94
@ eProblem_IncompleteFeature
Definition: line_error.hpp:65
@ eProblem_QualifierWithoutFeature
Definition: line_error.hpp:64
@ eProblem_FeatureBadStartAndOrStop
Definition: line_error.hpp:66
@ eProblem_NumericQualifierValueHasExtraTrailingCharacters
Definition: line_error.hpp:60
@ eProblem_UnrecognizedSquareBracketCommand
Definition: line_error.hpp:75
@ eProblem_UnrecognizedQualifierName
Definition: line_error.hpp:59
@ eProblem_BadFeatureInterval
Definition: line_error.hpp:67
@ eProblem_DiscouragedFeatureName
Definition: line_error.hpp:90
@ eProblem_NoFeatureProvidedOnIntervals
Definition: line_error.hpp:63
@ eProblem_DiscouragedQualifierName
Definition: line_error.hpp:91
virtual const std::string & ErrorMessage(void) const
Definition: line_error.hpp:174
vector< unsigned int > TVecOfLines
Definition: line_error.hpp:128
virtual EProblem Problem(void) const =0
Abstract base class for lightweight line-by-line reading.
Definition: line_reader.hpp:54
Use to give a feature filter to CFeature_table_reader.
EAction
How a given feature name should be handled.
@ eAction_Okay
Just accept the feat.
@ eAction_Disallowed
Do not accept the feat and give message eProblem_FeatureNameNotAllowed.
virtual EAction GetFeatAction(const string &feature_name) const =0
Returns how we should treat the given feature name.
Definition: map.hpp:338
const_iterator_pair equal_range(const key_type &key) const
Definition: map.hpp:296
iterator insert(const value_type &val)
Definition: map.hpp:305
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator begin() const
Definition: set.hpp:135
const_iterator_pair equal_range(const key_type &key) const
Definition: set.hpp:140
bool empty() const
Definition: set.hpp:133
const_iterator end() const
Definition: set.hpp:136
void swap(this_type &m)
Definition: set.hpp:102
Include a standard set of the NCBI C++ Toolkit most basic headers.
static uch flags
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
static unsigned int line_num
Definition: attributes.c:11
static const char * str(char *buf, int n)
Definition: stats.c:84
int offset
Definition: replacements.h:160
static const char location[]
Definition: config.c:97
const TResidue codons[4][4]
Definition: gnomon_seq.cpp:76
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define ENTREZ_ID_FROM(T, value)
Definition: ncbimisc.hpp:1098
string
Definition: cgiapp.hpp:687
#define ERR_POST_X(err_subcode, message)
Error posting with default error code and given error subcode.
Definition: ncbidiag.hpp:550
EDiagSev
Severity level for the posted diagnostics.
Definition: ncbidiag.hpp:650
@ eDiag_Error
Error message.
Definition: ncbidiag.hpp:653
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
virtual void UngetLine(void)=0
Unget current line, which must be valid.
virtual Uint8 GetLineNumber(void) const =0
Returns the current line number (counting from 1, not 0).
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
static SIZE_TYPE ParseIDs(CBioseq::TId &ids, const CTempString &s, TParseFlags flags=fParse_Default)
Parse a string representing one or more Seq-ids, appending the results to IDS.
Definition: Seq_id.cpp:2613
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
Definition: Seq_id.cpp:2040
@ fParse_PartialOK
Warn rather than throwing an exception when a FASTA-style ID set contains unparsable portions,...
Definition: Seq_id.hpp:80
@ fParse_AnyLocal
Treat otherwise unidentified strings as local accessions as long as they don't resemble FASTA-style I...
Definition: Seq_id.hpp:90
@ fParse_Default
By default in ParseIDs and IsValid, allow raw parsable non-numeric accessions and plausible local acc...
Definition: Seq_id.hpp:102
@ fParse_ValidLocal
Treat otherwise unidentified strings as raw accessions, provided that they pass rudimentary validatio...
Definition: Seq_id.hpp:87
@ eFasta
Tagged ID in NCBI's traditional FASTA style.
Definition: Seq_id.hpp:607
bool IsPartialStart(ESeqLocExtremes ext) const
check start or stop of location for e_Lim fuzz
Definition: Seq_loc.cpp:3222
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
void SetPnt(TPnt &v)
Definition: Seq_loc.hpp:985
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
CRef< CSeq_loc > Subtract(const CSeq_loc &other, TOpFlags flags, ISynonymMapper *syn_mapper, ILengthGetter *len_getter) const
Subtract seq-loc from this, merge/sort resulting ranges depending on flags.
Definition: Seq_loc.cpp:5087
const CSeq_loc & GetEmbeddingSeq_loc(void) const
Get the nearest seq-loc containing the current range.
Definition: Seq_loc.cpp:2573
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
void SetNull(void)
Override all setters to incorporate cache invalidation.
Definition: Seq_loc.hpp:960
bool IsPartialStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:3251
@ fSortAndMerge_All
Definition: Seq_loc.hpp:334
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
TObjectType * GetPointerOrNull(void) const THROWS_NONE
Get pointer value.
Definition: ncbiobj.hpp:1672
bool IsNull(void) const THROWS_NONE
Check if pointer is null – same effect as Empty().
Definition: ncbiobj.hpp:735
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
unsigned char Uchar
Alias for unsigned char.
Definition: ncbitype.h:95
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
const_iterator end() const
Return an iterator to the string's ending position (one past the end of the represented sequence)
Definition: tempstr.hpp:306
#define kEmptyStr
Definition: ncbistr.hpp:123
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2993
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5430
CTempStringEx substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
Definition: tempstr.hpp:1010
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3201
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
Definition: tempstr.hpp:334
void clear(void)
Clear value to an empty string.
Definition: tempstr.hpp:1003
static long StringToLong(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to long.
Definition: ncbistr.cpp:653
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
size_t size_type
Definition: tempstr.hpp:70
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3554
size_type length(void) const
Return the length of the represented array.
Definition: tempstr.hpp:320
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
Definition: tempstr.hpp:776
size_type find_first_not_of(const CTempString match, size_type pos=0) const
Find the first occurrence of any character not in the matching string within the current string,...
Definition: tempstr.hpp:553
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
size_type find(const CTempString match, size_type pos=0) const
Find the first instance of the entire matching string within the current string, beginning at an opti...
Definition: tempstr.hpp:655
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3186
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:327
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
static const size_type npos
Definition: tempstr.hpp:72
const_iterator begin() const
Return an iterator to the string's starting position.
Definition: tempstr.hpp:299
@ fAllowTrailingSymbols
Ignore trailing non-numerics characters.
Definition: ncbistr.hpp:298
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
@ eTrunc_End
Truncate trailing spaces only.
Definition: ncbistr.hpp:2241
@ eTrunc_Begin
Truncate leading spaces only.
Definition: ncbistr.hpp:2240
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
void SetSubtype(TSubtype value)
Assign a value to Subtype data member.
Definition: SubSource_.hpp:319
list< CRef< CSubSource > > TSubtype
Definition: BioSource_.hpp:145
void SetGenome(TGenome value)
Assign a value to Genome data member.
Definition: BioSource_.hpp:428
void SetOrg(TOrg &value)
Assign a value to Org data member.
Definition: BioSource_.cpp:108
void SetName(const TName &value)
Assign a value to Name data member.
Definition: SubSource_.hpp:359
EGenome
biological context
Definition: BioSource_.hpp:97
TSubtype & SetSubtype(void)
Assign a value to Subtype data member.
Definition: BioSource_.hpp:545
@ eSubtype_collection_date
DD-MMM-YYYY format.
Definition: SubSource_.hpp:114
@ eSubtype_fwd_primer_seq
sequence (possibly more than one; semicolon-separated)
Definition: SubSource_.hpp:117
@ eSubtype_lat_lon
+/- decimal degrees
Definition: SubSource_.hpp:113
@ eSubtype_collected_by
name of person who collected the sample
Definition: SubSource_.hpp:115
@ eSubtype_rev_primer_seq
sequence (possibly more than one; semicolon-separated)
Definition: SubSource_.hpp:118
@ eSubtype_environmental_sample
Definition: SubSource_.hpp:111
@ eSubtype_endogenous_virus_name
Definition: SubSource_.hpp:109
@ eSubtype_identified_by
name of person who identified the sample
Definition: SubSource_.hpp:116
TSyn & SetSyn(void)
Assign a value to Syn data member.
Definition: Gene_ref_.hpp:774
void SetAllele(const TAllele &value)
Assign a value to Allele data member.
Definition: Gene_ref_.hpp:561
bool IsSetLocus_tag(void) const
systematic gene name (e.g., MI0001, ORF0069) Check if a value has been assigned to Locus_tag data mem...
Definition: Gene_ref_.hpp:781
bool IsSetLocus(void) const
Official gene symbol Check if a value has been assigned to Locus data member.
Definition: Gene_ref_.hpp:493
void SetLocus(const TLocus &value)
Assign a value to Locus data member.
Definition: Gene_ref_.hpp:514
void SetLocus_tag(const TLocus_tag &value)
Assign a value to Locus_tag data member.
Definition: Gene_ref_.hpp:802
void SetMaploc(const TMaploc &value)
Assign a value to Maploc data member.
Definition: Gene_ref_.hpp:655
const TLocus_tag & GetLocus_tag(void) const
Get the Locus_tag member data.
Definition: Gene_ref_.hpp:793
list< string > TSyn
Definition: Gene_ref_.hpp:102
const TLocus & GetLocus(void) const
Get the Locus member data.
Definition: Gene_ref_.hpp:505
void SetDesc(const TDesc &value)
Assign a value to Desc data member.
Definition: Gene_ref_.hpp:608
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
void SetTag(TTag &value)
Assign a value to Tag data member.
Definition: Dbtag_.cpp:66
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
TStr & SetStr(void)
Select the variant.
Definition: Object_id_.hpp:304
void SetType(TType &value)
Assign a value to Type data member.
void SetDb(const TDb &value)
Assign a value to Db data member.
Definition: Dbtag_.hpp:229
TId & SetId(void)
Select the variant.
Definition: Object_id_.hpp:277
void SetDiv(const TDiv &value)
Assign a value to Div data member.
Definition: OrgName_.hpp:1014
void SetSubtype(TSubtype value)
Assign a value to Subtype data member.
Definition: OrgMod_.hpp:316
void SetTaxname(const TTaxname &value)
Assign a value to Taxname data member.
Definition: Org_ref_.hpp:381
list< CRef< COrgMod > > TMod
Definition: OrgName_.hpp:332
void SetGcode(TGcode value)
Assign a value to Gcode data member.
Definition: OrgName_.hpp:927
void SetMgcode(TMgcode value)
Assign a value to Mgcode data member.
Definition: OrgName_.hpp:974
TMod & SetMod(void)
Assign a value to Mod data member.
Definition: OrgName_.hpp:845
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
Definition: Org_ref_.cpp:87
void SetSubname(const TSubname &value)
Assign a value to Subname data member.
Definition: OrgMod_.hpp:356
void SetLineage(const TLineage &value)
Assign a value to Lineage data member.
Definition: OrgName_.hpp:873
@ eSubtype_biotype
Definition: OrgMod_.hpp:97
@ eSubtype_subgroup
Definition: OrgMod_.hpp:99
@ eSubtype_gb_acronym
used by taxonomy database
Definition: OrgMod_.hpp:115
@ eSubtype_gb_synonym
used by taxonomy database
Definition: OrgMod_.hpp:117
@ eSubtype_substrain
Definition: OrgMod_.hpp:86
@ eSubtype_anamorph
Definition: OrgMod_.hpp:112
@ eSubtype_pathovar
Definition: OrgMod_.hpp:94
@ eSubtype_dosage
chromosome dosage of hybrid
Definition: OrgMod_.hpp:103
@ eSubtype_authority
Definition: OrgMod_.hpp:107
@ eSubtype_sub_species
Definition: OrgMod_.hpp:105
@ eSubtype_nat_host
natural host of this specimen
Definition: OrgMod_.hpp:104
@ eSubtype_cultivar
Definition: OrgMod_.hpp:93
@ eSubtype_variety
Definition: OrgMod_.hpp:89
@ eSubtype_strain
Definition: OrgMod_.hpp:85
@ eSubtype_metagenome_source
Definition: OrgMod_.hpp:120
@ eSubtype_biovar
Definition: OrgMod_.hpp:96
@ eSubtype_old_name
Definition: OrgMod_.hpp:124
@ eSubtype_subtype
Definition: OrgMod_.hpp:88
@ eSubtype_teleomorph
Definition: OrgMod_.hpp:113
@ eSubtype_serogroup
Definition: OrgMod_.hpp:91
@ eSubtype_synonym
Definition: OrgMod_.hpp:111
@ eSubtype_group
Definition: OrgMod_.hpp:98
@ eSubtype_type_material
Definition: OrgMod_.hpp:121
@ eSubtype_acronym
Definition: OrgMod_.hpp:102
@ eSubtype_specimen_voucher
Definition: OrgMod_.hpp:106
@ eSubtype_serotype
Definition: OrgMod_.hpp:90
@ eSubtype_chemovar
Definition: OrgMod_.hpp:95
@ eSubtype_serovar
Definition: OrgMod_.hpp:92
@ eSubtype_bio_material
Definition: OrgMod_.hpp:119
@ eSubtype_gb_anamorph
used by taxonomy database
Definition: OrgMod_.hpp:116
@ eSubtype_culture_collection
Definition: OrgMod_.hpp:118
@ eSubtype_ecotype
Definition: OrgMod_.hpp:110
@ eSubtype_forma_specialis
Definition: OrgMod_.hpp:109
@ eSubtype_old_lineage
Definition: OrgMod_.hpp:123
@ eSubtype_isolate
Definition: OrgMod_.hpp:100
TActivity & SetActivity(void)
Assign a value to Activity data member.
Definition: Prot_ref_.hpp:481
list< string > TName
Definition: Prot_ref_.hpp:108
TEc & SetEc(void)
Assign a value to Ec data member.
Definition: Prot_ref_.hpp:456
list< string > TActivity
Definition: Prot_ref_.hpp:111
void SetDesc(const TDesc &value)
Assign a value to Desc data member.
Definition: Prot_ref_.hpp:412
list< string > TEc
Definition: Prot_ref_.hpp:110
TProcessed GetProcessed(void) const
Get the Processed member data.
Definition: Prot_ref_.hpp:538
bool IsSetProcessed(void) const
Check if a value has been assigned to Processed data member.
Definition: Prot_ref_.hpp:513
void SetProcessed(TProcessed value)
Assign a value to Processed data member.
Definition: Prot_ref_.hpp:544
TName & SetName(void)
Assign a value to Name data member.
Definition: Prot_ref_.hpp:384
@ eProcessed_signal_peptide
Definition: Prot_ref_.hpp:99
@ eProcessed_transit_peptide
Definition: Prot_ref_.hpp:100
TPmid & SetPmid(void)
Select the variant.
Definition: Pub_.hpp:690
E_Choice
Choice variants.
Definition: RNA_ref_.hpp:132
void SetQual(const TQual &value)
Assign a value to Qual data member.
Definition: RNA_qual_.hpp:223
TTRNA & SetTRNA(void)
Select the variant.
Definition: RNA_ref_.cpp:140
TName & SetName(void)
Select the variant.
Definition: RNA_ref_.hpp:491
void SetVal(const TVal &value)
Assign a value to Val data member.
Definition: RNA_qual_.hpp:270
TType GetType(void) const
Get the Type member data.
Definition: RNA_ref_.hpp:529
void ResetAa(void)
Reset Aa data member.
Definition: Trna_ext_.cpp:130
E_Choice Which(void) const
Which variant is currently selected.
Definition: RNA_ref_.hpp:449
EType
type of RNA feature
Definition: RNA_ref_.hpp:95
void SetAnticodon(TAnticodon &value)
Assign a value to Anticodon data member.
Definition: Trna_ext_.cpp:158
void SetExt(TExt &value)
Assign a value to Ext data member.
Definition: RNA_ref_.cpp:211
bool IsSetExt(void) const
Generic fields for ncRNA, tmRNA, miscRNA. Check if a value has been assigned to Ext data member.
Definition: RNA_ref_.hpp:604
TCodon & SetCodon(void)
Assign a value to Codon data member.
Definition: Trna_ext_.hpp:630
void SetAa(TAa &value)
Assign a value to Aa data member.
Definition: Trna_ext_.cpp:135
void SetType(TType value)
Assign a value to Type data member.
Definition: RNA_ref_.hpp:538
const TExt & GetExt(void) const
Get the Ext member data.
Definition: RNA_ref_.hpp:616
@ e_Name
for naming "other" type
Definition: RNA_ref_.hpp:134
@ eType_scRNA
will become ncRNA, with RNA-gen.class = scRNA
Definition: RNA_ref_.hpp:102
@ eType_snoRNA
will become ncRNA, with RNA-gen.class = snoRNA
Definition: RNA_ref_.hpp:103
@ eType_ncRNA
non-coding RNA; subsumes snRNA, scRNA, snoRNA
Definition: RNA_ref_.hpp:104
@ eType_snRNA
will become ncRNA, with RNA-gen.class = snRNA
Definition: RNA_ref_.hpp:101
void SetQual(const TQual &value)
Assign a value to Qual data member.
Definition: Gb_qual_.hpp:221
vector< CRef< CDbtag > > TDbxref
Definition: Seq_feat_.hpp:123
TDbxref & SetDbxref(void)
Assign a value to Dbxref data member.
Definition: Seq_feat_.hpp:1339
bool IsSetData(void) const
The specific data. Check if a value has been assigned to Data data member.
Definition: Seq_feat_.hpp:913
E_Choice Which(void) const
Which variant is currently selected.
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
void SetComment(const TComment &value)
Assign a value to Comment data member.
Definition: Seq_feat_.hpp:1058
void SetPartial(TPartial value)
Assign a value to Partial data member.
Definition: Seq_feat_.hpp:971
void SetProduct(TProduct &value)
Assign a value to Product data member.
Definition: Seq_feat_.cpp:110
void SetCode(TCode &value)
Assign a value to Code data member.
Definition: Cdregion_.cpp:68
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
E_Choice
Choice variants.
void SetExcept(TExcept value)
Assign a value to Except data member.
Definition: Seq_feat_.hpp:1018
void SetExt(TExt &value)
Assign a value to Ext data member.
Definition: Seq_feat_.cpp:153
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
bool CanGetData(void) const
Check if it is safe to call GetData method.
Definition: Seq_feat_.hpp:919
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
void SetExp_ev(TExp_ev value)
Assign a value to Exp_ev data member.
Definition: Seq_feat_.hpp:1277
const TComment & GetComment(void) const
Get the Comment member data.
Definition: Seq_feat_.hpp:1049
void SetVal(const TVal &value)
Assign a value to Val data member.
Definition: Gb_qual_.hpp:268
void SetPseudo(TPseudo value)
Assign a value to Pseudo data member.
Definition: Seq_feat_.hpp:1374
const TGene & GetGene(void) const
Get the variant data.
void SetExcept_text(const TExcept_text &value)
Assign a value to Except_text data member.
Definition: Seq_feat_.hpp:1414
const TProt & GetProt(void) const
Get the variant data.
void ResetLocation(void)
Reset Location data member.
Definition: Seq_feat_.cpp:122
const TRna & GetRna(void) const
Get the variant data.
TQual & SetQual(void)
Assign a value to Qual data member.
Definition: Seq_feat_.hpp:1153
void SetFrame(TFrame value)
Assign a value to Frame data member.
Definition: Cdregion_.hpp:540
bool IsSetLocation(void) const
Feature made from. Check if a value has been assigned to Location data member.
Definition: Seq_feat_.hpp:1105
void SetKey(const TKey &value)
Assign a value to Key data member.
Definition: Imp_feat_.hpp:268
bool CanGetComment(void) const
Check if it is safe to call GetComment method.
Definition: Seq_feat_.hpp:1043
@ e_not_set
No variant selected.
@ e_Region
named region (globin locus)
@ e_Pub
publication applies to this seq
@ eExp_ev_experimental
any reasonable experimental check
Definition: Seq_feat_.hpp:102
@ eExp_ev_not_experimental
similarity, pattern, etc
Definition: Seq_feat_.hpp:103
@ eFrame_not_set
not set, code uses one
Definition: Cdregion_.hpp:95
@ eFrame_three
reading frame
Definition: Cdregion_.hpp:98
bool IsMix(void) const
Check if variant Mix is selected.
Definition: Seq_loc_.hpp:552
bool IsEmpty(void) const
Check if variant Empty is selected.
Definition: Seq_loc_.hpp:516
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
list< CRef< CSeq_loc > > Tdata
const Tdata & Get(void) const
Get the member data.
bool IsNull(void) const
Check if variant Null is selected.
Definition: Seq_loc_.hpp:504
const TMix & GetMix(void) const
Get the variant data.
Definition: Seq_loc_.cpp:282
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
@ e_MaxChoice
== e_Named_annot_track+1
Definition: Seq_id_.hpp:118
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
Tdata & Set(void)
Assign a value to data member.
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
void SetDesc(TDesc &value)
Assign a value to Desc data member.
Definition: Seq_annot_.cpp:223
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
Definition: Bioseq_.hpp:372
TName & SetName(void)
Select the variant.
Definition: Annotdesc_.hpp:508
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
Definition of all error codes used in objtools libraries.
int len
const TrnaAa taa[]
Definition: loadfeat.cpp:126
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
const char * tag
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
Multi-threading – classes, functions, and features.
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
static const char * prefix[]
Definition: pcregrep.c:405
SStaticPair< const char *, CFeatureTableReader_Imp::EOrgRef > TOrgRefKey
Definition: readfeat.cpp:617
static bool s_LineIndicatesOrder(const CTempString &line)
Definition: readfeat.cpp:1443
static const TOrgRefKey orgref_key_to_subtype[]
Definition: readfeat.cpp:619
DEFINE_STATIC_ARRAY_MAP(TQualMap, sm_QualKeys, qual_key_to_subtype)
CStaticPairArrayMap< const char *, CFeatureTableReader_Imp::EOrgRef, PCase_CStr > TOrgRefMap
Definition: readfeat.cpp:628
static const int s_NumQualsWithCaps
Definition: readfeat.cpp:2287
SStaticPair< const char *, CBioSource::EGenome > TGenomeKey
Definition: readfeat.cpp:632
static CRef< CSeq_loc > s_LocationJoinToOrder(const CSeq_loc &loc)
Definition: readfeat.cpp:1467
static string s_FixQualCapitalization(const string &qual)
Definition: readfeat.cpp:2289
static const map< const char *, int, PNocase_CStr > sm_TrnaKeys
Definition: readfeat.cpp:774
CStaticPairArrayMap< const char *, CFeatureTableReader_Imp::EQual, PCase_CStr > TQualMap
Definition: readfeat.cpp:613
CStaticPairArrayMap< const char *, CBioSource::EGenome, PCase_CStr > TGenomeMap
Definition: readfeat.cpp:667
static const TSubSrcKey subsrc_key_to_subtype[]
Definition: readfeat.cpp:673
static const TGenomeKey genome_key_to_subtype[]
Definition: readfeat.cpp:634
CStaticPairArrayMap< const char *, CSubSource::ESubtype, PNocase_CStr > TSubSrcNoCaseMap
Definition: readfeat.cpp:719
static set< const char *, PCase_CStr > sc_SingleKeys
Definition: readfeat.cpp:841
static const string s_QualsWithCaps[]
Definition: readfeat.cpp:2279
SStaticPair< const char *, CFeatureTableReader_Imp::EQual > TQualKey
Definition: readfeat.cpp:508
CStaticPairArrayMap< const char *, COrgMod::ESubtype, PCase_CStr > TOrgModMap
Definition: readfeat.cpp:770
CStaticPairArrayMap< const char *, CSubSource::ESubtype, PCase_CStr > TSubSrcMap
Definition: readfeat.cpp:715
SStaticPair< const char *, COrgMod::ESubtype > TOrgModKey
Definition: readfeat.cpp:723
SStaticPair< const char *, CSubSource::ESubtype > TSubSrcKey
Definition: readfeat.cpp:671
static const TQualKey qual_key_to_subtype[]
Definition: readfeat.cpp:510
static const TOrgModKey orgmod_key_to_subtype[]
Definition: readfeat.cpp:725
CConstRef< CSeq_id > GetBestId(const CBioseq &bioseq)
CRef< CSeq_loc > GetSeqLocFromString(const string &text, const CSeq_id *id, CGetSeqLocFromStringHelper *helper)
#define RAW_FIELD_IS_EMPTY_OR_UNSET(Var, Fld)
RAW_FIELD_IS_EMPTY_OR_UNSET macro.
#define FIELD_EQUALS(Var, Fld, Value)
FIELD_EQUALS base macro.
bool operator!=(const SFeatAndLineNum &rhs) const
Definition: readfeat.cpp:429
bool operator<(const SFeatAndLineNum &rhs) const
Definition: readfeat.cpp:431
int Compare(const SFeatAndLineNum &rhs) const
Definition: readfeat.cpp:434
bool operator==(const SFeatAndLineNum &rhs) const
Definition: readfeat.cpp:427
SFeatAndLineNum(TFeatConstRef pFeat, TSeqPos uLineNum)
Definition: readfeat.cpp:420
bool operator()(const CSeq_id *left, const CSeq_id *right) const
Definition: readfeat.cpp:3762
Template structure SStaticPair is a simplified replacement of STL pair<>. Main reason of introducing this...
Definition: static_set.hpp:60
Definition: inftrees.h:24
#define _ASSERT
else result
Definition: token2.c:20
#define ftable
Definition: utilfeat.h:37
Modified on Fri Apr 26 16:28:05 2024 by modify_doxy.py rev. 669887