NCBI C++ ToolKit
readfeat.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1  /*
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jonathan Kans, Michael Kornbluh
27  *
28  * File Description:
29  * Feature table reader
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbithr.hpp>
36 
37 #include <util/static_map.hpp>
38 
39 #include <serial/iterator.hpp>
40 #include <serial/objistrasn.hpp>
41 
42 // Objects includes
48 
53 
57 #include <objects/pub/Pub.hpp>
59 #include <objects/seq/Pubdesc.hpp>
62 
81 
83 
86 
89 #include <objtools/error_codes.hpp>
90 
91 #include <algorithm>
92 #include <unordered_set>
93 
96 #include "best_feat_finder.hpp"
97 
98 #define NCBI_USE_ERRCODE_X Objtools_Rd_Feature
99 
100 
102 
103 BEGIN_objects_SCOPE // namespace ncbi::objects::
104 
105 
106 
// File-local helpers: the CDS feature key, Seq-id ranking weights, the
// best-id picker that uses them, and the IUPAC nucleotide expansion table.
107 namespace {
// Feature key for coding regions; passed as the feature name when problems
// with CDS qualifiers are reported.
108  static const char * const kCdsFeatName = "CDS";
109  // priorities, inherited from C toolkit
// One weight per CSeq_id choice, indexed by the value returned by Which().
// The selection loop below keeps the id with the SMALLEST weight.
// NOTE(review): nothing visible here writes to this table, yet it is not
// declared const -- confirm no other code mutates it.
110  static Uchar std_order[CSeq_id::e_MaxChoice] = {
111  83, /* 0 = not set */
112  80, /* 1 = local Object-id */
113  70, /* 2 = gibbsq */
114  70, /* 3 = gibbmt */
115  70, /* 4 = giim Giimport-id */
116  60, /* 5 = genbank */
117  60, /* 6 = embl */
118  60, /* 7 = pir */
119  60, /* 8 = swissprot */
120  81, /* 9 = patent */
121  65, /* 10 = other TextSeqId */
122  80, /* 11 = general Dbtag */
123  82, /* 12 = gi */
124  60, /* 13 = ddbj */
125  60, /* 14 = prf */
126  60, /* 15 = pdb */
127  60, /* 16 = tpg */
128  60, /* 17 = tpe */
129  60, /* 18 = tpd */
130  68, /* 19 = gpp */
131  69 /* 20 = nat */
132  };
133 
// NOTE(review): the signature line of this function is missing from the
// listing (numbering jumps from 133 to 135).
135 {
// Picks one id out of `ids`:
//  - exactly one element: that element is returned as is;
//  - several elements: the one whose choice has the lowest std_order weight
//    (the first such element on ties, because the comparison is strict);
//  - no elements: a null CRef.
136  if (ids.size() == 1)
137  return ids.front();
138 
139  CRef<CSeq_id> id;
140  if (!ids.empty())
141  {
// Start above every table entry so that the first id always wins initially.
142  Uchar best_weight = UCHAR_MAX;
143  ITERATE(CBioseq::TId, it, ids)
144  {
145  Uchar new_weight = std_order[(*it)->Which()];
146  if (new_weight < best_weight)
147  {
148  id = *it;
149  best_weight = new_weight;
150  }
151  };
152  }
153 
154  return id;
155 }
156 
157 
// Expansion of IUPAC nucleotide codes: each key maps to the list of concrete
// bases the code stands for; single-base codes (A, G, C, T, U) map to
// themselves.
// NOTE(review): declared as a mutable global -- confirm whether callers rely
// on operator[] before making it const.
158 map<char, list<char>> s_IUPACmap
159 {
160  {'A', list<char>({'A'})},
161  {'G', list<char>({'G'})},
162  {'C', list<char>({'C'})},
163  {'T', list<char>({'T'})},
164  {'U', list<char>({'U'})},
165  {'M', list<char>({'A', 'C'})},
166  {'R', list<char>({'A', 'G'})},
167  {'W', list<char>({'A', 'T'})},
168  {'S', list<char>({'C', 'G'})},
169  {'Y', list<char>({'C', 'T'})},
170  {'K', list<char>({'G', 'T'})},
171  {'V', list<char>({'A', 'C', 'G'})},
172  {'H', list<char>({'A', 'C', 'T'})},
173  {'D', list<char>({'A', 'G', 'T'})},
174  {'B', list<char>({'C', 'G', 'T'})},
175  {'N', list<char>({'A', 'C', 'G', 'T'})}
176 };
177 
178 }
179 
180 
// Implementation class for the 5-column feature table reader.
// NOTE(review): this listing omits a number of declaration lines (the
// embedded numbering jumps, e.g. 184 -> 285 and 287 -> 294), so the enum
// bodies, several typedefs and some method names are not visible here.
181 class /* NCBI_XOBJREAD_EXPORT */ CFeatureTableReader_Imp
182 {
183 public:
// Qualifier identifiers (enumerators not visible in this listing).
184  enum EQual {
285  };
286 
// Org-ref level identifiers (enumerators not visible in this listing).
287  enum EOrgRef {
294  };
295 
298 
299  // constructor
300  CFeatureTableReader_Imp(ILineReader* reader, unsigned int line_num, ILineErrorListener* pMessageListener);
301  // destructor
303 
304  // read 5-column feature table and return Seq-annot
306  const CTempString& annotname,
307  const TFlags flags,
308  ITableFilter *filter);
309 
310  // create single feature from key
311  CRef<CSeq_feat> CreateSeqFeat (const string& feat,
313  const TFlags flags,
314  const string &seq_id,
315  ITableFilter *filter);
316 
317  // add single qualifier to feature
318  void AddFeatQual (CRef<CSeq_feat> sfp,
319  const string& feat_name,
320  const string& qual,
321  const string& val,
322  const TFlags flags,
323  const string &seq_id );
324 
// Static helpers: they take everything they need as arguments.
325  static bool ParseInitialFeatureLine (
326  const CTempString& line_arg,
327  CTempStringEx& out_seqid,
328  CTempStringEx& out_annotname );
329 
330  static void PutProgress(const CTempString& seq_id,
331  const unsigned int line_number,
332  ILineErrorListener* pListener);
333 
// Accessor returning m_reader (its declaration line is missing here).
335  return m_reader;
336  }
337 
// Accessor returning m_pMessageListener (its declaration line is missing here).
339  return m_pMessageListener;
340  }
341 
342 private:
343 
344  // Prohibit copy constructor and assignment operator
347 
348  void x_InitId(const CTempString& seq_id, const TFlags flags);
349  // returns true if parsed (otherwise, out_offset is left unchanged)
350  bool x_TryToParseOffset(const CTempString & sLine, Int4 & out_offset );
351 
352 
// Location fields taken from one table line. The table-line parser fills in
// start_pos, stop_pos, is_5p_partial, is_3p_partial, is_point and
// is_minus_strand; only is_point is visible in this listing.
353  struct SFeatLocInfo {
358  bool is_point;
360  };
361 
362 
// Table-line parser: outputs location info plus feature, qualifier and value
// strings; `offset` is added to parsed coordinates.
364  const CTempString& line,
365  SFeatLocInfo& loc_info,
366  string& feat,
367  string& qual,
368  string& val,
369  Int4 offset);
370 
371 
372  bool x_IsWebComment(CTempString line);
373 
375  CTempString strFeatureName,
376  CRef<CSeq_feat>& sfp,
377  const SFeatLocInfo& loc_info);
378 
380  const string &feat_name,
381  const string& qual, const string& val,
382  const TFlags flags);
383 
384  void x_ProcessQualifier(const string& qual_name,
385  const string& qual_val,
386  const string& feat_name,
387  CRef<CSeq_feat> feat,
388  TFlags flags);
389 
// Per-feature-type qualifier handlers. The two definitions visible further
// down return true when the qualifier was handled and false otherwise.
390  bool x_AddQualifierToGene (CSeqFeatData& sfdata,
391  EQual qtype, const string& val);
393  EQual qtype, const string& val);
395  EQual qtype, const string& val);
397  EQual qtype, const string& qual, const string& val);
398  bool x_AddQualifierToBioSrc (CSeqFeatData& sfdata,
399  const string &feat_name,
400  EOrgRef rtype, const string& val);
401  bool x_AddQualifierToBioSrc (CSeqFeatData& sfdata,
402  CSubSource::ESubtype stype, const string& val);
403  bool x_AddQualifierToBioSrc (CSeqFeatData& sfdata,
404  COrgMod::ESubtype mtype, const string& val);
405 
406  bool x_AddNoteToFeature(CRef<CSeq_feat> sfp, const string& note);
407 
409  const string& feat_name,
410  const string& qual,
411  const string& val);
412 
414  const string& qual, const string& val);
415 
416  bool x_AddCodons(const string& val, CTrna_ext& trna_ext) const;
417 
// Constructor of the feature-plus-line-number record (the struct header is
// missing from this listing); asserts that the feature reference is set.
421  TFeatConstRef pFeat,
422  TSeqPos uLineNum ) :
423  m_pFeat(pFeat), m_uLineNum(uLineNum) {
424  _ASSERT(pFeat);
425  }
426 
// All comparison operators delegate to Compare().
427  bool operator==(const SFeatAndLineNum & rhs) const {
428  return Compare(rhs) == 0; }
429  bool operator!=(const SFeatAndLineNum & rhs) const {
430  return Compare(rhs) != 0; }
431  bool operator<(const SFeatAndLineNum & rhs) const {
432  return Compare(rhs) < 0; }
433 
434  int Compare(const SFeatAndLineNum & rhs) const {
435  if( m_uLineNum != rhs.m_uLineNum ) {
436  return ( m_uLineNum < rhs.m_uLineNum ? -1 : 1 );
437  }
438  return (m_pFeat.GetPointerOrNull() < rhs.m_pFeat.GetPointerOrNull() ? -1 : 1 );
439  }
440 
// Data members of the feature-plus-line-number record.
441  TFeatConstRef m_pFeat; // must be non-NULL
442  TSeqPos m_uLineNum; // the line where this feature was created (or zero if programmatically created)
443  };
// NOTE(review): the name of the method owning the next three parameters is
// missing from this listing.
446  CRef<CSeq_annot> sap,
447  TChoiceToFeatMap & choiceToFeatMap, // an input param, but might get more items added
448  const TFlags flags);
449 
450  bool x_StringIsJustQuotes (const string& str);
451 
452  string x_TrnaToAaString(const string& val);
453 
454  bool x_ParseTrnaExtString(CTrna_ext & ext_trna, const string & str);
455  SIZE_TYPE x_MatchingParenPos( const string &str, SIZE_TYPE open_paren_pos );
456 
// Numeric conversion helper; the feature and qualifier names are passed so a
// problem can be attributed to them.
// NOTE(review): the last parameter (listing line 462) is not visible here.
457  long x_StringToLongNoThrow (
458  CTempString strToConvert,
459  CTempString strFeatureName,
460  CTempString strQualifierName,
461  // user can override the default problem types that are set on error
463  );
464 
465  bool x_SetupSeqFeat (CRef<CSeq_feat> sfp, const string& feat,
466  const TFlags flags,
467  ITableFilter *filter);
468 
// Message-reporting overloads (their name lines are missing from this
// listing); the second one additionally takes an explicit line number.
470  ILineError::EProblem eProblem,
471  EDiagSev eSeverity,
472  const std::string & strFeatureName = kEmptyStr,
473  const std::string & strQualifierName = kEmptyStr,
474  const std::string & strQualifierValue = kEmptyStr,
475  const std::string & strErrorMessage = kEmptyStr,
476  const ILineError::TVecOfLines & vecOfOtherLines =
478 
480  int line_num,
481  ILineError::EProblem eProblem,
482  EDiagSev eSeverity,
483  const std::string & strFeatureName = kEmptyStr,
484  const std::string & strQualifierName = kEmptyStr,
485  const std::string & strQualifierValue = kEmptyStr,
486  const std::string & strErrorMessage = kEmptyStr,
487  const ILineError::TVecOfLines & vecOfOtherLines =
489 
// Two tokenizers for a table line; both clear and then fill out_tokens.
490  void x_TokenizeStrict( const CTempString &line, vector<string> &out_tokens );
491  void x_TokenizeLenient( const CTempString &line, vector<string> &out_tokens );
493  void x_ResetFeat(CRef<CSeq_feat>& feat, bool & curr_feat_intervals_done);
494  void x_UpdatePointStrand(CSeq_feat& feat, CSeq_interval::TStrand strand) const;
495  void x_GetPointStrand(const CSeq_feat& feat, CSeq_interval::TStrand& strand) const;
496 
// Reader state. m_LineNumber is initialised from the constructor's line_num
// argument. NOTE(review): several member declarations are missing from this
// listing (numbering jumps 496 -> 498 -> 501 -> 503).
498  string m_real_seqid;
501  unsigned int m_LineNumber;
503  unordered_set<string> m_ProcessedTranscriptIds;
504  unordered_set<string> m_ProcessedProteinIds;
505 };
506 
507 
509 
// Qualifier name -> CFeatureTableReader_Imp::EQual rows.
// Some spellings are aliases for one enumerator: "codon_recognized" and
// "codons_recognized", "secondary_accession" and "secondary_accessions".
// NOTE(review): only a subset of the rows is visible in this listing (the
// embedded numbering jumps between entries).
510 static const TQualKey qual_key_to_subtype [] = {
512  { "PCR_conditions", CFeatureTableReader_Imp::eQual_PCR_conditions },
524  { "codon_recognized", CFeatureTableReader_Imp::eQual_codon_recognized },
526  { "codons_recognized", CFeatureTableReader_Imp::eQual_codon_recognized },
533  { "estimated_length", CFeatureTableReader_Imp::eQual_estimated_length },
544  { "gene_synonym", CFeatureTableReader_Imp::eQual_gene_syn },
552  { "linkage_evidence", CFeatureTableReader_Imp::eQual_linkage_evidence },
558  { "mobile_element_type", CFeatureTableReader_Imp::eQual_mobile_element_type },
577  { "regulatory_class", CFeatureTableReader_Imp::eQual_regulatory_class },
579  { "ribosomal_slippage", CFeatureTableReader_Imp::eQual_ribosomal_slippage },
583  { "rpt_unit_range", CFeatureTableReader_Imp::eQual_rpt_unit_range },
587  { "secondary_accession", CFeatureTableReader_Imp::eQual_secondary_accession },
588  { "secondary_accessions", CFeatureTableReader_Imp::eQual_secondary_accession },
602  { "trans_splicing", CFeatureTableReader_Imp::eQual_trans_splicing },
611 };
612 
615 
616 
618 
626 };
627 
630 
631 
633 
// Location name -> CBioSource genome enumerator rows.
// Prefixed spellings such as "plastid:chloroplast" and
// "mitochondrion:kinetoplast" resolve to the same enumerator as the bare
// organelle name ("chloroplast", "kinetoplast").
// NOTE(review): the declaration line of this array is missing from the
// listing.
635  { "apicoplast", CBioSource::eGenome_apicoplast },
636  { "chloroplast", CBioSource::eGenome_chloroplast },
637  { "chromatophore", CBioSource::eGenome_chromatophore },
638  { "chromoplast", CBioSource::eGenome_chromoplast },
639  { "chromosome", CBioSource::eGenome_chromosome },
640  { "cyanelle", CBioSource::eGenome_cyanelle },
641  { "endogenous_virus", CBioSource::eGenome_endogenous_virus },
642  { "extrachrom", CBioSource::eGenome_extrachrom },
643  { "genomic", CBioSource::eGenome_genomic },
644  { "hydrogenosome", CBioSource::eGenome_hydrogenosome },
645  { "insertion_seq", CBioSource::eGenome_insertion_seq },
646  { "kinetoplast", CBioSource::eGenome_kinetoplast },
647  { "leucoplast", CBioSource::eGenome_leucoplast },
648  { "macronuclear", CBioSource::eGenome_macronuclear },
649  { "mitochondrion", CBioSource::eGenome_mitochondrion },
650  { "mitochondrion:kinetoplast", CBioSource::eGenome_kinetoplast },
651  { "nucleomorph", CBioSource::eGenome_nucleomorph },
652  { "plasmid", CBioSource::eGenome_plasmid },
653  { "plastid", CBioSource::eGenome_plastid },
654  { "plastid:apicoplast", CBioSource::eGenome_apicoplast },
655  { "plastid:chloroplast", CBioSource::eGenome_chloroplast },
656  { "plastid:chromoplast", CBioSource::eGenome_chromoplast },
657  { "plastid:cyanelle", CBioSource::eGenome_cyanelle },
658  { "plastid:leucoplast", CBioSource::eGenome_leucoplast },
659  { "plastid:proplastid", CBioSource::eGenome_proplastid },
660  { "proplastid", CBioSource::eGenome_proplastid },
661  { "proviral", CBioSource::eGenome_proviral },
662  { "transposon", CBioSource::eGenome_transposon },
663  { "unknown", CBioSource::eGenome_unknown },
664  { "virion", CBioSource::eGenome_virion }
665 };
666 
669 
670 
672 
674  { "altitude", CSubSource::eSubtype_altitude },
675  { "cell_line", CSubSource::eSubtype_cell_line },
676  { "cell_type", CSubSource::eSubtype_cell_type },
677  { "chromosome", CSubSource::eSubtype_chromosome },
678  { "clone", CSubSource::eSubtype_clone },
679  { "clone_lib", CSubSource::eSubtype_clone_lib },
680  { "collected_by", CSubSource::eSubtype_collected_by },
681  { "collection_date", CSubSource::eSubtype_collection_date },
682  { "country", CSubSource::eSubtype_country },
683  { "dev_stage", CSubSource::eSubtype_dev_stage },
684  { "endogenous_virus", CSubSource::eSubtype_endogenous_virus_name },
685  { "environmental_sample", CSubSource::eSubtype_environmental_sample },
686  { "frequency", CSubSource::eSubtype_frequency },
687  { "fwd_primer_name", CSubSource::eSubtype_fwd_primer_name },
688  { "fwd_primer_seq", CSubSource::eSubtype_fwd_primer_seq },
689  { "genotype", CSubSource::eSubtype_genotype },
690  { "germline", CSubSource::eSubtype_germline },
691  { "haplotype", CSubSource::eSubtype_haplotype },
692  { "identified_by", CSubSource::eSubtype_identified_by },
693  { "insertion_seq", CSubSource::eSubtype_insertion_seq_name },
694  { "isolation_source", CSubSource::eSubtype_isolation_source },
695  { "lab_host", CSubSource::eSubtype_lab_host },
696  { "lat_lon", CSubSource::eSubtype_lat_lon },
697  { "map", CSubSource::eSubtype_map },
698  { "metagenomic", CSubSource::eSubtype_metagenomic },
699  { "plasmid", CSubSource::eSubtype_plasmid_name },
700  { "plastid", CSubSource::eSubtype_plastid_name },
701  { "pop_variant", CSubSource::eSubtype_pop_variant },
702  { "rearranged", CSubSource::eSubtype_rearranged },
703  { "rev_primer_name", CSubSource::eSubtype_rev_primer_name },
704  { "rev_primer_seq", CSubSource::eSubtype_rev_primer_seq },
705  { "segment", CSubSource::eSubtype_segment },
706  { "sex", CSubSource::eSubtype_sex },
707  { "subclone", CSubSource::eSubtype_subclone },
708  { "tissue_lib ", CSubSource::eSubtype_tissue_lib },
709  { "tissue_type", CSubSource::eSubtype_tissue_type },
710  { "transgenic", CSubSource::eSubtype_transgenic },
711  { "transposon", CSubSource::eSubtype_transposon_name }
712 };
713 
716 
717 // case-insensitive version of sm_SubSrcKeys
720  TSubSrcNoCaseMap, sm_SubSrcNoCaseKeys, subsrc_key_to_subtype);
721 
723 
// Qualifier name -> COrgMod subtype rows.
// "nat_host", "natural_host", "spec_host" and "specific_host" are four
// spellings that all map to COrgMod::eSubtype_nat_host.
// NOTE(review): the declaration line of this array is missing from the
// listing.
725  { "acronym", COrgMod::eSubtype_acronym },
726  { "anamorph", COrgMod::eSubtype_anamorph },
727  { "authority", COrgMod::eSubtype_authority },
728  { "bio_material", COrgMod::eSubtype_bio_material },
729  { "biotype", COrgMod::eSubtype_biotype },
730  { "biovar", COrgMod::eSubtype_biovar },
731  { "breed", COrgMod::eSubtype_breed },
732  { "chemovar", COrgMod::eSubtype_chemovar },
733  { "common", COrgMod::eSubtype_common },
734  { "cultivar", COrgMod::eSubtype_cultivar },
735  { "culture_collection", COrgMod::eSubtype_culture_collection },
736  { "dosage", COrgMod::eSubtype_dosage },
737  { "ecotype", COrgMod::eSubtype_ecotype },
738  { "forma", COrgMod::eSubtype_forma },
739  { "forma_specialis", COrgMod::eSubtype_forma_specialis },
740  { "gb_acronym", COrgMod::eSubtype_gb_acronym },
741  { "gb_anamorph", COrgMod::eSubtype_gb_anamorph },
742  { "gb_synonym", COrgMod::eSubtype_gb_synonym },
743  { "group", COrgMod::eSubtype_group },
744  { "isolate", COrgMod::eSubtype_isolate },
745  { "metagenome_source", COrgMod::eSubtype_metagenome_source },
746  { "nat_host", COrgMod::eSubtype_nat_host },
747  { "natural_host", COrgMod::eSubtype_nat_host },
748  { "old_lineage", COrgMod::eSubtype_old_lineage },
749  { "old_name", COrgMod::eSubtype_old_name },
750  { "pathovar", COrgMod::eSubtype_pathovar },
751  { "serogroup", COrgMod::eSubtype_serogroup },
752  { "serotype", COrgMod::eSubtype_serotype },
753  { "serovar", COrgMod::eSubtype_serovar },
754  { "spec_host", COrgMod::eSubtype_nat_host },
755  { "specific_host", COrgMod::eSubtype_nat_host },
756  { "specimen_voucher", COrgMod::eSubtype_specimen_voucher },
757  { "strain", COrgMod::eSubtype_strain },
758  { "sub_species", COrgMod::eSubtype_sub_species },
759  { "subgroup", COrgMod::eSubtype_subgroup },
760  { "substrain", COrgMod::eSubtype_substrain },
761  { "subtype", COrgMod::eSubtype_subtype },
762  { "synonym", COrgMod::eSubtype_synonym },
763  { "teleomorph", COrgMod::eSubtype_teleomorph },
764  { "type", COrgMod::eSubtype_type },
765  { "type_material", COrgMod::eSubtype_type_material },
766  { "variety", COrgMod::eSubtype_variety }
767 };
768 
771 
// Amino-acid name -> one-letter code rows. Three-letter abbreviations and
// full names are both accepted. "fMet" and "iMet" give 'M'; "TERM", "Ter"
// and "Termination" give '*'; "OTHER", "Xxx" and "Undet" give 'X'.
// NOTE(review): "Undet" is listed after "Xxx", so the rows are not in plain
// byte-wise ascending order. The declaration line is missing from this
// listing; if the array backs a sorted static map, verify that this order
// satisfies its comparator.
773 {
774  { "Ala", 'A' },
775  { "Alanine", 'A' },
776  { "Arg", 'R' },
777  { "Arginine", 'R' },
778  { "Asn", 'N' },
779  { "Asp", 'D' },
780  { "Asp or Asn", 'B' },
781  { "Asparagine", 'N' },
782  { "Aspartate", 'D' },
783  { "Aspartic Acid", 'D' },
784  { "Asx", 'B' },
785  { "Cys", 'C' },
786  { "Cysteine", 'C' },
787  { "Gln", 'Q' },
788  { "Glu", 'E' },
789  { "Glu or Gln", 'Z' },
790  { "Glutamate", 'E' },
791  { "Glutamic Acid", 'E' },
792  { "Glutamine", 'Q' },
793  { "Glx", 'Z' },
794  { "Gly", 'G' },
795  { "Glycine", 'G' },
796  { "His", 'H' },
797  { "Histidine", 'H' },
798  { "Ile", 'I' },
799  { "Ile2", 'I' },
800  { "Isoleucine", 'I' },
801  { "Leu", 'L' },
802  { "Leu or Ile", 'J' },
803  { "Leucine", 'L' },
804  { "Lys", 'K' },
805  { "Lysine", 'K' },
806  { "Met", 'M' },
807  { "Methionine", 'M' },
808  { "OTHER", 'X' },
809  { "Phe", 'F' },
810  { "Phenylalanine", 'F' },
811  { "Pro", 'P' },
812  { "Proline", 'P' },
813  { "Pyl", 'O' },
814  { "Pyrrolysine", 'O' },
815  { "Sec", 'U' },
816  { "Selenocysteine", 'U' },
817  { "Ser", 'S' },
818  { "Serine", 'S' },
819  { "TERM", '*' },
820  { "Ter", '*' },
821  { "Termination", '*' },
822  { "Thr", 'T' },
823  { "Threonine", 'T' },
824  { "Trp", 'W' },
825  { "Tryptophan", 'W' },
826  { "Tyr", 'Y' },
827  { "Tyrosine", 'Y' },
828  { "Val", 'V' },
829  { "Valine", 'V' },
830  { "Xle", 'J' },
831  { "Xxx", 'X' },
832  { "Undet", 'X' },
833  { "fMet", 'M' },
834  { "iMet", 'M' }
835 };
836 
837 
// List of qualifier names.
// NOTE(review): the declaration lines (listing lines 839-840) are missing,
// so the array name and element type are not visible. Presumably these are
// the qualifiers allowed to appear without a value -- confirm against the
// code that consults this list.
838 static
841  "environmental_sample",
842  "germline",
843  "metagenomic",
844  "partial",
845  "pseudo",
846  "rearranged",
847  "ribosomal_slippage",
848  "trans_splicing",
849  "transgenic",
850  "replace" // RW-882
851 };
852 
853 // constructor
// Stores the line reader, the starting line number and the message listener;
// the body does nothing else.
// NOTE(review): the signature line (listing line 854) is missing here.
855  : m_reader(reader), m_LineNumber(line_num), m_pMessageListener(pMessageListener)
856 {
857 }
858 
859 // destructor
// Empty body: nothing is released explicitly here.
// NOTE(review): the signature line (listing line 860) is missing here.
861 {
862 }
863 
864 
// Tries to read an "[offset=N]" instruction from sLine.
// Returns true and stores N in out_offset on success; returns false and
// leaves out_offset untouched when there is no "=", the key is not "offset"
// (compared case-insensitively), the value lacks a closing "]", or the value
// is not a valid integer.
// NOTE(review): the function-name line and listing lines 879, 883, 890 and
// 897 are missing from this listing.
866  const CTempString & sLine, Int4 & out_offset )
867 {
868  // offset strings are of the form [offset=SOME_NUMBER], but here we try
869  // to be as forgiving of whitespace as possible.
870 
871  CTempString sKey;
872  CTempString sValue;
873  if( ! NStr::SplitInTwo(sLine, "=", sKey, sValue) ) {
874  // "=" not found
875  return false;
876  }
877 
878  // check key
// The opening bracket is optional: it is stripped when present, but its
// absence is not an error.
880  if( NStr::StartsWith(sKey, "[") ) {
881  sKey = sKey.substr(1); // remove initial "["
882  }
884  if( ! NStr::EqualNocase(sKey, "offset") ) {
885  // key is not offset
886  return false;
887  }
888 
889  // check value
// Unlike the opening bracket, the closing bracket is mandatory.
891  if( ! NStr::EndsWith(sValue, "]") ) {
892  // no closing bracket
893  return false;
894  }
895  // remove closing bracket
896  sValue = sValue.substr(0, (sValue.length() - 1) );
898  // is it a number?
899  try {
900  Int4 new_offset = NStr::StringToInt(sValue);
// Negative offsets are accepted: the rejection below is commented out.
901  // if( new_offset < 0 ) {
902  // return false;
903  // }
904  out_offset = new_offset;
905  return true;
906  } catch ( CStringException & ) {
907  return false;
908  }
909 }
910 
// Parses one feature-table line into location info (loc_info) and the
// feature / qualifier / value strings (featP, qualP, valP).
//
// Returns false only for an empty line or a line starting with '['
// (bracketed instructions are handled elsewhere). Every other line returns
// true, even when a bad start/stop was reported through x_ProcessMsg.
//
// Coordinates in the table are 1-based; they are converted to 0-based and
// then shifted by `offset`. Positions that are missing or invalid come out
// as -1.
// NOTE(review): the function-name line and listing lines 978, 989 and 1020
// (trailing arguments of three calls) are missing from this listing.
912  const CTempString& line,
913  SFeatLocInfo& loc_info,
914  string& featP,
915  string& qualP,
916  string& valP,
917  Int4 offset
918 )
919 
920 {
921  SIZE_TYPE numtkns;
922  bool isminus = false;
923  bool ispoint = false;
924  size_t len;
925  bool partial5 = false;
926  bool partial3 = false;
927  Int4 startv = -1;
928  Int4 stopv = -1;
929  Int4 swp;
930  string start, stop, feat, qual, val, stnd;
931  vector<string> tkns;
932 
933 
934  if (line.empty ()) return false;
935 
936  /* offset and other instructions encoded in brackets */
937  if (NStr::StartsWith (line, '[')) return false;
938 
939  tkns.clear ();
940  x_TokenizeLenient(line, tkns);
941  numtkns = tkns.size ();
942 
// Column layout: 0 = start, 1 = stop, 2 = feature key, 3 = qualifier,
// 4 = qualifier value, 5 = strand. Absent columns stay empty strings.
943  if (numtkns > 0) {
944  start = NStr::TruncateSpaces(tkns[0]);
945  }
946  if (numtkns > 1) {
947  stop = NStr::TruncateSpaces(tkns[1]);
948  }
949  if (numtkns > 2) {
950  feat = NStr::TruncateSpaces(tkns[2]);
951  }
952  if (numtkns > 3) {
953  qual = NStr::TruncateSpaces(tkns[3]);
954  }
955  if (numtkns > 4) {
956  val = NStr::TruncateSpaces(tkns[4]);
957  // trim enclosing double-quotes
958  if( val.length() >= 2 && val[0] == '"' && val[val.length()-1] == '"' ) {
959  val = val.substr(1, val.length() - 2);
960  }
961  }
962  if (numtkns > 5) {
963  stnd = NStr::TruncateSpaces(tkns[5]);
964  }
965 
966  bool has_start = false;
967  if (! start.empty ()) {
// A leading '<' marks the 5' end as partial.
968  if (start [0] == '<') {
969  partial5 = true;
970  start.erase (0, 1);
971  }
972  len = start.length ();
// A trailing '^' marks a point (between-bases) location.
// NOTE(review): the '^' is overwritten with an embedded NUL rather than
// removed, so `start` keeps its length and now contains '\0'. Confirm the
// conversion helper tolerates that; otherwise erase the character instead.
973  if (len > 1 && start [len - 1] == '^') {
974  ispoint = true;
975  start [len - 1] = '\0';
976  }
// NOTE(review): the helper is declared to return long; the result is
// narrowed to Int4 here.
977  startv = x_StringToLongNoThrow(start, feat, qual,
979  has_start = true;
980  }
981 
982  bool has_stop = false;
983  if (! stop.empty ()) {
// A leading '>' marks the 3' end as partial.
984  if (stop [0] == '>') {
985  partial3 = true;
986  stop.erase (0, 1);
987  }
988  stopv = x_StringToLongNoThrow (stop, feat, qual,
990  has_stop = true;
991  }
992 
// Table coordinates are 1-based, so zero or negative values are invalid; if
// either end is invalid both are reset to -1.
993  if ( startv <= 0 || stopv <= 0 ) {
994  startv = -1;
995  stopv = -1;
996  } else {
// Convert to 0-based positions.
997  startv--;
998  stopv--;
999  if (! stnd.empty ()) {
1000  if (stnd == "minus" || stnd == "-" || stnd == "complement") {
// NOTE(review): `start` and `stop` are std::string, so this comparison is
// lexicographic ("9" < "10" is false, "100" < "20" is true). A numeric test
// on startv/stopv looks like what was intended -- confirm before changing.
1001  if (start < stop) {
1002  swp = startv;
1003  startv = stopv;
1004  stopv = swp;
1005  }
1006  isminus = true;
1007  }
1008  }
1009  }
1010 
// The offset is applied only to valid (non-negative) positions.
1011  if (startv >= 0) {
1012  startv += offset;
1013  }
1014  if (stopv >= 0) {
1015  stopv += offset;
1016  }
1017 
// A column that was present but did not yield a valid position is reported
// at error severity. Qualifier-only lines have both columns empty and so
// never trigger this.
1018  if ((has_start && startv < 0) || (has_stop && stopv < 0)) {
1019  x_ProcessMsg(
1021  eDiag_Error,
1022  feat);
1023  }
1024 
1025  loc_info.start_pos = ( startv < 0 ? -1 : startv);
1026  loc_info.stop_pos = ( stopv < 0 ? -1 : stopv);
1027 
1028  loc_info.is_5p_partial = partial5;
1029  loc_info.is_3p_partial = partial3;
1030  loc_info.is_point = ispoint;
1031  loc_info.is_minus_strand = isminus;
1032  featP = feat;
1033  qualP = qual;
1034  valP = val;
1035 
1036  return true;
1037 }
1038 
1039 /*
1040 bool CFeatureTableReader_Imp::x_ParseFeatureTableLine (
1041  const CTempString& line,
1042  Int4* startP,
1043  Int4* stopP,
1044  bool* partial5P,
1045  bool* partial3P,
1046  bool* ispointP,
1047  bool* isminusP,
1048  string& featP,
1049  string& qualP,
1050  string& valP,
1051  Int4 offset
1052 )
1053 
1054 {
1055  SIZE_TYPE numtkns;
1056  bool isminus = false;
1057  bool ispoint = false;
1058  size_t len;
1059  bool partial5 = false;
1060  bool partial3 = false;
1061  Int4 startv = -1;
1062  Int4 stopv = -1;
1063  Int4 swp;
1064  string start, stop, feat, qual, val, stnd;
1065  vector<string> tkns;
1066 
1067 
1068  if (line.empty ()) return false;
1069 
1070  if (NStr::StartsWith (line, '[')) return false;
1071 
1072  tkns.clear ();
1073  x_TokenizeLenient(line, tkns);
1074  numtkns = tkns.size ();
1075 
1076  if (numtkns > 0) {
1077  start = NStr::TruncateSpaces(tkns[0]);
1078  }
1079  if (numtkns > 1) {
1080  stop = NStr::TruncateSpaces(tkns[1]);
1081  }
1082  if (numtkns > 2) {
1083  feat = NStr::TruncateSpaces(tkns[2]);
1084  }
1085  if (numtkns > 3) {
1086  qual = NStr::TruncateSpaces(tkns[3]);
1087  }
1088  if (numtkns > 4) {
1089  val = NStr::TruncateSpaces(tkns[4]);
1090  // trim enclosing double-quotes
1091  if( val.length() >= 2 && val[0] == '"' && val[val.length()-1] == '"' ) {
1092  val = val.substr(1, val.length() - 2);
1093  }
1094  }
1095  if (numtkns > 5) {
1096  stnd = NStr::TruncateSpaces(tkns[5]);
1097  }
1098 
1099  bool has_start = false;
1100  if (! start.empty ()) {
1101  if (start [0] == '<') {
1102  partial5 = true;
1103  start.erase (0, 1);
1104  }
1105  len = start.length ();
1106  if (len > 1 && start [len - 1] == '^') {
1107  ispoint = true;
1108  start [len - 1] = '\0';
1109  }
1110  startv = x_StringToLongNoThrow(start, feat, qual,
1111  ILineError::eProblem_BadFeatureInterval);
1112  has_start = true;
1113  }
1114 
1115  bool has_stop = false;
1116  if (! stop.empty ()) {
1117  if (stop [0] == '>') {
1118  partial3 = true;
1119  stop.erase (0, 1);
1120  }
1121  stopv = x_StringToLongNoThrow (stop, feat, qual,
1122  ILineError::eProblem_BadFeatureInterval);
1123  has_stop = true;
1124  }
1125 
1126  if ( startv <= 0 || stopv <= 0 ) {
1127  startv = -1;
1128  stopv = -1;
1129  } else {
1130  startv--;
1131  stopv--;
1132  if (! stnd.empty ()) {
1133  if (stnd == "minus" || stnd == "-" || stnd == "complement") {
1134  if (start < stop) {
1135  swp = startv;
1136  startv = stopv;
1137  stopv = swp;
1138  }
1139  isminus = true;
1140  }
1141  }
1142  }
1143 
1144  if (startv >= 0) {
1145  startv += offset;
1146  }
1147  if (stopv >= 0) {
1148  stopv += offset;
1149  }
1150 
1151  if ((has_start && startv < 0) || (has_stop && stopv < 0)) {
1152  x_ProcessMsg(
1153  ILineError::eProblem_FeatureBadStartAndOrStop,
1154  eDiag_Error,
1155  feat);
1156  }
1157 
1158  *startP = ( startv < 0 ? -1 : startv);
1159  *stopP = ( stopv < 0 ? -1 : stopv);
1160 
1161  *partial5P = partial5;
1162  *partial3P = partial3;
1163  *ispointP = ispoint;
1164  *isminusP = isminus;
1165  featP = feat;
1166  qualP = qual;
1167  valP = val;
1168 
1169  return true;
1170 }
1171 */
1172 
// Tab-delimited tokenizer: clears out_tokens, then appends one token per
// tab-separated field of `line`. Blanks (' ') before a field are skipped and
// each token has surrounding spaces truncated. A field that is empty
// between two tabs yields an empty token; a trailing tab does not add a
// final empty token; if only blanks remain the function stops without
// adding one.
// NOTE(review): the function-name line is missing from this listing. This
// is presumably x_TokenizeStrict, since the other tokenizer below calls
// itself lenient -- confirm.
1174  const CTempString &line,
1175  vector<string> &out_tokens )
1176 {
1177  out_tokens.clear();
1178 
1179  // each token has spaces before it and a tab or end-of-line after it
1180  string::size_type startPosOfNextRoundOfTokenization = 0;
1181  while ( startPosOfNextRoundOfTokenization < line.size() ) {
1182  auto posAfterSpaces = line.find_first_not_of( " ", startPosOfNextRoundOfTokenization );
1183  if( posAfterSpaces == string::npos ) {
1184  return;
1185  }
1186 
1187  string::size_type posOfTab = line.find( '\t', posAfterSpaces );
1188  if( posOfTab == string::npos ) {
// No further tab: the last token runs to the end of the line.
1189  posOfTab = line.length();
1190  }
1191 
1192  // The next token is between the spaces and the tab (or end of string)
1193  out_tokens.push_back(kEmptyStr);
1194  string &new_token = out_tokens.back();
1195  copy( line.begin() + posAfterSpaces, line.begin() + posOfTab, back_inserter(new_token) );
1196  NStr::TruncateSpacesInPlace( new_token );
1197 
// Resume just past the tab; this also ends the loop after the last token.
1198  startPosOfNextRoundOfTokenization = ( posOfTab + 1 );
1199  }
1200 }
1201 
// Predicate for find_if: true when the character is whitespace.
// A functor is used since some compilers won't accept isspace directly as a
// find_if predicate.
class CIsSpace {
public:
    // The character is converted to unsigned char first: passing a negative
    // value (any plain char above 0x7F where char is signed) to isspace() is
    // undefined behaviour. The call operator is const so that the predicate
    // can also be invoked through a const object.
    bool operator()( char c ) const {
        return isspace(static_cast<unsigned char>(c)) != 0;
    }
};
1207 
1209 public:
1210  bool operator()( char c ) { return ! isspace(c); }
1211 };
1212 
// Whitespace-delimited tokenizer: clears out_tokens, then fills it so that
// the column positions match the tab-delimited layout.
//  - A line starting with whitespace is a qualifier line: tokens 0-2 are
//    empty, token 3 is the qualifier and token 4 (if present) is everything
//    from the first non-space character after the qualifier to end of line.
//  - Any other non-empty line is a feature line: tokens 0-2 are the first
//    three whitespace-separated words; if a fourth word exists, tokens 3 and
//    4 are left empty and it becomes token 5.
// NOTE(review): the function-name line and listing line 1250 are missing
// from this listing.
1214  const CTempString &line,
1215  vector<string> &out_tokens )
1216 {
1217  out_tokens.clear();
1218 
1219  if( line.empty() ) {
1220  return;
1221  }
1222 
1223  // if it starts with whitespace, it must be a qual line, else it's a feature line
// NOTE(review): isspace() receives a plain char here; a negative value is
// undefined behaviour -- consider casting to unsigned char.
1224  if( isspace(line[0]) ) {
1225  // In regex form, we're doing something like this:
1226  // \s+(\S+)(\s+(\S.*))?
1227  // Where the first is the qual, and the rest is the val
1228  auto start_of_qual = find_if( line.begin(), line.end(), CIsNotSpace() );
1229  if( start_of_qual == line.end() ) {
// The line is whitespace only: no tokens are produced.
1230  return;
1231  }
1232  auto start_of_whitespace_after_qual = find_if( start_of_qual, line.end(), CIsSpace() );
1233  auto start_of_val = find_if( start_of_whitespace_after_qual, line.end(), CIsNotSpace() );
1234 
1235  // first 3 are empty
1236  out_tokens.push_back(kEmptyStr);
1237  out_tokens.push_back(kEmptyStr);
1238  out_tokens.push_back(kEmptyStr);
1239 
1240  // then qual
1241  out_tokens.push_back(kEmptyStr);
1242  string &qual = out_tokens.back();
1243  copy( start_of_qual, start_of_whitespace_after_qual, back_inserter(qual) );
1244 
1245  // then val
1246  if( start_of_val != line.end() ) {
1247  out_tokens.push_back(kEmptyStr);
1248  string &val = out_tokens.back();
1249  copy( start_of_val, line.end(), back_inserter(val) );
1251  }
1252 
1253  } else {
1254  // parse a feature line
1255 
1256  // Since we're being lenient, we consider it to be 3 ( or 6 ) parts separated by whitespace
// Each find_if continues from where the previous one stopped, so once
// line.end() is reached all later iterators equal line.end() and the
// corresponding tokens are empty.
1257  auto first_column_start = line.begin();
1258  auto first_whitespace = find_if( first_column_start, line.end(), CIsSpace() );
1259  auto second_column_start = find_if( first_whitespace, line.end(), CIsNotSpace() );
1260  auto second_whitespace = find_if( second_column_start, line.end(), CIsSpace() );
1261  auto third_column_start = find_if( second_whitespace, line.end(), CIsNotSpace() );
1262  auto third_whitespace = find_if( third_column_start, line.end(), CIsSpace() );
1263  // columns 4 and 5 are unused on feature lines
1264  auto sixth_column_start = find_if( third_whitespace, line.end(), CIsNotSpace() );
1265  auto sixth_whitespace = find_if( sixth_column_start, line.end(), CIsSpace() );
1266 
1267  out_tokens.push_back(kEmptyStr);
1268  string &first = out_tokens.back();
1269  copy( first_column_start, first_whitespace, back_inserter(first) );
1270 
1271  out_tokens.push_back(kEmptyStr);
1272  string &second = out_tokens.back();
1273  copy( second_column_start, second_whitespace, back_inserter(second) );
1274 
1275  out_tokens.push_back(kEmptyStr);
1276  string &third = out_tokens.back();
1277  copy( third_column_start, third_whitespace, back_inserter(third) );
1278 
1279  if( sixth_column_start != line.end() ) {
1280  // columns 4 and 5 are unused
1281  out_tokens.push_back(kEmptyStr);
1282  out_tokens.push_back(kEmptyStr);
1283 
1284  out_tokens.push_back(kEmptyStr);
1285  string &sixth = out_tokens.back();
1286  copy( sixth_column_start, sixth_whitespace, back_inserter(sixth) );
1287  }
1288  }
1289 }
1290 
1291 
// Applies one qualifier to the gene reference of sfdata.
// Returns true when qtype is a qualifier handled here, false otherwise.
// SetGene() is called before qtype is examined, so it runs even when the
// function ends up returning false.
// NOTE(review): the function-name line is missing from this listing; the
// parameters match the x_AddQualifierToGene declaration in the class.
1293  CSeqFeatData& sfdata,
1294  EQual qtype,
1295  const string& val
1296 )
1297 
1298 {
1299  CGene_ref& grp = sfdata.SetGene ();
1300  switch (qtype) {
1301  case eQual_gene:
1302  grp.SetLocus (val);
1303  return true;
1304  case eQual_allele:
1305  grp.SetAllele (val);
1306  return true;
1307  case eQual_gene_desc:
1308  grp.SetDesc (val);
1309  return true;
1310  case eQual_gene_syn:
1311  {
// Synonyms accumulate: each call appends to the list.
1312  CGene_ref::TSyn& syn = grp.SetSyn ();
1313  syn.push_back (val);
1314  return true;
1315  }
1316  case eQual_map:
1317  grp.SetMaploc (val);
1318  return true;
1319  case eQual_locus_tag:
1320  grp.SetLocus_tag (val);
1321  return true;
1322  case eQual_nomenclature:
// Accepted but currently dropped: the value is not stored anywhere.
1323  /* !!! need to implement !!! */
1324  return true;
1325  default:
1326  break;
1327  }
1328  return false;
1329 }
1330 
1331 
// Applies one qualifier to a coding-region feature.
// Returns true when qtype is a qualifier handled here, false otherwise.
// SetCdregion() is called before qtype is examined. Protein-level
// qualifiers (EC_number, function, product, prot_desc) are stored on the
// feature's protein cross-reference (SetProtXref), not on the Cdregion.
// NOTE(review): the function-name line and several body lines (listing
// lines 1346, 1349, 1352, 1355, 1403, 1415) are missing from this listing.
1333  CRef<CSeq_feat> sfp,
1334  CSeqFeatData& sfdata,
1335  EQual qtype, const string& val
1336 )
1337 
1338 {
1339  CCdregion& crp = sfdata.SetCdregion ();
1340  switch (qtype) {
1341  case eQual_codon_start:
1342  {
// NOTE(review): the helper is declared to return long; narrowed to int here.
1343  int frame = x_StringToLongNoThrow (val, kCdsFeatName, "codon_start");
// The statements of cases 0-3 are not visible in this listing. Any other
// value falls to `default` and is ignored, yet true is still returned.
1344  switch (frame) {
1345  case 0:
1347  break;
1348  case 1:
1350  break;
1351  case 2:
1353  break;
1354  case 3:
1356  break;
1357  default:
1358  break;
1359  }
1360  return true;
1361  }
1362  case eQual_EC_number:
1363  {
1364  CProt_ref& prp = sfp->SetProtXref ();
1365  CProt_ref::TEc& ec = prp.SetEc ();
1366  ec.push_back (val);
1367  return true;
1368  }
1369  case eQual_function:
1370  {
1371  CProt_ref& prp = sfp->SetProtXref ();
1372  CProt_ref::TActivity& fun = prp.SetActivity ();
1373  fun.push_back (val);
1374  return true;
1375  }
1376  case eQual_product:
1377  {
1378  CProt_ref& prp = sfp->SetProtXref ();
1379  CProt_ref::TName& prod = prp.SetName ();
1380  prod.push_back (val);
1381  return true;
1382  }
1383  case eQual_prot_desc:
1384  {
1385  CProt_ref& prp = sfp->SetProtXref ();
1386  prp.SetDesc (val);
1387  return true;
1388  }
1389  case eQual_prot_note:
1390  return x_AddGBQualToFeature(sfp, "prot_note", val);
1391  case eQual_transl_except:
1392  // add as GBQual, let cleanup convert to code_break
1393  return x_AddGBQualToFeature(sfp, "transl_except", val);
1394  case eQual_translation:
1395  // we should accept, but ignore this qual on CDSs.
1396  // so, do nothing but return success
1397  return true;
1398  case eQual_transl_table:
1399  // set genetic code directly, or add qualifier and let cleanup convert?
1400  try {
// NOTE(review): StringToLong's result is narrowed to int.
1401  int num = NStr::StringToLong(val);
1402  CGen_code_table::GetTransTable(num); // throws if bad num
1404  code->SetId(num);
1405  crp.SetCode().Set().push_back(code);
1406  return true;
// NOTE(review): the exception is caught by value; catching by const
// reference would avoid a copy and slicing.
1407  } catch( CStringException ) {
1408  // if val is not a number, add qualifier directly and
1409  // let cleanup convert?
1410  return x_AddGBQualToFeature(sfp, "transl_table", val);
1411  } catch( ... ) {
1412  // invalid genome code table so don't even try to make
1413  // the transl_table qual
1414  x_ProcessMsg(
1416  kCdsFeatName, "transl_table", val);
1417  return true;
1418  }
// Not reached: the try block and both handlers return.
1419  break;
1420 
1421  default:
1422  break;
1423  }
1424  return false;
1425 }
1426 
1427 
// Returns true when `str` contains nothing but quote characters (" or ')
// and characters that do not compare greater than ' ' (space and the
// control characters below it). An empty string also returns true, because
// the loop body never runs.
//
// Returns false as soon as any other character is seen.
//
// NOTE(review): `ch` is a plain char. Where char is signed, bytes >= 0x80
// are negative, fail the `ch > ' '` test, and are therefore treated like
// whitespace -- confirm that this is intended for non-ASCII input.
// NOTE(review): listing line 1428 (return type and function name) is absent
// from this extract; a function called x_StringIsJustQuotes (val) is used
// elsewhere in this file with a matching argument.
1429  const string& str
1430 )
1431 
1432 {
1433  ITERATE (string, it, str) {
1434  char ch = *it;
1435  if (ch > ' ' && ch != '"' && ch != '\'') return false;
1436  }
1437 
1438  return true;
1439 }
1440 
// Tests whether `line`, after skipping leading whitespace, begins with the
// word "order". The comparison is done with NStr::CompareNocase over
// strlen("ORDER") characters, so it is case-insensitive.
//
// Returns false for an empty or all-whitespace line.
//
// NOTE(review): listing line 1442 (the function name and its parameter list,
// which declares `line`) is absent from this extract.
// NOTE(review): isspace() is given a plain char; for negative char values the
// behavior is undefined, so a cast to unsigned char may be wanted -- confirm.
1441 static bool
1443 {
1444  // basically, this is true if the line starts with "order" (whitespaces disregarded)
1445 
1446  const static char* kOrder = "ORDER";
1447 
1448  // find first non-whitespace character
1449  string::size_type pos = 0;
1450  for( ; pos < line.length() && isspace(line[pos]); ++pos) {
1451  // nothing to do here
1452  }
1453 
1454  // line is all whitespace
1455  if( pos >= line.length() ) {
1456  return false;
1457  }
1458 
1459  // check if starts with "order" after whitespace
1460  return ( 0 == NStr::CompareNocase( line, pos, strlen(kOrder), kOrder ) );
1461 }
1462 
// Builds a "mix" location out of the pieces of `loc`, with a NULL location
// inserted between every two consecutive pieces.
//
// Each piece is a fresh CSeq_loc assigned from the iterator's embedding
// Seq-loc, so the result does not share piece objects with `loc`. The NULL
// separators, in contrast, are all the same CRef (loc_piece_null) pushed
// repeatedly.
//
// NOTE(review): listing line 1466 (function name and parameter list, which
// declares `loc`) and line 1469 (the declaration of `result`) are absent from
// this extract.
1463 // Turns a "join" location into an "order" by putting nulls between it
1464 // Returns an unset CRef if the loc doesn't need nulls (e.g. if it's just an interval)
1465 static CRef<CSeq_loc>
1467 {
1468  // create result we're returning
1470  CSeq_loc_mix::Tdata & mix_pieces = result->SetMix().Set();
1471 
1472  // keep this around for whenever we need a "null" piece
1473  CRef<CSeq_loc> loc_piece_null( new CSeq_loc );
1474  loc_piece_null->SetNull();
1475 
1476  // push pieces of source, with NULLs between
1477  CSeq_loc_CI loc_iter( loc );
1478  for( ; loc_iter; ++loc_iter ) {
// a separator goes in front of every piece except the first
1479  if( ! mix_pieces.empty() ) {
1480  mix_pieces.push_back( loc_piece_null );
1481  }
1482  CRef<CSeq_loc> new_piece( new CSeq_loc );
1483  new_piece->Assign( loc_iter.GetEmbeddingSeq_loc() );
1484  mix_pieces.push_back( new_piece );
1485  }
1486 
1487  // Only wrap in "mix" if there was more than one piece
1488  if( mix_pieces.size() > 1 ) {
1489  return result;
1490  } else {
1491  return CRef<CSeq_loc>();
1492  }
1493 }
1494 
1495 
// Reduces a tRNA product string to the amino-acid token used for lookup.
//
// Visible steps, applied to `value`:
//   1. a leading "tRNA-" is dropped;
//   2. the text is cut at the first occurrence of any of  - , ; : ( ) = ' _ ~
//   3. the remainder is returned as a string.
//
// NOTE(review): listing line 1496 (return type and function name), line 1500
// (the declaration of `value`, presumably initialized from `val` -- confirm)
// and line 1509 (a statement that runs after the erase) are absent from this
// extract, so the full behavior cannot be read here.
1497  const string& val
1498 )
1499 {
1501 
1502  if (NStr::StartsWith(value, "tRNA-")) {
1503  value.assign(value, strlen("tRNA-"), CTempString::npos);
1504  }
1505 
1506  CTempString::size_type pos = value.find_first_of("-,;:()=\'_~");
1507  if (pos != CTempString::npos) {
1508  value.erase(pos);
1510  }
1511 
1512  return string(value);
1513 }
1514 
1515 
// Parses an anticodon qualifier value of the form
//     (pos:<location>[,aa:<abbrev>][,seq:<...>])
// into `ext_trna`.
//
// Steps:
//   * all whitespace is removed from a copy of `str`;
//   * the copy must start with "(pos:" and have a matching ')' (found with
//     x_MatchingParenPos), otherwise false is returned;
//   * if the text inside the parentheses contains "aa:" (case-insensitive),
//     the abbreviation after it -- up to ",seq:" if that follows, otherwise to
//     the end -- is looked up in sm_TrnaKeys and stored as the amino acid;
//     an unknown abbreviation, or a ",seq:" that starts before the end of
//     "aa:", returns false;
//   * the text in front of "aa:" (minus a trailing comma) is parsed as a
//     location with GetSeqLocFromString and stored as the anticodon.
//
// Returns true only when the anticodon was stored. When location parsing
// fails, or the location's strand is not unknown/plus/minus, any amino acid
// set on `ext_trna` is reset and false is returned.
//
// NOTE(review): listing line 1517 (function name and parameter list, which
// declares `ext_trna` and `str`), line 1552 (the declaration of `aa`) and
// line 1561 (the declaration of `helper`) are absent from this extract.
// NOTE(review): when "aa:" is absent, pos_str is passed to
// GetSeqLocFromString unchanged, including any ",seq:" part -- confirm that
// the location parser copes with that.
1516 bool
1518 {
1519  if (NStr::IsBlank (str)) return false;
1520 
1521  string normalized_string = str;
1522  normalized_string.erase(
1523  remove_if(begin(normalized_string),
1524  end(normalized_string),
1525  [](char c) { return isspace(c);}),
1526  end(normalized_string));
1527 
1528  if ( NStr::StartsWith(normalized_string, "(pos:") ) {
1529  // find position of closing paren
1530  string::size_type pos_end = x_MatchingParenPos( normalized_string, 0 );
1531  if (pos_end != string::npos) {
// 5 == strlen("(pos:"); pos_str is the text between "(pos:" and the ')'
1532  string pos_str = normalized_string.substr (5, pos_end - 5);
1533  string::size_type aa_start = NStr::FindNoCase(pos_str, "aa:");
1534  if (aa_start != string::npos) {
1535  auto seq_start = NStr::FindNoCase(pos_str, ",seq:");
1536  if (seq_start != string::npos &&
1537  seq_start < aa_start+3) {
1538  return false;
1539  }
1540 
// 3 == strlen("aa:"); the abbreviation runs to ",seq:" or to the end
1541  size_t aa_length = (seq_start == NPOS) ?
1542  pos_str.size() - (aa_start+3) :
1543  seq_start - (aa_start+3);
1544 
1545  string abbrev = pos_str.substr (aa_start + 3, aa_length);
1546  //TTrnaMap::const_iterator
1547  auto t_iter = sm_TrnaKeys.find (abbrev.c_str ());
1548  if (t_iter == sm_TrnaKeys.end ()) {
1549  // unable to parse
1550  return false;
1551  }
1553  aa->SetNcbieaa (t_iter->second);
1554  ext_trna.SetAa(*aa);
1555  pos_str = pos_str.substr (0, aa_start);
1556  NStr::TruncateSpacesInPlace (pos_str);
1557  if (NStr::EndsWith (pos_str, ",")) {
1558  pos_str = pos_str.substr (0, pos_str.length() - 1);
1559  }
1560  }
1562  CRef<CSeq_loc> anticodon = GetSeqLocFromString (pos_str, m_seq_id, & helper);
1563  if (! anticodon) {
1564  ext_trna.ResetAa();
1565  return false;
1566  } else {
1567  switch( anticodon->GetStrand() ) {
1568  case eNa_strand_unknown:
1569  case eNa_strand_plus:
1570  case eNa_strand_minus:
1571  ext_trna.SetAnticodon(*anticodon);
1572  return true;
1573  default:
1574  ext_trna.ResetAa();
1575  return false;
1576  }
1577  }
1578  }
1579  }
1580 
1581  return false;
1582 }
1583 
1584 
// Finds the ')' that closes the '(' located at str[open_paren_pos].
//
// Scanning starts right after open_paren_pos with a nesting level of 1; each
// '(' raises the level, each ')' lowers it, and the position at which the
// level reaches 0 is returned. Returns NPOS when the string ends first.
// Parentheses inside quotes are not treated specially.
//
// Precondition (checked with _ASSERT only): open_paren_pos is a valid index
// and the character there is '('.
//
// NOTE(review): the first _ASSERT indexes str[open_paren_pos] before the
// second one verifies that open_paren_pos is in range; swapping the two
// would check the bound first.
// NOTE(review): listing line 1585 (return type and function name) is absent
// from this extract; a function called x_MatchingParenPos is used elsewhere
// in this file with matching arguments.
1586  const string &str, SIZE_TYPE open_paren_pos )
1587 {
1588  _ASSERT( str[open_paren_pos] == '(' );
1589  _ASSERT( open_paren_pos < str.length() );
1590 
1591  // nesting level. start at 1 since we know there's an open paren
1592  int level = 1;
1593 
1594  SIZE_TYPE pos = open_paren_pos + 1;
1595  for( ; pos < str.length(); ++pos ) {
1596  switch( str[pos] ) {
1597  case '(':
1598  // nesting deeper
1599  ++level;
1600  break;
1601  case ')':
1602  // closed a level of nesting
1603  --level;
1604  if( 0 == level ) {
1605  // reached the top: we're closing the initial paren,
1606  // so we return our position
1607  return pos;
1608  }
1609  break;
1610  default:
1611  // ignore other characters.
1612  // maybe in the future we'll handle ignoring parens in quotes or
1613  // things like that.
1614  break;
1615  }
1616  }
1617  return NPOS;
1618 }
1619 
// Converts `strToConvert` to an integer without letting a conversion
// exception escape.
//
//   strToConvert     - text to convert
//   strFeatureName   - feature name passed on to x_ProcessMsg
//   strQualifierName - qualifier name passed on to x_ProcessMsg
//   eProblem         - when not ILineError::eProblem_Unset, replaces the
//                      problem code that would otherwise be reported
//
// Behavior:
//   1. NStr::StringToLong succeeds: its value is returned, nothing reported.
//   2. It throws and the text starts with a digit: a second attempt is made
//      (see note); on success a warning is reported and `result` returned.
//   3. Otherwise: a warning is reported and 0 is returned.
//
// NOTE(review): listing line 1620 (return type and function name), line 1633
// (the declaration and computation of `result`) and lines 1636 and 1650 (the
// initializers of the two `problem` variables) are absent from this extract,
// so the retry logic and the default problem codes cannot be read here.
1621  CTempString strToConvert,
1622  CTempString strFeatureName,
1623  CTempString strQualifierName,
1624  ILineError::EProblem eProblem
1625 )
1626 {
1627  try {
1628  return NStr::StringToLong(strToConvert);
1629  } catch( ... ) {
1630  // See if we start with a number, but there's extra junk after it, try again
1631  if( ! strToConvert.empty() && isdigit(strToConvert[0]) ) {
1632  try {
1634 
1635  ILineError::EProblem problem =
1637  if( eProblem != ILineError::eProblem_Unset ) {
1638  problem = eProblem;
1639  }
1640 
1641  x_ProcessMsg(
1642  problem,
1643  eDiag_Warning,
1644  strFeatureName, strQualifierName, strToConvert );
1645  return result;
1646  } catch( ... ) { } // fall-thru to usual handling
1647  }
1648 
1649  ILineError::EProblem problem =
1651  if( eProblem != ILineError::eProblem_Unset ) {
1652  problem = eProblem;
1653  }
1654 
1655  x_ProcessMsg(
1656  problem,
1657  eDiag_Warning,
1658  strFeatureName, strQualifierName, strToConvert );
1659  // we have no idea, so just return zero
1660  return 0;
1661  }
1662 }
1663 
1664 
// Applies one qualifier value to an RNA feature; what is accepted depends on
// the RNA type already set on the feature.
//
//   sfp   - the feature; SetData().SetRna() is called on it unconditionally
//   qtype - which qualifier is being added
//   val   - the qualifier's text value
//
// Per RNA type (as far as visible here):
//   mRNA, rRNA (and the label on the absent line 1675)
//          product -> ext name, unless the ext already holds a tRNA
//   ncRNA  product -> gen.product; ncRNA_class -> gen.class
//   tmRNA  product -> gen.product; tag_peptide -> appended to gen.quals
//   snRNA, scRNA, other (and the label on the absent line 1725)
//          nothing is accepted; returns false
//   tRNA   product, anticodon, and the qualifier on the absent line 1768
//          (see the comments at each case)
//
// Returns true when the qualifier was consumed (even if its value was
// rejected with a message), false when it does not apply.
//
// NOTE(review): listing line 1665 (return type and function name), lines
// 1675, 1725 and 1768 (case labels) and lines 1750 and 1762 (leading
// arguments of x_ProcessMsg calls) are absent from this extract.
1666  CRef<CSeq_feat> sfp,
1667  EQual qtype,
1668  const string& val
1669 )
1670 {
1671  CSeqFeatData& sfdata = sfp->SetData();
1672  CRNA_ref& rrp = sfdata.SetRna ();
1673  CRNA_ref::EType rnatyp = rrp.GetType ();
1674  switch (rnatyp) {
1676  case CRNA_ref::eType_mRNA:
1677  case CRNA_ref::eType_rRNA:
1678  switch (qtype) {
1679  case eQual_product:
1680  {
1681  CRNA_ref::TExt& tex = rrp.SetExt ();
1682  CRNA_ref::C_Ext::E_Choice exttype = tex.Which ();
1683  if (exttype == CRNA_ref::C_Ext::e_TRNA) return false;
1684  tex.SetName (val);
1685  return true;
1686  }
1687  default:
1688  break;
1689  }
1690  break;
1691  case CRNA_ref::eType_ncRNA:
1692  switch (qtype) {
1693  case eQual_product:
1694  rrp.SetExt().SetGen().SetProduct(val);
1695  return true;
1696  break;
1697  case eQual_ncRNA_class:
1698  rrp.SetExt().SetGen().SetClass(val);
1699  return true;
1700  break;
1701  default:
1702  break;
1703  }
1704  break;
1705  case CRNA_ref::eType_tmRNA:
1706  switch (qtype) {
1707  case eQual_product:
1708  rrp.SetExt().SetGen().SetProduct(val);
1709  return true;
1710  case eQual_tag_peptide:
1711  {
1712  CRef<CRNA_qual> q(new CRNA_qual());
1713  q->SetQual("tag_peptide");
1714  q->SetVal(val);
1715  rrp.SetExt().SetGen().SetQuals().Set().push_back(q);
1716  return true;
1717  }
1718  break;
1719  default:
1720  break;
1721  }
1722  break;
1723  case CRNA_ref::eType_snRNA:
1724  case CRNA_ref::eType_scRNA:
1726  case CRNA_ref::eType_other:
1727  return false;
1728  case CRNA_ref::eType_tRNA:
1729  switch (qtype) {
// product: the amino-acid token extracted by x_TrnaToAaString is looked up
// in sm_TrnaKeys. A known token sets the tRNA's amino acid; for fMet, iMet
// and Ile2 the original product text is additionally kept as a "product"
// GenBank qualifier. An unknown token is reported through x_ProcessMsg.
// In both cases true is returned. If the ext already holds a name, the
// qualifier is refused (false).
1730  case eQual_product: {
1731  if (rrp.IsSetExt() && rrp.GetExt().Which() == CRNA_ref::C_Ext::e_Name)
1732  return false;
1733 
1734  const string& aa_string = x_TrnaToAaString(val);
1735  const auto aaval_it = sm_TrnaKeys.find(aa_string.c_str());
1736 
1737  if (aaval_it != sm_TrnaKeys.end()) {
1738  CRNA_ref::TExt& tex = rrp.SetExt ();
1739  CTrna_ext& trx = tex.SetTRNA();
1740  CTrna_ext::TAa& taa = trx.SetAa();
1741  taa.SetNcbieaa(aaval_it->second);
1742  if (aa_string == "fMet" ||
1743  aa_string == "iMet" ||
1744  aa_string == "Ile2") {
1745  x_AddGBQualToFeature(sfp, "product", val);
1746  }
1747  }
1748  else {
1749  x_ProcessMsg(
1751  "tRNA", "product", val);
1752  }
1753  return true;
1754  }
1755  break;
// anticodon: a value that x_ParseTrnaExtString cannot parse is reported
// through x_ProcessMsg; true is returned either way.
1756  case eQual_anticodon:
1757  {
1758  CRNA_ref::TExt& tex = rrp.SetExt ();
1759  CRNA_ref::C_Ext::TTRNA & ext_trna = tex.SetTRNA();
1760  if( ! x_ParseTrnaExtString(ext_trna, val) ) {
1761  x_ProcessMsg(
1763  "tRNA", "anticodon", val );
1764  }
1765  return true;
1766  }
1767  break;
// NOTE(review): the case label for the block below (listing line 1768) is
// absent from this extract. The block hands val to x_AddCodons and returns
// its verdict.
1769  {
1770  //const auto codon_index = CGen_code_table::CodonToIndex(val);
1771  //if (codon_index >= 0) {
1772  CRNA_ref::TExt& tex = rrp.SetExt ();
1773  CRNA_ref::C_Ext::TTRNA & ext_trna = tex.SetTRNA();
1774  if (!x_AddCodons(val, ext_trna)) {
1775  return false;
1776  }
1777  //}
1778  return true;
1779  }
1780  break;
1781  default:
1782  break;
1783  }
1784  break;
1785  default:
1786  break;
1787  }
1788  return false;
1789 }
1790 
1791 
// Expands a three-letter codon pattern into concrete codon indices and
// appends them to `trna_ext`.
//
//   val      - the codon text; must be exactly three characters
//   trna_ext - receives the codon indices
//
// Each of the three characters is looked up with s_IUPACmap.at(), and every
// combination of the resulting characters is converted with
// CGen_code_table::CodonToIndex. The indices are gathered in a set<int>, so
// duplicates collapse and they are appended in ascending order. When at
// least one index was produced, trna_ext.SetAa().SetNcbieaa() is called
// before the indices are pushed onto trna_ext.SetCodon().
//
// Returns false when val is not three characters long or when anything in
// the try block throws (for instance .at() on a character missing from
// s_IUPACmap); true otherwise.
//
// NOTE(review): listing line 1792 (return type and function name) is absent
// from this extract; a function called x_AddCodons (val, ext_trna) is used
// elsewhere in this file.
// NOTE(review): the value returned by CodonToIndex is inserted without a
// check -- confirm it cannot be a failure value for characters coming out
// of s_IUPACmap.
1793  const string& val,
1794  CTrna_ext& trna_ext
1795  ) const
1796 {
1797  if (val.size() != 3) {
1798  return false;
1799  }
1800 
1801  set<int> codons;
1802  try {
1803  for (char char1 : s_IUPACmap.at(val[0])) {
1804  for (char char2 : s_IUPACmap.at(val[1])) {
1805  for (char char3 : s_IUPACmap.at(val[2])) {
1806  const auto codon_index = CGen_code_table::CodonToIndex(char1, char2, char3);
1807  codons.insert(codon_index);
1808  }
1809  }
1810  }
1811 
1812  if (!codons.empty()) {
1813  trna_ext.SetAa().SetNcbieaa();
1814  for (const auto codon_index : codons) {
1815  trna_ext.SetCodon().push_back(codon_index);
1816  }
1817  }
1818  return true;
1819  }
1820  catch(...) {}
1821 
1822  return false;
1823 }
1824 
1825 
// Applies one qualifier to an import-style feature, based on its subtype.
//
//   sfp    - the feature; receives GenBank qualifiers or the ext object
//   sfdata - the feature data; only its subtype is read here
//   qtype  - which qualifier is being added
//   qual   - the qualifier name as text (used when storing a GenBank qual)
//   val    - the qualifier's text value
//
// Two stages:
//   1. Regulatory subtypes: a regulatory_class qualifier is stored as a
//      GenBank qualifier. Any value other than "other" must be present in
//      the `allowed_values` list, otherwise false is returned.
//   2. For three subtypes (case labels absent, see note) a fixed set of
//      qualifier types selects a user-object label in `str`; when one is
//      selected the feature's ext type is touched and true is returned.
//
// Returns false when neither stage consumed the qualifier.
//
// NOTE(review): listing line 1826 (return type and function name), line 1848
// (the expression that `allowed_values` is bound to), lines 1873, 1894 and
// 1907 (the subtype case labels) and line 1933 (a statement inside the final
// `if (str)`) are absent from this extract.
1827  CRef<CSeq_feat> sfp,
1828  CSeqFeatData& sfdata,
1829  EQual qtype,
1830  const string& qual,
1831  const string& val
1832 )
1833 
1834 {
1835  const char* str = nullptr;
1836 
1837  CSeqFeatData::ESubtype subtype = sfdata.GetSubtype ();
1838 
1839  // used if-statement because CSeqFeatData::IsRegulatory won't work in a
1840  // switch statement.
1841  if( (subtype == CSeqFeatData::eSubtype_regulatory) ||
1842  CSeqFeatData::IsRegulatory(subtype) )
1843  {
1844  if (qtype == eQual_regulatory_class) {
1845  if (val != "other") { // RW-374 "other" is a special case
1846 
1847  const vector<string>& allowed_values =
1849  if (find(allowed_values.cbegin(), allowed_values.cend(), val)
1850  == allowed_values.cend()) {
1851  return false;
1852  }
1853 
1854 /*
1855  const CSeqFeatData::ESubtype regulatory_class_subtype =
1856  CSeqFeatData::GetRegulatoryClass(val);
1857  if( regulatory_class_subtype == CSeqFeatData::eSubtype_bad ) {
1858  // msg will be sent in caller x_AddQualifierToFeature
1859  return false;
1860  }
1861  */
1862  }
1863  // okay
1864  // (Note that at this time we don't validate
1865  // if the regulatory_class actually matches the
1866  // subtype)
1867  x_AddGBQualToFeature(sfp, qual, val);
1868  return true;
1869  }
1870  }
1871 
// Stage 2: pick the user-object label for the subtype/qualifier pair.
// `str` stays nullptr when the pair is not recognized.
1872  switch (subtype) {
1874  {
1875  switch (qtype) {
1876  case eQual_chrcnt:
1877  case eQual_ctgcnt:
1878  case eQual_loccnt:
1879  case eQual_snp_class:
1880  case eQual_snp_gtype:
1881  case eQual_snp_het:
1882  case eQual_snp_het_se:
1883  case eQual_snp_linkout:
1884  case eQual_snp_maxrate:
1885  case eQual_snp_valid:
1886  case eQual_weight:
1887  str = "dbSnpSynonymyData";
1888  break;
1889  default:
1890  break;
1891  }
1892  }
1893  break;
1895  {
1896  switch (qtype) {
1897  case eQual_sts_aliases:
1898  case eQual_sts_dsegs:
1899  case eQual_weight:
1900  str = "stsUserObject";
1901  break;
1902  default:
1903  break;
1904  }
1905  }
1906  break;
1908  {
1909  switch (qtype) {
1910  case eQual_bac_ends:
1911  case eQual_clone_id:
1912  case eQual_method:
1913  case eQual_sequence:
1914  case eQual_STS:
1915  case eQual_weight:
1916  str = "cloneUserObject";
1917  break;
1918  default:
1919  break;
1920  }
1921  }
1922  break;
1923  default:
1924  break;
1925  }
1926 
1927  if (str) {
1928  CSeq_feat::TExt& ext = sfp->SetExt ();
1929  CObject_id& obj = ext.SetType ();
1930  if ((! obj.IsStr ()) || obj.GetStr ().empty ()) {
1931  obj.SetStr ();
1932  }
1934  return true;
1935  }
1936 
1937  return false;
1938 }
1939 
1940 
// Applies an organism-level qualifier to the BioSource held in `sfdata`.
//
//   sfdata    - the feature data; SetBiosrc() is called on it unconditionally
//   feat_name - feature name, used only when reporting messages
//   rtype     - which organism-level field is being set
//   val       - the qualifier's text value
//
// Handling per rtype:
//   organism  -> org.taxname
//   organelle -> looked up in sm_GenomeKeys; a known key sets the genome,
//                an unknown key is reported through x_ProcessMsg
//   div       -> org.orgname.div
//   lineage   -> org.orgname.lineage
//   gcode     -> org.orgname.gcode  (converted by x_StringToLongNoThrow)
//   mgcode    -> org.orgname.mgcode (converted by x_StringToLongNoThrow)
//
// Returns true for every rtype listed above (including an unknown organelle
// value), false for any other rtype.
//
// NOTE(review): listing line 1941 (return type and function name) and line
// 1965 (leading arguments of the x_ProcessMsg call) are absent from this
// extract. The parameters match the
// x_AddQualifierToBioSrc (sfdata, feat_name, rtype, val) call elsewhere in
// this file.
1942  CSeqFeatData& sfdata,
1943  const string &feat_name,
1944  EOrgRef rtype,
1945  const string& val
1946 )
1947 {
1948  CBioSource& bsp = sfdata.SetBiosrc ();
1949 
1950  switch (rtype) {
1951  case eOrgRef_organism:
1952  {
1953  CBioSource::TOrg& orp = bsp.SetOrg ();
1954  orp.SetTaxname (val);
1955  return true;
1956  }
1957  case eOrgRef_organelle:
1958  {
1959  TGenomeMap::const_iterator g_iter = sm_GenomeKeys.find (val.c_str ());
1960  if (g_iter != sm_GenomeKeys.end ()) {
1961  CBioSource::EGenome gtype = g_iter->second;
1962  bsp.SetGenome (gtype);
1963  } else {
1964  x_ProcessMsg(
1966  feat_name, "organelle", val );
1967  }
1968  return true;
1969  }
1970  case eOrgRef_div:
1971  {
1972  CBioSource::TOrg& orp = bsp.SetOrg ();
1973  COrg_ref::TOrgname& onp = orp.SetOrgname ();
1974  onp.SetDiv (val);
1975  return true;
1976  }
1977  case eOrgRef_lineage:
1978  {
1979  CBioSource::TOrg& orp = bsp.SetOrg ();
1980  COrg_ref::TOrgname& onp = orp.SetOrgname ();
1981  onp.SetLineage (val);
1982  return true;
1983  }
1984  case eOrgRef_gcode:
1985  {
1986  CBioSource::TOrg& orp = bsp.SetOrg ();
1987  COrg_ref::TOrgname& onp = orp.SetOrgname ();
1988  int code = x_StringToLongNoThrow (val, feat_name, "gcode");
1989  onp.SetGcode (code);
1990  return true;
1991  }
1992  case eOrgRef_mgcode:
1993  {
1994  CBioSource::TOrg& orp = bsp.SetOrg ();
1995  COrg_ref::TOrgname& onp = orp.SetOrgname ();
1996  int code = x_StringToLongNoThrow (val, feat_name, "mgcode");
1997  onp.SetMgcode (code);
1998  return true;
1999  }
2000  default:
2001  break;
2002  }
2003  return false;
2004 }
2005 
2006 
// Appends a new CSubSource with subtype `stype` and name `val` to the
// subtype list of the BioSource held in `sfdata`. Always returns true.
//
// NOTE(review): listing line 2007 (return type and function name) is absent
// from this extract. The parameters match the
// x_AddQualifierToBioSrc (sfdata, stype, val) call elsewhere in this file.
2008  CSeqFeatData& sfdata,
2009  CSubSource::ESubtype stype,
2010  const string& val
2011 )
2012 
2013 {
2014  CBioSource& bsp = sfdata.SetBiosrc ();
2015  CBioSource::TSubtype& slist = bsp.SetSubtype ();
2016  CRef<CSubSource> ssp (new CSubSource);
2017  ssp->SetSubtype (stype);
2018  ssp->SetName (val);
2019  slist.push_back (ssp);
2020  return true;
2021 }
2022 
2023 
// Appends a new COrgMod with subtype `mtype` and subname `val` to the
// modifier list at biosrc.org.orgname of the BioSource held in `sfdata`.
// Always returns true.
//
// NOTE(review): listing line 2024 (return type and function name) is absent
// from this extract. The parameters match the
// x_AddQualifierToBioSrc (sfdata, mtype, val) call elsewhere in this file.
2025  CSeqFeatData& sfdata,
2026  COrgMod::ESubtype mtype,
2027  const string& val
2028 )
2029 
2030 {
2031  CBioSource& bsp = sfdata.SetBiosrc ();
2032  CBioSource::TOrg& orp = bsp.SetOrg ();
2033  COrg_ref::TOrgname& onp = orp.SetOrgname ();
2034  COrgName::TMod& mlist = onp.SetMod ();
2035  CRef<COrgMod> omp (new COrgMod);
2036  omp->SetSubtype (mtype);
2037  omp->SetSubname (val);
2038  mlist.push_back (omp);
2039  return true;
2040 }
2041 
2042 
// Appends a GenBank qualifier (name/value pair) to the feature's qualifier
// list.
//
//   sfp  - the feature to extend
//   qual - qualifier name; an empty name is rejected (returns false)
//   val  - qualifier value
//
// The stored name is the canonical spelling from
// CSeqFeatData::GetQualifierAsString when `qual` is a recognized qualifier
// and that spelling is non-empty; otherwise `qual` is stored as given.
// A value for which x_StringIsJustQuotes is true is stored as an empty
// string.
//
// Returns true whenever a qualifier was appended.
//
// NOTE(review): listing line 2043 (return type and function name) is absent
// from this extract; a function called x_AddGBQualToFeature (sfp, qual, val)
// is used elsewhere in this file.
// NOTE(review): the existing comments about "this pointer" and "swap" do not
// match the visible code, which uses CTempString assignment -- they look
// stale.
2044  CRef<CSeq_feat> sfp,
2045  const string& qual,
2046  const string& val
2047 )
2048 
2049 {
2050  if (qual.empty ()) return false;
2051 
2052  // need this pointer because references can't be repointed
2053  CTempString normalized_qual = qual;
2054 
2055  // normalize qual if needed, especially regarding case, and
2056  // use as-is if no normalization applies
2057  auto qual_type = CSeqFeatData::GetQualifierType(qual);
2058  if( qual_type != CSeqFeatData::eQual_bad ) {
2059  // swap is constant time
2060  CTempString potential_normalized_qual = CSeqFeatData::GetQualifierAsString(qual_type);
2061  if( ! potential_normalized_qual.empty() ) {
2062  normalized_qual = potential_normalized_qual;
2063  }
2064  }
2065 
2066  auto& qlist = sfp->SetQual ();
2067  CRef<CGb_qual> gbq (new CGb_qual);
2068  gbq->SetQual() = normalized_qual;
2069  if (x_StringIsJustQuotes (val)) {
2070  gbq->SetVal() = kEmptyStr;
2071  } else {
2072  gbq->SetVal() = val;
2073  }
2074  qlist.push_back (gbq);
2075 
2076  return true;
2077 }
2078 
2079 
// Reconciles coding-region features with gene features, using the gene xref
// carried by each CDS.
//
//   sap             - the annotation; newly created genes are appended to its
//                     feature table
//   choiceToFeatMap - features (with their line numbers) keyed by data
//                     choice; read for CDSs and genes, and extended with any
//                     gene created here
//   flags           - reader flags (their use is on lines absent from this
//                     extract, see note)
//
// Outline:
//   1. Return at once when the map holds no CDS.
//   2. Index the existing genes by locus and by locus_tag.
//   3. For each CDS that has a gene xref:
//      a. collect the genes matching the xref's locus and/or locus_tag;
//      b. under a condition on an absent line, report every matched gene
//         whose location does not fully contain the CDS location;
//      c. under a condition partly on an absent line, and only when no gene
//         matched, create a gene from the xref at the CDS location and
//         register it in the annotation, the choice map and both indexes.
//
// NOTE(review): listing line 2080 (return type and function name), lines
// 2110, 2115, 2262, 2266 and 2271 (the first line of the argument passed to
// each insert call), line 2219 (the line that opens the block closed at 2240;
// the existing comment says "if requested"), lines 2235 and 2237
// (x_ProcessMsg arguments) and line 2245 (the start of the condition for
// gene creation) are absent from this extract.
2081  CRef<CSeq_annot> sap,
2082  TChoiceToFeatMap & choiceToFeatMap,
2083  const TFlags flags)
2084 {
2085  // load cds_equal_range to hold the CDSs
2086  typedef TChoiceToFeatMap::iterator TChoiceCI;
2087  typedef pair<TChoiceCI, TChoiceCI> TChoiceEqualRange;
2088  TChoiceEqualRange cds_equal_range =
2089  choiceToFeatMap.equal_range(CSeqFeatData::e_Cdregion);
2090  if( cds_equal_range.first == cds_equal_range.second )
2091  {
2092  // nothing to do if there are no CDSs
2093  return;
2094  }
2095 
2096  // load mappings from locus or locus-tag to gene
2097  typedef multimap<string, SFeatAndLineNum> TStringToGeneAndLineMap;
2098  TStringToGeneAndLineMap locusToGeneAndLineMap;
2099  TStringToGeneAndLineMap locusTagToGeneAndLineMap;
2100  const TChoiceEqualRange gene_equal_range =
2101  choiceToFeatMap.equal_range(CSeqFeatData::e_Gene);
2102  for( TChoiceCI gene_choice_ci = gene_equal_range.first;
2103  gene_choice_ci != gene_equal_range.second;
2104  ++gene_choice_ci )
2105  {
2106  SFeatAndLineNum gene_feat_ref_and_line = gene_choice_ci->second;
2107  const CGene_ref & gene_ref = gene_feat_ref_and_line.m_pFeat->GetData().GetGene();
2108  if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(gene_ref, Locus) ) {
2109  locusToGeneAndLineMap.insert(
2111  gene_ref.GetLocus(), gene_feat_ref_and_line));
2112  }
2113  if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(gene_ref, Locus_tag) ) {
2114  locusTagToGeneAndLineMap.insert(
2116  gene_ref.GetLocus_tag(), gene_feat_ref_and_line));
2117  }
2118  }
2119 
2120  // for each CDS, check for gene conflicts or create genes,
2121  // depending on various flags
2122  for( TChoiceCI cds_choice_ci = cds_equal_range.first;
2123  cds_choice_ci != cds_equal_range.second ; ++cds_choice_ci)
2124  {
2125  TFeatConstRef cds_feat_ref = cds_choice_ci->second.m_pFeat;
2126  const TSeqPos cds_line_num = cds_choice_ci->second.m_uLineNum;
2127 
2128  const CSeq_loc & cds_loc = cds_feat_ref->GetLocation();
2129 
2130  const CGene_ref * pGeneXrefOnCDS = cds_feat_ref->GetGeneXref();
2131  if( ! pGeneXrefOnCDS ) {
2132  // no xref, so can't do anything for this CDS
2133  // (this is NOT an error)
2134  continue;
2135  }
2136 
2137  // get all the already-existing genes that
2138  // this CDS xrefs. It should be somewhat uncommon for there
2139  // to be more than one matching gene.
2140  set<SFeatAndLineNum> matchingGenes;
2141 
2142  const string locus =
2143  pGeneXrefOnCDS->IsSetLocus() ?
2144  pGeneXrefOnCDS->GetLocus() :
2145  "";
2146 
2147  const string locus_tag =
2148  pGeneXrefOnCDS->IsSetLocus_tag() ?
2149  pGeneXrefOnCDS->GetLocus_tag() :
2150  "";
2151 
2152 
2153  {{
2154  // all the code in this scope is all just for setting up matchingGenes
2155 
2156  typedef TStringToGeneAndLineMap::iterator TStrToGeneCI;
2157  typedef pair<TStrToGeneCI, TStrToGeneCI> TStrToGeneEqualRange;
2158  set<SFeatAndLineNum> locusGeneMatches;
2159  // add the locus matches (if any) to genesAlreadyCreated
2160  if( !NStr::IsBlank(locus) ) {
2161  TStrToGeneEqualRange locus_equal_range =
2162  locusToGeneAndLineMap.equal_range(locus);
2163  for( TStrToGeneCI locus_gene_ci = locus_equal_range.first;
2164  locus_gene_ci != locus_equal_range.second;
2165  ++locus_gene_ci )
2166  {
// a gene with the right locus is skipped when the xref also names a
// locus_tag and the gene carries a different one
2167  if (!NStr::IsBlank(locus_tag)) {
2168  auto gene_feat = locus_gene_ci->second.m_pFeat;
2169  if (gene_feat->GetData().GetGene().IsSetLocus_tag() &&
2170  gene_feat->GetData().GetGene().GetLocus_tag() != locus_tag) {
2171  continue;
2172  }
2173  }
2174  locusGeneMatches.insert(locus_gene_ci->second);
2175  }
2176  }
2177  // remove any that don't also match the locus-tag (if any)
2178  set<SFeatAndLineNum> locusTagGeneMatches;
2179  if( !NStr::IsBlank(locus_tag) ) {
2180  TStrToGeneEqualRange locus_tag_equal_range =
2181  locusTagToGeneAndLineMap.equal_range(locus_tag);
2182  for( TStrToGeneCI locus_tag_gene_ci = locus_tag_equal_range.first;
2183  locus_tag_gene_ci != locus_tag_equal_range.second;
2184  ++locus_tag_gene_ci )
2185  {
// mirror of the check above: skip a gene with the right locus_tag when
// the xref also names a locus and the gene carries a different one
2186  if (!NStr::IsBlank(locus)) {
2187  auto gene_feat = locus_tag_gene_ci->second.m_pFeat;
2188  if (gene_feat->GetData().GetGene().IsSetLocus() &&
2189  gene_feat->GetData().GetGene().GetLocus() != locus) {
2190  continue;
2191  }
2192  }
2193  locusTagGeneMatches.insert(locus_tag_gene_ci->second);
2194  }
2195  }
2196  // analyze locusGeneMatches and locusTagGeneMatches to find matchingGenes.
2197  if( locusGeneMatches.empty() ) {
2198  // swap is faster than assignment
2199  matchingGenes.swap(locusTagGeneMatches);
2200  } else if( locusTagGeneMatches.empty() ) {
2201  // swap is faster than assignment
2202  matchingGenes.swap(locusGeneMatches);
2203  } else {
2204  // get only the genes that match both (that is, the intersection)
2205  set_intersection(
2206  locusGeneMatches.begin(), locusGeneMatches.end(),
2207  locusTagGeneMatches.begin(), locusTagGeneMatches.end(),
2208  inserter(matchingGenes, matchingGenes.begin()));
2209  }
2210  }}
2211 
2212  // if requested, check that the genes really do contain the CDS
2213  // (also check if we're trying to create a gene that already exists)
2214 
2215  ITERATE(set<SFeatAndLineNum>, gene_feat_and_line_ci, matchingGenes) {
2216  const CSeq_loc & gene_loc = gene_feat_and_line_ci->m_pFeat->GetLocation();
2217  const TSeqPos gene_line_num = gene_feat_and_line_ci->m_uLineNum;
2218 
2220 
2221  // CDS's loc minus gene's loc should be an empty location
2222  // because the CDS should be entirely on the gene
2223  CRef<CSeq_loc> pCdsMinusGeneLoc = cds_loc.Subtract(
2224  gene_loc, CSeq_loc::fSortAndMerge_All, nullptr, nullptr);
2225  if( pCdsMinusGeneLoc &&
2226  ! pCdsMinusGeneLoc->IsNull() &&
2227  ! pCdsMinusGeneLoc->IsEmpty() )
2228  {
// the gene's line is attached to the message only when it is non-zero;
// genes created by this function are registered with line number 0
2229  ILineError::TVecOfLines gene_lines;
2230  if( gene_line_num > 0 ) {
2231  gene_lines.push_back(gene_line_num);
2232  }
2233  x_ProcessMsg(
2234  cds_line_num,
2236  kCdsFeatName,
2238  gene_lines );
2239  }
2240  }
2241  }
2242 
2243  // if requested, create genes for the CDS if there isn't already one
2244  // (it is NOT an error if the gene is already created)
2246  matchingGenes.empty() )
2247  {
2248  // create the gene
2249  CRef<CSeq_feat> pNewGene( new CSeq_feat );
2250  pNewGene->SetData().SetGene().Assign( *pGeneXrefOnCDS );
2251  if( FIELD_EQUALS(*cds_feat_ref, Partial, true) ) pNewGene->SetPartial(true);
2252  pNewGene->SetLocation().Assign( cds_feat_ref->GetLocation() );
2253 
2254  // add gene the annot
2255  _ASSERT( sap->IsFtable() );
2256  TFtable & the_ftable = sap->SetData().SetFtable();
2257  the_ftable.push_back(pNewGene);
2258 
2259  // add it to our local information for later CDSs
2260  SFeatAndLineNum gene_feat_and_line(pNewGene, 0);
2261  choiceToFeatMap.insert(
2263  pNewGene->GetData().Which(), gene_feat_and_line ) );
2264  if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(*pGeneXrefOnCDS, Locus) ) {
2265  locusToGeneAndLineMap.insert(
2267  pGeneXrefOnCDS->GetLocus(), gene_feat_and_line));
2268  }
2269  if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(*pGeneXrefOnCDS, Locus_tag) ) {
2270  locusTagToGeneAndLineMap.insert(
2272  pGeneXrefOnCDS->GetLocus_tag(), gene_feat_and_line));
2273  }
2274  }
2275  } // end of iteration through the CDS's
2276 }
2277 
2278 static const string s_QualsWithCaps[] = {
2279  "EC_number",
2280  "PCR_conditions",
2281  "PubMed",
2282  "STS",
2283  "ncRNA_class"
2284 };
2285 
2286 static const int s_NumQualsWithCaps = sizeof (s_QualsWithCaps) / sizeof (string);
2287 
2288 static string s_FixQualCapitalization (const string& qual)
2289 {
2290  string lqual = qual;
2291  lqual = NStr::ToLower(lqual);
2292  for (int j = 0; j < s_NumQualsWithCaps; j++) {
2293  if (NStr::EqualNocase(lqual, s_QualsWithCaps[j])) {
2294  lqual = s_QualsWithCaps[j];
2295  break;
2296  }
2297  }
2298  return lqual;
2299 }
2300 
2301 
// Appends `note` to the feature's comment.
//
//   sfp  - the feature; a null CRef makes the function return false
//   note - text to add; a blank note is a no-op that still returns true
//
// When sfp->CanGetComment() is true the new comment is
// "<old comment>; <note>"; otherwise the comment becomes `note` alone.
//
// Returns false only for a null `sfp`.
//
// NOTE(review): listing line 2302 (return type and function name) is absent
// from this extract; a function called x_AddNoteToFeature (sfp, val) is used
// elsewhere in this file.
// NOTE(review): if CanGetComment() can be true while the comment is empty,
// the result would start with "; " -- confirm.
2303  CRef<CSeq_feat> sfp,
2304  const string& note)
2305 {
2306  if (sfp.IsNull()) {
2307  return false;
2308  }
2309 
2310  if (NStr::IsBlank(note)) { // Nothing to do
2311  return true;
2312  }
2313 
2314  string comment = (sfp->CanGetComment()) ?
2315  sfp->GetComment() + "; " + note :
2316  note;
2317  sfp->SetComment(comment);
2318  return true;
2319 }
2320 
2321 
// Stores the value of a qualifier that is not valid for this feature as a
// note, and reports the conversion.
//
//   sfp       - the feature receiving the note
//   feat_name - feature name, used only in the reported message
//   qual      - name of the qualifier being converted
//   val       - the qualifier's value, appended to the feature comment
//
// `val` is first added through the two-argument x_AddNoteToFeature; if that
// fails, false is returned and nothing is reported. Otherwise, unless `qual`
// is literally "note", a message saying the qualifier was converted is sent
// through x_ProcessMsg. Returns true in that case.
//
// NOTE(review): listing line 2322 (return type and function name) and line
// 2336 (leading arguments of the x_ProcessMsg call) are absent from this
// extract; a function called x_AddNoteToFeature (sfp, feat_name, qual, val)
// is used elsewhere in this file.
2323  CRef<CSeq_feat> sfp,
2324  const string& feat_name,
2325  const string& qual,
2326  const string& val) {
2327 
2328  if (!x_AddNoteToFeature(sfp, val)) {
2329  return false;
2330  }
2331  // Else convert qualifier to note and issue warning
2332  if (qual != "note") {
2333  string error_message =
2334  qual + " is not a valid qualifier for this feature. Converting to note.";
2335  x_ProcessMsg(
2337  feat_name, qual, kEmptyStr, error_message);
2338  }
2339  return true;
2340 }
2341 
2343  CRef<CSeq_feat> sfp,
2344  const string &feat_name,
2345  const string& qual,
2346  const string& val,
2347  const TFlags flags
2348 )
2349 
2350 {
2351  CSeqFeatData& sfdata = sfp->SetData ();
2352  CSeqFeatData::E_Choice featType = sfdata.Which ();
2353 
2354  const CSeqFeatData::EQualifier qual_type =
2357  if( CSeqFeatData::IsDiscouragedQual(qual_type) ) {
2358  x_ProcessMsg(
2360  eDiag_Warning, feat_name, qual);
2361  }
2362  }
2363 
2364  if (featType == CSeqFeatData::e_Biosrc) {
2365 
2366  TOrgRefMap::const_iterator o_iter = sm_OrgRefKeys.find (qual.c_str ());
2367  if (o_iter != sm_OrgRefKeys.end ()) {
2368  EOrgRef rtype = o_iter->second;
2369  if (x_AddQualifierToBioSrc (sfdata, feat_name, rtype, val)) return true;
2370  } else {
2371 
2372  TSubSrcMap::const_iterator s_iter = sm_SubSrcKeys.find (qual.c_str ());
2373  if (s_iter != sm_SubSrcKeys.end ()) {
2374 
2375  CSubSource::ESubtype stype = s_iter->second;
2376  if (x_AddQualifierToBioSrc (sfdata, stype, val)) return true;
2377 
2378  } else {
2379 
2380  TOrgModMap::const_iterator m_iter = sm_OrgModKeys.find (qual.c_str ());
2381  if (m_iter != sm_OrgModKeys.end ()) {
2382 
2383  COrgMod::ESubtype mtype = m_iter->second;
2384  if (x_AddQualifierToBioSrc (sfdata, mtype, val)) return true;
2385  }
2386  }
2387  }
2388  return false;
2389  }
2390 
2391 
2392  // else type != CSeqFeatData::e_Biosrc
2393  string lqual = s_FixQualCapitalization(qual);
2394  TQualMap::const_iterator q_iter = sm_QualKeys.find (lqual.c_str ());
2395  if (q_iter != sm_QualKeys.end ()) {
2396  EQual qtype = q_iter->second;
2397  switch (featType) {
2398  case CSeqFeatData::e_Gene:
2399  if (x_AddQualifierToGene (sfdata, qtype, val)) return true;
2400  break;
2402  if (x_AddQualifierToCdregion (sfp, sfdata, qtype, val)) return true;
2403  break;
2404  case CSeqFeatData::e_Rna:
2405  if (x_AddQualifierToRna (sfp, qtype, val)) return true;
2406  break;
2407  case CSeqFeatData::e_Imp:
2408  if (x_AddQualifierToImp (sfp, sfdata, qtype, qual, val)) return true;
2409  break;
2411  if (qtype == eQual_region_name) {
2412  sfdata.SetRegion (val);
2413  return true;
2414  }
2415  break;
2416  case CSeqFeatData::e_Bond:
2417  if (qtype == eQual_bond_type) {
2419  if (CSeqFeatData::GetBondList()->IsBondName(val.c_str(), btyp)) {
2420  sfdata.SetBond (btyp);
2421  return true;
2422  }
2423  }
2424  break;
2425  case CSeqFeatData::e_Site:
2426  if (qtype == eQual_site_type) {
2428  if (CSeqFeatData::GetSiteList()->IsSiteName( val.c_str(), styp)) {
2429  sfdata.SetSite (styp);
2430  return true;
2431  }
2432  }
2433  break;
2434  case CSeqFeatData::e_Pub:
2435  if( qtype == eQual_PubMed ) {
2436  CRef<CPub> new_pub( new CPub );
2437  new_pub->SetPmid( CPubMedId( ENTREZ_ID_FROM(long, x_StringToLongNoThrow(val, feat_name, qual)) ) );
2438  sfdata.SetPub().SetPub().Set().push_back( new_pub );
2439  return true;
2440  }
2441  break;
2442  case CSeqFeatData::e_Prot:
2443  switch( qtype ) {
2444  case eQual_product:
2445  sfdata.SetProt().SetName().push_back( val );
2446  return true;
2447  case eQual_function:
2448  sfdata.SetProt().SetActivity().push_back( val );
2449  return true;
2450  case eQual_EC_number:
2451  sfdata.SetProt().SetEc().push_back( val );
2452  return true;
2453  default:
2454  break;
2455  }
2456  break;
2457  default:
2458  break;
2459  }
2460 
2461  switch (qtype) {
2462  case eQual_pseudo:
2463  sfp->SetPseudo (true);
2464  return true;
2465  case eQual_partial:
2466  sfp->SetPartial (true);
2467  return true;
2468  case eQual_exception:
2469  sfp->SetExcept (true);
2470  sfp->SetExcept_text (val);
2471  return true;
2473  sfp->SetExcept (true);
2474  sfp->SetExcept_text (qual);
2475  return true;
2476  case eQual_trans_splicing:
2477  sfp->SetExcept (true);
2478  sfp->SetExcept_text (qual);
2479  return true;
2480  case eQual_evidence:
2481  if (val == "experimental") {
2483  } else if (val == "not_experimental" || val == "non_experimental" ||
2484  val == "not-experimental" || val == "non-experimental") {
2486  }
2487  return true;
2488  case eQual_note:
2489  return x_AddNoteToFeature(sfp, val);
2490  case eQual_inference:
2491  {
2492  string prefix, remainder;
2494  if (!NStr::IsBlank(prefix)) {
2495  x_AddGBQualToFeature(sfp, qual, val);
2496  }
2497  else {
2498  x_ProcessMsg(
2500  feat_name, qual, val);
2501  }
2502  return true;
2503  }
2504  case eQual_replace:
2505  {
2506  string val_copy = val;
2507  NStr::ToLower( val_copy );
2508  x_AddGBQualToFeature (sfp, qual, val_copy );
2509  return true;
2510  }
2511  case eQual_allele:
2512  case eQual_bound_moiety:
2513  case eQual_clone:
2514  case eQual_compare:
2515  case eQual_cons_splice:
2516  case eQual_direction:
2517  case eQual_EC_number:
2519  case eQual_experiment:
2520  case eQual_frequency:
2521  case eQual_function:
2522  case eQual_gap_type:
2523  case eQual_insertion_seq:
2524  case eQual_label:
2526  case eQual_map:
2527  case eQual_ncRNA_class:
2528  case eQual_number:
2529  case eQual_old_locus_tag:
2530  case eQual_operon:
2531  case eQual_organism:
2532  case eQual_PCR_conditions:
2533  case eQual_phenotype:
2534  case eQual_product:
2535  case eQual_pseudogene:
2536  case eQual_satellite:
2537  case eQual_rpt_family:
2538  case eQual_rpt_type:
2539  case eQual_rpt_unit:
2540  case eQual_rpt_unit_range:
2541  case eQual_rpt_unit_seq:
2542  case eQual_standard_name:
2543  case eQual_tag_peptide:
2544  case eQual_transposon:
2545  case eQual_usedin:
2546  case eQual_cyt_map:
2547  case eQual_gen_map:
2548  case eQual_rad_map:
2550  {
2551  x_AddGBQualToFeature (sfp, qual, val);
2552  return true;
2553  }
2554  case eQual_gene:
2555  {
2556  if (CSeqFeatData::CanHaveGene(sfdata.GetSubtype())) {
2557  CGene_ref& grp = sfp->SetGeneXref ();
2558  if (val != "-") {
2559  grp.SetLocus (val);
2560  }
2561  return true;
2562  }
2563  // else:
2564  return x_AddNoteToFeature(sfp, feat_name, qual, val);
2565  }
2566  case eQual_gene_desc:
2567  {
2568  if (CSeqFeatData::CanHaveGene(sfdata.GetSubtype())) {
2569  CGene_ref& grp = sfp->SetGeneXref ();
2570  grp.SetDesc (val);
2571  return true;
2572  }
2573  // else:
2574  return x_AddNoteToFeature(sfp, feat_name, qual, val);
2575  }
2576  case eQual_gene_syn:
2577  {
2578  if (CSeqFeatData::CanHaveGene(sfdata.GetSubtype())) {
2579  CGene_ref& grp = sfp->SetGeneXref ();
2580  CGene_ref::TSyn& syn = grp.SetSyn ();
2581  syn.push_back (val);
2582  return true;
2583  }
2584  // else:
2585  return x_AddNoteToFeature(sfp, feat_name, qual, val);
2586  }
2587  case eQual_locus_tag:
2588  {
2589  if (CSeqFeatData::CanHaveGene(sfdata.GetSubtype())) {
2590  CGene_ref& grp = sfp->SetGeneXref ();
2591  grp.SetLocus_tag (val);
2592  return true;
2593  }
2594  // else:
2595  return x_AddNoteToFeature(sfp, feat_name, qual, val);
2596  }
2597  case eQual_db_xref:
2598  {
2599  CTempString db, tag;
2600  if (NStr::SplitInTwo (val, ":", db, tag)) {
2601  CSeq_feat::TDbxref& dblist = sfp->SetDbxref ();
2602  CRef<CDbtag> dbt (new CDbtag);
2603  dbt->SetDb (db);
2604  CRef<CObject_id> oid (new CObject_id);
2605  static const char* digits = "0123456789";
2606  if (tag.find_first_not_of(digits) == string::npos && !NStr::IsBlank(tag))
2607  oid->SetId(NStr::StringToLong(tag));
2608  else
2609  oid->SetStr(tag);
2610  dbt->SetTag (*oid);
2611  dblist.push_back (dbt);
2612  return true;
2613  }
2614  return true;
2615  }
2616  case eQual_nomenclature:
2617  {
2618  /* !!! need to implement !!! */
2619  return true;
2620  }
2621  case eQual_go_component:
2622  case eQual_go_function:
2623  case eQual_go_process:
2624  if (featType == CSeqFeatData::e_Gene ||
2625  featType == CSeqFeatData::e_Cdregion ||
2626  featType == CSeqFeatData::e_Rna) {
2627  try {
2628  CReadUtil::AddGeneOntologyTerm(*sfp, qual, val);
2629  }
2630  catch( ILineError& err) {
2631  x_ProcessMsg(
2632  err.Problem(),
2633  err.Severity(),
2634  feat_name, qual, val,
2635  err.ErrorMessage());
2636  }
2637  //rw-621: throw out the faulty qualifier but retain the rest of the feature.
2638  return true;
2639  }
2640  return false;
2641  case eQual_transcript_id:
2642  {
2643  if (featType == CSeqFeatData::e_Rna &&
2644  sfdata.GetRna().GetType() == CRNA_ref::eType_mRNA) {
2645  CBioseq::TId ids;
2646  try {
2647  CSeq_id::ParseIDs(ids, val,
2650  }
2651  catch (CSeqIdException&)
2652  {
2653  x_ProcessMsg(
2655  feat_name, qual, val,
2656  "Invalid transcript_id : " + val);
2657  return true;
2658  }
2659 
2660  for (const auto& id : ids) {
2661  auto id_string = id->GetSeqIdString(true);
2662  auto res = m_ProcessedTranscriptIds.insert(id_string);
2663  if (res.second == false) { // Insertion failed because Seq-id already encountered
2664  x_ProcessMsg(
2666  feat_name, qual, val,
2667  "Transcript ID " + id_string + " appears on multiple mRNA features"
2668  );
2669  }
2670  }
2671  }
2672  x_AddGBQualToFeature(sfp, qual, val);
2673  return true;
2674  }
2675  case eQual_protein_id:
2676  // see SQD-1535 and SQD-3496
2677  if (featType == CSeqFeatData::e_Cdregion ||
2678  (featType == CSeqFeatData::e_Rna &&
2679  sfdata.GetRna().GetType() == CRNA_ref::eType_mRNA) ||
2680  (featType == CSeqFeatData::e_Prot &&
2681  sfdata.GetProt().IsSetProcessed() &&
2683  {
2684  CBioseq::TId ids;
2685  try {
2686  CSeq_id::ParseIDs(ids, val,
2689  }
2690  catch (CSeqIdException&)
2691  {
2692  x_ProcessMsg(
2694  feat_name, qual, val,
2695  "Invalid protein_id : " + val);
2696  return true;
2697  }
2698 
2699  if (featType == CSeqFeatData::e_Cdregion) {
2700  for (const auto& id : ids) {
2701  auto id_string = id->GetSeqIdString(true);
2702  auto res = m_ProcessedProteinIds.insert(id_string);
2703  if (res.second == false) { // Insertion failed because Seq-id already encountered
2704  x_ProcessMsg(
2706  feat_name, qual, val,
2707  "Protein ID " + id_string + " appears on multiple CDS features"
2708  );
2709  }
2710  }
2711  }
2712 
2713  if (featType != CSeqFeatData::e_Rna) { // mRNA only has a protein_id qualifier
2714  auto pBestId = GetBestId(ids);
2715  if (pBestId) {
2716  sfp->SetProduct().SetWhole(*pBestId);
2717  }
2718  }
2719  }
2720 
2721  if (featType != CSeqFeatData::e_Prot) { // Mat-peptide has an instantiated product, but no qualifier
2722  x_AddGBQualToFeature(sfp, qual, val);
2723  }
2724  return true;
2726  // This should've been handled up in x_AddQualifierToImp
2727  // so it's always a bad value to be here
2728  x_ProcessMsg(
2730  feat_name, qual, val );
2731  return true;
2732  default:
2733  break;
2734  }
2735  }
2736  return false;
2737 }
2738 
2740  {
// Returns true when `line` starts with one of the fixed banner prefixes
// listed in the pattern below, false otherwise.
// NOTE(review): the declaration line for this body is missing from this
// listing; presumably this is x_IsWebComment — confirm against the repository.
2741  // This function is testing for a match against the following regular
2742  // expression, but we avoid actual regexps for max speed:
2743  // "^(===================================================================| INFO:| WARNING:| ERROR:).*"
2744 
2745  // (that magic number is the size of the smallest possible match)
// The shortest alternative is " INFO:" (6 characters); this check also makes
// the line[0] / line[1] accesses below safe.
2746  if( line.length() < 6 ) {
2747  return false;
2748  }
2749 
// Dispatch on the first character(s) so that at most one prefix comparison
// is performed per line.
2750  if( line[0] == '=' ) {
2751  static const CTempString kAllEqualsMatch =
2752  "===================================================================";
2753  if( NStr::StartsWith(line, kAllEqualsMatch) ) {
2754  return true;
2755  }
2756  } else if( line[0] == ' ') {
// All remaining alternatives start with a blank; the second character
// selects which keyword to compare against.
2757  switch(line[1]) {
2758  case 'I':
2759  {
2760  static const CTempString kInfo = " INFO:";
2761  if( NStr::StartsWith(line, kInfo) ) {
2762  return true;
2763  }
2764  }
2765  break;
2766  case 'W':
2767  {
2768  static const CTempString kWarning = " WARNING:";
2769  if( NStr::StartsWith(line, kWarning) ) {
2770  return true;
2771  }
2772  }
2773  break;
2774  case 'E':
2775  {
2776  static const CTempString kError = " ERROR:";
2777  if( NStr::StartsWith(line, kError) ) {
2778  return true;
2779  }
2780  }
2781  break;
2782  default:
2783  // no match
2784  break;
2785  }
2786  }
2787 
2788  // no match
2789  return false;
2790  }
2791 
2793  CTempString strFeatureName,
2794  CRef<CSeq_feat>& sfp,
2795  const SFeatLocInfo& loc_info
2796  )
2797 
2798  {
// Appends one location piece (a point or an interval built from loc_info) to
// the Seq-loc-mix of sfp and marks the feature partial when either end of the
// piece is partial.  Always returns true.
//   strFeatureName - feature key used only when reporting messages
//   sfp            - feature whose location mix is extended
//   loc_info       - start/stop positions plus point/strand/partial flags
// NOTE(review): the signature line, the line that declares `strand`, and the
// problem-code arguments of the x_ProcessMsg calls are missing from this
// listing.
2799 
2800  auto start = loc_info.start_pos;
2801  auto stop = loc_info.stop_pos;
2802 
// Remember the start exactly as given: the swap below may change `start`.
2803  const Int4 orig_start = start;
2805 
// Coordinates given high-to-low are normalised to low-to-high on the minus strand.
2806  if (start > stop) {
2807  swap(start, stop);
2808  strand = eNa_strand_minus;
2809  }
2810  if (loc_info.is_minus_strand) {
2811  strand = eNa_strand_minus;
2812  }
2813 
2814  // construct loc, which will be added to the mix
2815  CSeq_loc_mix::Tdata & mix_set = sfp->SetLocation().SetMix();
2816  CRef<CSeq_loc> loc(new CSeq_loc);
2817  if (loc_info.is_point || start == stop ) {
2818  // a point of some kind
// A point does not reveal its own strand.  If it is the first piece, flag it
// so a later interval can fix it up; otherwise copy the strand of the
// previous piece.
2819  if (mix_set.empty())
2820  m_need_check_strand = true;
2821  else
2822  x_GetPointStrand(*sfp, strand);
2823 
2824  // note usage of orig_start instead of start
2825  // because we want the first part of the point
2826  // specified in the file, not the smallest because SetRightOf
2827  // works differently for plus vs. minus strand
2828  CRef<CSeq_point> pPoint(
2829  new CSeq_point(*m_seq_id, orig_start, strand) );
2830  if( loc_info.is_point ) {
2831  // between two bases
2832  pPoint->SetRightOf (true);
2833  // warning if stop is not start plus one
2834  if( stop != (start+1) ) {
2835  x_ProcessMsg(
2837  strFeatureName );
2838  }
2839  } else {
2840  // just a point. do nothing
2841  }
2842 
2843  if (loc_info.is_5p_partial) {
2844  pPoint->SetPartialStart (true, eExtreme_Biological);
2845  }
2846  if (loc_info.is_3p_partial) {
2847  pPoint->SetPartialStop (true, eExtreme_Biological);
2848  }
2849 
2850  loc->SetPnt( *pPoint );
2851  } else {
2852  // interval
2853  CRef<CSeq_interval> pIval( new CSeq_interval(*m_seq_id, start, stop, strand) );
2854  if (loc_info.is_5p_partial) {
2855  pIval->SetPartialStart (true, eExtreme_Biological);
2856  }
2857  if (loc_info.is_3p_partial) {
2858  pIval->SetPartialStop (true, eExtreme_Biological);
2859  }
2860  loc->SetInt(*pIval);
// First interval after one or more leading points: push this interval's
// strand back onto the points already stored in the mix.
2861  if (m_need_check_strand)
2862  {
2863  x_UpdatePointStrand(*sfp, strand);
2864  m_need_check_strand = false;
2865  }
2866  }
2867 
2868  // check for internal partials
// Only the previously stored piece is inspected here, before the new piece
// is appended.
2869  if( ! mix_set.empty() ) {
2870  const CSeq_loc & last_loc = *mix_set.back();
2871  if( last_loc.IsPartialStop(eExtreme_Biological) ||
2873  {
2874  // internal partials
2876  eDiag_Warning, strFeatureName );
2877  }
2878  }
2879 
2880  mix_set.push_back(loc);
2881 
2882 
2883  if (loc_info.is_5p_partial || loc_info.is_3p_partial) {
2884  sfp->SetPartial (true);
2885  }
2886 
2887  return true;
2888  }
2889 
2890 
2891 
2893  CRef<CSeq_feat> sfp,
2894  const string& feat,
2895  const TFlags flags,
2896  ITableFilter *filter
2897  )
2898 
2899  {
// Initialises the data choice of *sfp from the feature key `feat`.
// Returns false when the key is empty, when the filter reports
// eAction_Disallowed, or when the key is unrecognised and none of the
// bad-key fallbacks below applies; returns true otherwise.
// NOTE(review): the signature line, the lookups that define `sbtyp`, `typ`
// and `rnatyp`, most `case` labels and several conditions are missing from
// this listing.
2900  if (feat.empty ()) return false;
2901 
2902  // check filter, if any
// Any action other than eAction_Okay is reported; only eAction_Disallowed
// rejects the feature.
2903  if (filter) {
2904  ITableFilter::EAction action = filter->GetFeatAction(feat);
2905  if( action != ITableFilter::eAction_Okay ) {
2906  x_ProcessMsg(
2908  eDiag_Warning, feat );
2909  if( action == ITableFilter::eAction_Disallowed ) {
2910  return false;
2911  }
2912  }
2913  }
2914 
2916  if (sbtyp != CSeqFeatData::eSubtype_bad) {
2917 
2918  // populate *sfp here...
2919 
2921  sfp->SetData ().Select (typ);
2922  CSeqFeatData& sfdata = sfp->SetData ();
2923 
2924  if (typ == CSeqFeatData::e_Rna) {
// Map the subtype onto an RNA type; several branches also set a class name
// or a name in the RNA extension.
2925  CRNA_ref& rrp = sfdata.SetRna ();
2927  switch (sbtyp) {
2929  rnatyp = CRNA_ref::eType_premsg;
2930  break;
2932  rnatyp = CRNA_ref::eType_mRNA;
2933  break;
2935  rnatyp = CRNA_ref::eType_tRNA;
2936  break;
2938  rnatyp = CRNA_ref::eType_rRNA;
2939  break;
2941  rnatyp = CRNA_ref::eType_ncRNA;
2942  rrp.SetExt().SetGen().SetClass("snRNA");
2943  break;
2945  rnatyp = CRNA_ref::eType_ncRNA;
2946  rrp.SetExt().SetGen().SetClass("scRNA");
2947  break;
2949  rnatyp = CRNA_ref::eType_ncRNA;
2950  rrp.SetExt().SetGen().SetClass("snoRNA");
2951  break;
2953  rnatyp = CRNA_ref::eType_ncRNA;
2954  rrp.SetExt().SetGen();
2955  break;
2957  rnatyp = CRNA_ref::eType_tmRNA;
2958  rrp.SetExt().SetGen();
2959  break;
2961  rrp.SetExt().SetName("misc_RNA");
2962  rnatyp = CRNA_ref::eType_other;
2963  break;
2964  default :
2965  break;
2966  }
2967  rrp.SetType (rnatyp);
2968 
2969  } else if (typ == CSeqFeatData::e_Imp) {
// Import features keep the table's key verbatim.
2970  CImp_feat_Base& imp = sfdata.SetImp ();
2971  imp.SetKey (feat);
2972 
2973  } else if (typ == CSeqFeatData::e_Bond) {
2975 
2976  } else if (typ == CSeqFeatData::e_Site) {
2978  } else if (typ == CSeqFeatData::e_Prot ) {
2979  CProt_ref &prot_ref = sfdata.SetProt();
2980  switch (sbtyp) {
2981  default:
2982  break;
2985  break;
2988  break;
2991  break;
2994  break;
2997  break;
2998  }
2999  }
3000 
3001  // check for discouraged feature name
3003  if( CSeqFeatData::IsDiscouragedSubtype(sbtyp) ) {
3004  x_ProcessMsg(
3006  eDiag_Warning, feat);
3007  }
3008  }
3009 
3010  return true;
3011  }
3012 
3013  // unrecognized feature key
3014 
3017  }
3018 
3020 
// Fallback 1: store the feature as "misc_feature" and keep the original key
// as a standard_name qualifier.
// NOTE(review): the guarding condition is missing from this listing.
3021  sfp->SetData ().Select (CSeqFeatData::e_Imp);
3022  CSeqFeatData& sfdata = sfp->SetData ();
3023  CImp_feat_Base& imp = sfdata.SetImp ();
3024  imp.SetKey ("misc_feature");
3025  x_AddQualifierToFeature (sfp, kEmptyStr, "standard_name", feat, flags);
3026 
3027  return true;
3028 
3029  } else if ((flags & CFeature_table_reader::fKeepBadKey) != 0) {
3030 
// Fallback 2: keep the unknown key verbatim as the key of an import feature.
3031  sfp->SetData ().Select (CSeqFeatData::e_Imp);
3032  CSeqFeatData& sfdata = sfp->SetData ();
3033  CImp_feat_Base& imp = sfdata.SetImp ();
3034  imp.SetKey (feat);
3035 
3036  return true;
3037  }
3038 
3039  return false;
3040  }
3041 
3043  ILineError::EProblem eProblem,
3044  EDiagSev eSeverity,
3045  const string& strFeatureName,
3046  const string& strQualifierName,
3047  const string& strQualifierValue,
3048  const string& strErrorMessage,
3049  const ILineError::TVecOfLines & vecOfOtherLines)
3050  {
// Convenience overload: forwards every argument to the overload that takes an
// explicit line number.  The line number comes from the line reader when one
// is attached, otherwise from the stored m_LineNumber.
// NOTE(review): the signature line is missing from this listing.
3051  x_ProcessMsg(m_reader ? static_cast<unsigned>(m_reader->GetLineNumber()) : m_LineNumber,
3052  eProblem,
3053  eSeverity,
3054  strFeatureName,
3055  strQualifierName,
3056  strQualifierValue,
3057  strErrorMessage,
3058  vecOfOtherLines);
3059  }
3060 
3061 
3063  int line_num,
3064  ILineError::EProblem eProblem,
3065  EDiagSev eSeverity,
3066  const string & strFeatureName,
3067  const string & strQualifierName,
3068  const string & strQualifierValue,
3069  const string& strErrorMessage,
3070  const ILineError::TVecOfLines & vecOfOtherLines )
3071  {
// Builds an error object from the arguments and hands it to the message
// listener.  Does nothing when no listener is installed.  When the listener's
// PutError() returns false, the error is thrown via pErr->Throw().
// NOTE(review): the signature line and the lines that create `pErr` are
// missing from this listing.
3072 
3073  if (!m_pMessageListener) {
3074  return;
3075  }
3076 
3079  eSeverity, line_num, strErrorMessage, eProblem, m_real_seqid, strFeatureName,
3080  strQualifierName, strQualifierValue));
// Attach every additional line number supplied by the caller.
3081  ITERATE( ILineError::TVecOfLines, line_it, vecOfOtherLines ) {
3082  pErr->AddOtherLine(*line_it);
3083  }
3084 
3085  if (!m_pMessageListener->PutError(*pErr)) {
3086  pErr->Throw();
3087  }
3088  }
3089 
3090 
3092  const CTempString& seq_id,
3093  const unsigned int line_number,
3094  ILineErrorListener* pListener)
3095  {
// Sends a progress message of the form "Seq-id <id>, line <n>" to pListener.
// A null listener makes this a no-op.
// NOTE(review): the signature line is missing from this listing.
3096  if (!pListener) {
3097  return;
3098  }
3099 
3100  string msg = "Seq-id " + seq_id + ", line " + NStr::IntToString(line_number);
3101  pListener->PutProgress(msg);
3102  }
3103 
3104 
3105 // helper for CFeatureTableReader_Imp::ReadSequinFeatureTable,
3106 // just so we don't forget a step when we reset the feature
3107 //
3108 void CFeatureTableReader_Imp::x_ResetFeat(CRef<CSeq_feat> & sfp, bool & curr_feat_intervals_done)
3109 {
3110  m_need_check_strand = false;
3111  sfp.Reset(new CSeq_feat);
3112  //sfp->ResetLocation();
3113  curr_feat_intervals_done = false;
3114 }
3115 
3117  {
// Copies the strand of the last piece of feat's location mix into `strand`.
// `strand` is left untouched when the location is not a mix, or when the last
// piece is neither an interval nor a point with an explicit strand.
// The mix must be non-empty: back() is called without an emptiness check.
// NOTE(review): the signature line is missing from this listing; presumably
// this is x_GetPointStrand(feat, strand) — confirm.
3118  if (feat.IsSetLocation() && feat.GetLocation().IsMix())
3119  {
3120  const CSeq_loc& last = *feat.GetLocation().GetMix().Get().back();
3121  if (last.IsInt() && last.GetInt().IsSetStrand())
3122  {
3123  strand = last.GetInt().GetStrand();
3124  }
3125  else
3126  if (last.IsPnt() && last.GetPnt().IsSetStrand())
3127  {
3128  strand = last.GetPnt().GetStrand();
3129  }
3130  }
3131  }
3132 
3134  {
// Assigns `strand` to every point in feat's location mix.  For a point whose
// strand actually changes, the partial-start and partial-stop flags are
// exchanged.
// NOTE(review): the signature line and the fallback value used for
// `old_strand` when no strand is set are missing from this listing;
// presumably this is x_UpdatePointStrand(feat, strand) — confirm.
3135  if (feat.IsSetLocation() && feat.GetLocation().IsMix())
3136  {
3137 
3138  for (auto pSeqLoc : feat.SetLocation().SetMix().Set()) {
3139  if (pSeqLoc->IsPnt()) {
3140  auto& seq_point = pSeqLoc->SetPnt();
3141  const auto old_strand =
3142  seq_point.IsSetStrand() ?
3143  seq_point.GetStrand() :
3145 
3146  seq_point.SetStrand(strand);
3147  if (old_strand != strand) {
// The crossed assignment is deliberate: the old stop flag becomes the new
// start flag and vice versa.  Both are read before either is written.
3148  const bool is_5p_partial = seq_point.IsPartialStop(eExtreme_Biological);
3149  const bool is_3p_partial = seq_point.IsPartialStart(eExtreme_Biological);
3150  seq_point.SetPartialStart(is_5p_partial, eExtreme_Biological);
3151  seq_point.SetPartialStop(is_3p_partial, eExtreme_Biological);
3152  }
3153  }
3154  }
3155  }
3156  }
3157 
3158 
3160  TFtable& ftable)
3161  {
// Finalises `feat` and appends it to `ftable`.
//  - A null feature, or one whose data choice is not set, is ignored.
//  - A "pub" feature without any publication is dropped with a warning.
//  - A location mix with no parts becomes a null location; a mix with
//    exactly one part is replaced by that part.
// NOTE(review): the signature line is missing from this listing.
3162  if ( !feat ||
3163  feat.Empty() ||
3164  !feat->IsSetData() ||
3165  (feat->GetData().Which() == CSeqFeatData::e_not_set) )
3166  {
3167  return;
3168  }
3169 
3170  // Check for missing publication - RW-626
3171  if (feat->GetData().GetSubtype() == CSeqFeatData::eSubtype_pub &&
3172  (!feat->SetData().SetPub().IsSetPub() ||
3173  feat->SetData().SetPub().GetPub().Get().empty())) {
// Report the current line at end of input, otherwise the line before it.
// NOTE(review): presumably the reader has already advanced past the feature
// when more input follows; also m_reader is dereferenced without a null
// check here — confirm callers always supply a reader.
3174  const int line_number = m_reader->AtEOF() ?
3175  static_cast<unsigned>(m_reader->GetLineNumber()) :
3176  static_cast<unsigned>(m_reader->GetLineNumber())-1;
3177 
3178  string msg = "Reference feature is empty. Skipping feature.";
3179 
3180  x_ProcessMsg(line_number,
3182  eDiag_Warning,
3183  "Reference",
3184  kEmptyStr,
3185  kEmptyStr,
3186  msg);
3187  return;
3188  }
3189 
3190  if (feat->IsSetLocation() && feat->GetLocation().IsMix())
3191  {
3192  if (feat->GetLocation().GetMix().Get().empty()) {
3193  // turn empty seqlocmix into a null seq-loc
3194  feat->SetLocation().SetNull();
3195  }
3196  else
3197  if (feat->GetLocation().GetMix().Get().size() == 1) {
3198  // demote 1-part seqlocmixes to seq-loc with just that part
// Hold the part in its own CRef first: SetLocation() below replaces the mix
// that currently holds it.
3199  CRef<CSeq_loc> keep_loc = *feat->SetLocation().SetMix().Set().begin();
3200  feat->SetLocation(*keep_loc);
3201  }
3202  }
3203  ftable.push_back(feat);
3204  }
3205 
3206 
3207 
// Handles one qualifier line of the table.
//   qual_name - qualifier key; a blank key makes the call a no-op
//   qual_val  - qualifier value; may be blank for keys in sc_SingleKeys
//   feat_name - key of the owning feature, used in messages
//   feat      - feature receiving the qualifier; when null, nothing is added
//   flags     - reader flags passed on to x_AddQualifierToFeature
// NOTE(review): several lines (the x_ProcessMsg call heads and the conditions
// guarding them) are missing from this listing.
3208  void CFeatureTableReader_Imp::x_ProcessQualifier(const string& qual_name,
3209  const string& qual_val,
3210  const string& feat_name,
3211  CRef<CSeq_feat> feat,
3212  TFlags flags)
3213  {
3214  if (NStr::IsBlank(qual_name)) {
3215  return;
3216  }
3217 
// No feature to attach to: a warning is issued (guard missing from the
// listing) and the qualifier is discarded.
3218  if (!feat) {
3221  eDiag_Warning, kEmptyStr, qual_name, qual_val);
3222  }
3223  return;
3224  }
3225 
// Blank value: accepted only for the keys listed in sc_SingleKeys; any other
// key yields a warning.
3226  if (NStr::IsBlank(qual_val)) {
3227  if (sc_SingleKeys.find(qual_name.c_str()) != sc_SingleKeys.end()) {
3228  x_AddQualifierToFeature(feat, feat_name, qual_name, qual_val, flags);
3229  }
3230  else {
3232  eDiag_Warning, feat_name, qual_name);
3233  }
3234  return;
3235  }
3236 
3237  // else qual_name and qual_val are not blank
// When the qualifier is not recognised, a warning is issued and, under a
// condition missing from the listing, it is stored as a generic GBQual.
3238  if (!x_AddQualifierToFeature(feat, feat_name, qual_name, qual_val, flags)) {
3241  eDiag_Warning, feat_name, qual_name, qual_val);
3242  }
3243 
3245  x_AddGBQualToFeature(feat, qual_name, qual_val);
3246  }
3247  }
3248  }
3249 
3250 
3251 
3253  const CTempString& in_seqid,
3254  const CTempString& in_annotname,
3255  const TFlags flags,
3256  ITableFilter *filter
3257  )
3258  {
// Reads feature-table lines from m_reader until end of input or until the
// next ">Feature" header line (which is pushed back for the next call), and
// returns a Seq-annot whose ftable holds the features that were built.
//   in_seqid     - sequence id string for the table, passed to x_InitId
//   in_annotname - optional annotation name, stored as an annot descriptor
//   flags        - reader flags
//   filter       - optional feature-key filter passed to x_SetupSeqFeat
// NOTE(review): the signature line and several interior lines (flag tests,
// message-call heads, a map insert head) are missing from this listing.
3259  string feat, qual, qual_value;
3260  string curr_feat_name;
3261  // Int4 start, stop;
3262  //bool partial5, partial3, ispoint, isminus,
3263 
3264  bool ignore_until_next_feature_key = false;
3265  Int4 offset = 0;
3266  SFeatLocInfo loc_info;
3267 
3268  CRef<CSeq_annot> sap(new CSeq_annot);
3269 
3270  TFtable& ftable = sap->SetData().SetFtable();
3271  const bool bIgnoreWebComments =
3273 
3274  // if sequence ID is a list, use just one sequence ID string
3275  x_InitId(in_seqid, flags);
3276 
3277  // Use this to efficiently find the best CDS for a prot feature
3278  // (only add CDS's for it to work right)
3279  CBestFeatFinder best_CDS_finder;
3280 
3281  // map feature types to features
3282  TChoiceToFeatMap choiceToFeatMap;
3283 
3284  CRef<CSeq_feat> sfp;
3285  // This is true once this feature should not
3286  // have any more intervals.
3287  // This allows us to catch errors like the following:
3288  //
3289  //
3290  //>Feature lcl|Seq1
3291  //1 1008 CDS
3292  // gene THE_GENE_NAME
3293  //50 200
3294  // product THE_GENE_PRODUCT
3295  bool curr_feat_intervals_done = false;
3296 
3297  if (! in_annotname.empty ()) {
3298  CAnnot_descr& descr = sap->SetDesc ();
3299  CRef<CAnnotdesc> annot(new CAnnotdesc);
3300  annot->SetName (in_annotname);
3301  descr.Set().push_back (annot);
3302  }
3303 
3304  while ( !m_reader->AtEOF() ) {
3305 
3306  CTempString line = *++(*m_reader);
3307 
// Emit a progress message every 10000 input lines.
3308  if( m_reader->GetLineNumber() % 10000 == 0 &&
3309  m_reader->GetLineNumber() > 0 )
3310  {
3311  PutProgress(m_real_seqid, static_cast<unsigned>(m_reader->GetLineNumber()), m_pMessageListener);
3312  }
3313 
3314  // skip empty lines.
3315  // if requested, also skip webcomment lines
3316  if( line.empty () || (bIgnoreWebComments && x_IsWebComment(line) ) ) {
3317  continue;
3318  }
3319 
3320  // if next line is a new feature table, return current sap
3321  CTempStringEx dummy1, dummy2;
3322  if( ParseInitialFeatureLine(line, dummy1, dummy2) ) {
3323  m_reader->UngetLine(); // we'll get this feature line the next time around
3324  break;
3325  }
3326 
3327  if (line [0] == '[') {
3328 
3329  // try to parse it as an offset
3330  if( x_TryToParseOffset(line, offset) ) {
3331  // okay, known command
3332  } else {
3333  // warn for unknown square-bracket commands
3334  x_ProcessMsg(
3336  eDiag_Warning);
3337  }
3338 
3339  } else if ( s_LineIndicatesOrder(line) ) {
3340 
3341  // put nulls between feature intervals
// NOTE(review): sfp is still null if an "order" line comes before the first
// feature line; confirm that dereferencing it here is handled (CRef is
// believed to throw on a null dereference rather than crash).
3342  CRef<CSeq_loc> loc_with_nulls = s_LocationJoinToOrder( sfp->GetLocation() );
3343  // loc_with_nulls is unset if no change was needed
3344  if( loc_with_nulls ) {
3345  sfp->SetLocation( *loc_with_nulls );
3346  }
3347 
3348  } else if (x_ParseFeatureTableLine (line, loc_info, feat, qual, qual_value, offset)) {
3349  // process line in feature table
3350 
// Double quotes inside qualifier values are converted to single quotes.
3351  replace( qual_value.begin(), qual_value.end(), '\"', '\'' );
3352 
3353  if ((! feat.empty ()) && loc_info.start_pos >= 0 && loc_info.stop_pos >= 0) {
3354 
3355  // process start - stop - feature line
3356 
// A new feature key closes the previous feature before a new one is started.
3357  x_FinishFeature(sfp, ftable);
3358  x_ResetFeat( sfp, curr_feat_intervals_done );
3359 
3360  if (x_SetupSeqFeat (sfp, feat, flags, filter)) {
3361 
3362  // figure out type of feat, and store in map for later use
3364  if( sfp->CanGetData() ) {
3365  eChoice = sfp->GetData().Which();
3366  }
3367  choiceToFeatMap.insert(
3369  eChoice,
3370  SFeatAndLineNum(sfp, static_cast<unsigned>(m_reader->GetLineNumber()))));
3371 
3372  // if new feature is a CDS, remember it for later lookups
3373  if( eChoice == CSeqFeatData::e_Cdregion ) {
3374  best_CDS_finder.AddFeat( *sfp );
3375  }
3376 
3377  // and add first interval
// NOTE(review): curr_feat_name still holds the key of the PREVIOUS feature at
// this point (it is assigned `feat` only a few lines below), so any message
// raised for the first interval is labelled with the wrong feature name —
// confirm whether this ordering is intended.
3378  x_AddIntervalToFeature (curr_feat_name, sfp, loc_info);
3379 
3380  ignore_until_next_feature_key = false;
3381 
3382  curr_feat_name = feat;
3383 
3384  } else {
3385 
3386  // bad feature, set ignore flag
3387 
3388  ignore_until_next_feature_key = true;
3389  }
3390 
3391  } else if (ignore_until_next_feature_key) {
3392 
3393  // bad feature was found before, so ignore
3394  // qualifiers until next feature key
3395 
3396  }
3397  else
// A line with positions only (no key, no qualifier) is a continuation
// interval of the current feature.
3398  if (loc_info.start_pos >= 0 &&
3399  loc_info.stop_pos >= 0 &&
3400  feat.empty () &&
3401  qual.empty () &&
3402  qual_value.empty ()) {
3403 
3404  if( curr_feat_intervals_done ) {
3405  // the feat intervals were done, so it's an error for there to be more intervals
3407  // this feature is in bad shape, so we ignore the rest of it
3408  ignore_until_next_feature_key = true;
3409  x_ResetFeat(sfp, curr_feat_intervals_done);
3410  } else if (sfp && sfp->IsSetLocation() && sfp->GetLocation().IsMix()) {
3411  // process start - stop multiple interval line
3412  x_AddIntervalToFeature (curr_feat_name, sfp, loc_info);
3413  // start, stop, partial5, partial3, ispoint, isminus);
3414  } else {
3417  eDiag_Warning);
3418  }
3419  }
3420 
3421  } else if (!NStr::IsBlank(qual)) {
// The first qualifier line ends the interval list of the current feature.
3422  curr_feat_intervals_done = true;
3423  x_ProcessQualifier(qual, qual_value, curr_feat_name, sfp, flags);
3424  }
3425  else if (!feat.empty()) {
3426 
3427  // unrecognized location
3428 
3429  // there should no more ranges for this feature
3430  // (although there still can be ranges for quals, of course).
3431  curr_feat_intervals_done = true;
3432 
3434  x_ProcessMsg(
3436  feat );
3437  }
3438  }
3439  }
3440  }
3441 
3442  // make sure last feature is finished
3443  x_FinishFeature(sfp, ftable);
3444  x_ResetFeat( sfp, curr_feat_intervals_done );
3445 
// Optional post-processing step; its guarding condition is missing from the
// listing.
3448  {
3449  x_CreateGenesFromCDSs(sap, choiceToFeatMap, flags);
3450  }
3451  return sap;
3452  }
3453 
3454 
3456  const string& feat,
3457  CSeq_loc& location,
3458  const TFlags flags,
3459  const string &seq_id,
3460  ITableFilter *filter
3461  )
3462 
3463  {
// Creates a new CSeq_feat for the key `feat` and gives it `location`.
// When x_SetupSeqFeat rejects the key, the feature is still returned but with
// its data choice reset to e_not_set.
// The `seq_id` parameter is not used in the visible body.
// NOTE(review): the signature line is missing from this listing.
3464  CRef<CSeq_feat> sfp (new CSeq_feat);
3465 
3466  sfp->ResetLocation ();
3467 
3468  if ( ! x_SetupSeqFeat (sfp, feat, flags, filter) ) {
3469 
3470  // bad feature, make dummy
3471  sfp->SetData ().Select (CSeqFeatData::e_not_set);
3472  }
3473  sfp->SetLocation (location);
3474 
3475  return sfp;
3476  }
3477 
3479  {
// Parses `seq_id` into a list of ids and picks one as m_seq_id: a GenBank id
// when the (missing) condition below allows it and one is present, otherwise
// the first parsed id.  A blank `seq_id` leaves all members untouched.
// NOTE(review): the signature line, the ParseIDs flags argument, the
// condition guarding the GenBank preference and the line that refills
// m_real_seqid are missing from this listing.
3480  if (!NStr::IsBlank(seq_id)) {
3481  CBioseq::TId ids;
3482  CSeq_id::ParseIDs(ids, seq_id,
3484 
3485  m_seq_id.Reset();
3487  {
// No early exit: when several GenBank ids are present, the last one wins.
3488  for (auto id : ids)
3489  {
3490  if (id->IsGenbank())
3491  m_seq_id = id;
3492  }
3493  };
3494 
// NOTE(review): front() assumes ParseIDs produced at least one id; confirm
// that it cannot return an empty list for a non-blank string.
3495  if (m_seq_id.Empty())
3496  m_seq_id = ids.front();
3497 
3498  m_real_seqid.clear();
3500  }
3501  }
3502 
3504  CRef<CSeq_feat> sfp,
3505  const string& feat_name,
3506  const string& qual,
3507  const string& val,
3508  const TFlags flags,
3509  const string &seq_id1 )
3510 
3511  {
// Adds the qualifier `qual` = `val` to the feature `sfp`.
//  - x_InitId is always run first on `seq_id1`.
//  - A blank qualifier name is ignored.
//  - With a non-empty value, an unrecognised qualifier is reported and, under
//    a condition missing from the listing, stored as a generic GBQual.
//  - With an empty value, only keys found in sc_SingleKeys are added.
// NOTE(review): the signature line and two condition lines are missing from
// this listing.
3512  x_InitId(seq_id1, flags);
3513 
3514  if (NStr::IsBlank(qual)) {
3515  return;
3516  }
3517 
3518  if (!val.empty ()) { // Should probably use NStr::IsBlank()
3519  if (! x_AddQualifierToFeature (sfp, feat_name, qual, val, flags)) {
3520  // unrecognized qualifier key
3522  ERR_POST_X (5, Warning << "Unrecognized qualifier '" << qual << "'");
3523  }
3525  x_AddGBQualToFeature (sfp, qual, val);
3526  }
3527  }
3528  }
3529  else { // empty val
3530  // check for the few qualifiers that do not need a value
3531  auto s_iter = sc_SingleKeys.find (qual.c_str ());
3532  if (s_iter != sc_SingleKeys.end ()) {
3533  x_AddQualifierToFeature (sfp, feat_name, qual, val, flags);
3534  }
3535  }
3536  }
3537 
3538  // static
3540  const CTempString& line_arg,
3541  CTempStringEx& out_seqid,
3542  CTempStringEx& out_annotname )
3543  {
// Recognises a table header line of the form ">Feature<extra> <seqid> <annotname>".
// Returns false, with both outputs cleared, unless the line starts with ">"
// followed by "Feature" (case-insensitive).  On success, out_seqid and
// out_annotname receive the two parts split on blank or tab.
// NOTE(review): the signature line and a few statements (presumably
// whitespace trimming of `line`) are missing from this listing.
3544  out_seqid.clear();
3545  out_annotname.clear();
3546 
3547  // copy the line_arg because we can't edit line_arg itself
3548  CTempString line = line_arg;
3549 
3550  // handle ">"
3552  if( ! NStr::StartsWith(line, ">") ) {
3553  return false;
3554  }
3555  line = line.substr(1); // remove '>'
3556 
3557  // handle "Feature"
3559  const CTempString kFeatureStr("Feature");
3560  if( ! NStr::StartsWith(line, kFeatureStr, NStr::eNocase) ) {
3561  return false;
3562  }
3563  line = line.substr( kFeatureStr.length() ); // remove "Feature"
3564 
3565  // throw out any non-space characters at the beginning,
3566  // so we can, for example, handle ">Features" (note the "s")
3567  while( !line.empty() && !isspace(line[0]) ) {
3568  line = line.substr(1);
3569  }
3570 
3571  // extract seqid and annotname
3573  NStr::SplitInTwo(line, " \t", out_seqid, out_annotname, NStr::fSplit_Tokenize);
3574 
3575  return true;
3576  }
3577 
3578 
3579 // public access functions
3580 
3582  TReaderFlags fReaderFlags)
// Constructor taking reader flags only: the flags are passed to the
// CReaderBase base class and nothing else is initialised here.
// NOTE(review): the line naming this constructor is missing from the listing.
3583  : CReaderBase(fReaderFlags)
3584  {
3585  }
3586 
3588  ILineReader& lr,
3589  ILineErrorListener* pErrors) :
// Constructor bound to a line reader: the base class gets flags 0 and m_pImpl
// is created on `lr` with starting line number 0 and the listener `pErrors`.
// NOTE(review): the line naming this constructor is missing from the listing.
3590  CReaderBase(0),
3591  m_pImpl(new CFeatureTableReader_Imp(&lr, 0, pErrors))
3592  {}
3593 
3596  ILineReader &lr, ILineErrorListener *pMessageListener)
3597  {
// Reads one Seq-annot via ReadSeqAnnot() and returns it as a generic
// CRef<CSerialObject>.
// NOTE(review): the return-type and name lines are missing from this listing;
// presumably this is the ReadObject override — confirm.
3598  CRef<CSerialObject> object(
3599  ReadSeqAnnot( lr, pMessageListener ).ReleaseOrNull() );
3600  return object;
3601  }
3602 
3603 
3606  ILineReader &lr, ILineErrorListener *pMessageListener)
3607  {
// Reads one feature table from `lr` using the flags stored in m_iFlags.
// NOTE(review): the return-type and name lines are missing from this listing;
// presumably this is ReadSeqAnnot — confirm.
3608  return ReadSequinFeatureTable(lr, m_iFlags, pMessageListener);
3609  }
3610 
3611 
3613  CNcbiIstream& ifs,
3614  const string& seqid,
3615  const string& annotname,
3616  const TFlags flags,
3617  ILineErrorListener* pMessageListener,
3618  ITableFilter *filter
3619  )
3620  {
// Stream overload with explicit seqid and annotname: wraps `ifs` in a
// CStreamLineReader and delegates to the line-reader overload with the same
// arguments.
// NOTE(review): the signature line is missing from this listing.
3621  CStreamLineReader reader(ifs);
3622  return ReadSequinFeatureTable(reader, seqid, annotname, flags, pMessageListener, filter);
3623  }
3624 
3626  ILineReader& reader,
3627  const string& seqid,
3628  const string& annotname,
3629  const TFlags flags,
3630  ILineErrorListener* pMessageListener,
3631  ITableFilter *filter
3632  )
3633  {
// Line-reader overload with explicit seqid and annotname: builds a temporary
// CFeatureTableReader_Imp on `reader` (starting line number 0) and lets it
// read the table.
// NOTE(review): the signature line is missing from this listing.
3634  // just read features from 5-column table
3635  CFeatureTableReader_Imp impl(&reader, 0, pMessageListener);
3636  return impl.ReadSequinFeatureTable(seqid, annotname, flags, filter);
3637  }
3638 
3640  CFeatureTableReader_Imp& reader,
3641  const CTempString& seqid,
3642  const CTempString& annot_name,
3643  TFlags flags,
3644  ITableFilter* filter) {
// Thin forwarder: reads one table through an existing implementation object
// using an already-known seqid and annotation name.
// NOTE(review): the signature line is missing from this listing.
3645  return reader.ReadSequinFeatureTable(seqid, annot_name, flags, filter);
3646  }
3647 
3648 
3650  CNcbiIstream& ifs,
3651  const TFlags flags,
3652  ILineErrorListener* pMessageListener,
3653  ITableFilter *filter
3654  )
3655  {
// Stream overload without an explicit seqid: wraps `ifs` in a
// CStreamLineReader and delegates to the matching line-reader overload.
// NOTE(review): the signature line is missing from this listing.
3656  CStreamLineReader reader(ifs);
3657  return ReadSequinFeatureTable(reader, flags, pMessageListener, filter);
3658  }
3659 
3660 
3662  CFeatureTableReader_Imp& reader,
3663  const TFlags flags,
3664  ITableFilter* filter,
3665  const string& seqid_prefix)
3666  {
// Scans forward to the next ">Feature" header line, takes the seqid and
// annotation name from it, optionally applies `seqid_prefix`, and then reads
// the table body.  Returns a null CRef when `reader` has no line reader.
// NOTE(review): the signature line and the head of the call made after a
// header is found are missing from this listing.
3667  auto pLineReader = reader.GetLineReaderPtr();
3668  if (!pLineReader) {
3669  return CRef<CSeq_annot>();
3670  }
3671 
3672 
3673  CTempStringEx orig_seqid, annotname;
3674  // first look for >Feature line, extract seqid and optional annotname
// The loop keeps going until a header yields a non-empty seqid or the input ends.
3675  while (orig_seqid.empty () && !pLineReader->AtEOF() ) {
3676  CTempString line = *++(*pLineReader);
3677  if( ParseInitialFeatureLine(line, orig_seqid, annotname) ) {
3679  static_cast<unsigned>(pLineReader->GetLineNumber()),
3680  reader.GetErrorListenerPtr());
3681  }
3682  }
3683 
// temp_seqid owns the prefixed text; orig_seqid is re-pointed at it below, so
// it must stay alive until the final call returns.
3684  string temp_seqid;
3685  if (seqid_prefix.empty()) {
3686  //seqid = orig_seqid;
3687  } else {
// The prefix is applied to ids without '|' and to "lcl|" ids (after the
// "lcl|" tag is removed).
// NOTE(review): for any other id containing '|', temp_seqid stays empty and
// the assignment below replaces orig_seqid with an empty string — confirm
// that discarding the parsed id is intended.
3688  if (orig_seqid.find('|') == string::npos)
3689  temp_seqid = seqid_prefix + orig_seqid;
3690  else
3691  if (NStr::StartsWith(orig_seqid, "lcl|"))
3692  {
3693  temp_seqid = seqid_prefix + orig_seqid.substr(4);
3694  }
3695  orig_seqid = temp_seqid;
3696  }
3697  return x_ReadFeatureTable(reader, orig_seqid, annotname, flags, filter);
3698  }
3699 
3700 
3702  ILineReader& reader,
3703  const TFlags flags,
3704  ILineErrorListener* pMessageListener,
3705  ITableFilter* pFilter,
3706  const string& seqid_prefix
3707  )
3708  {
// Line-reader overload that finds the seqid in the table header: builds a
// temporary implementation object on `reader` and delegates to
// x_ReadFeatureTable.
// NOTE(review): the signature line is missing from this listing.
3709  CFeatureTableReader_Imp ftable_reader(&reader, 0, pMessageListener);
3710  return x_ReadFeatureTable(ftable_reader, flags, pFilter, seqid_prefix);
3711  }
3712 
3713 
3715  const TFlags flags,
3716  ITableFilter* pFilter,
3717  const string& seqid_prefix
3718  )
3719  {
// Member overload: reads the next table through the implementation object
// held in m_pImpl.
// NOTE(review): the signature line is missing from this listing.
3720  return x_ReadFeatureTable(*m_pImpl, flags, pFilter, seqid_prefix);
3721  }
3722 
3723 
3725  CNcbiIstream& ifs,
3726  CSeq_entry& entry,
3727  const TFlags flags,
3728  ILineErrorListener* pMessageListener,
3729  ITableFilter *filter
3730  )
3731  {
// Stream overload: wraps `ifs` in a CStreamLineReader and delegates to the
// line-reader overload of ReadSequinFeatureTables.
// NOTE(review): the signature line is missing from this listing.
3732  CStreamLineReader reader(ifs);
3733  return ReadSequinFeatureTables(reader, entry, flags, pMessageListener, filter);
3734  }
3735 
3736  void
3738  const list<string>& stringFlags,
3739  TFlags& baseFlags)
3740  {
// Translates textual flag names from `stringFlags` into reader flag bits and
// applies them to `baseFlags` via CReaderBase::xAddStringFlagsWithMap, using
// the name-to-flag table below.
// NOTE(review): the line naming this function is missing from the listing.
3741  static const map<string, CFeature_table_reader::TReaderFlags> flagsMap = {
3742  { "KeepBadKey", CFeature_table_reader::fKeepBadKey},
3743  { "TranslateBadKey", CFeature_table_reader::fTranslateBadKey},
3744  { "IgnoreWebComments", CFeature_table_reader::fIgnoreWebComments},
3745  { "CreateGenesFromCDSs", CFeature_table_reader::fCreateGenesFromCDSs},
3746  { "CDSsMustBeInTheirGenes", CFeature_table_reader::fCDSsMustBeInTheirGenes},
3747  { "ReportDiscouragedKey", CFeature_table_reader::fReportDiscouragedKey},
3748  { "LeaveProteinIds", CFeature_table_reader::fLeaveProteinIds},
3749  { "AllIdsAsLocal", CFeature_table_reader::fAllIdsAsLocal},
3750  { "PreferGenbankId", CFeature_table_reader::fPreferGenbankId},
3751  { "SuppressBadKeyWarning", CFeature_table_reader::fSuppressBadKeyWarning},
3752  };
3753 
3754  return CReaderBase::xAddStringFlagsWithMap(stringFlags, flagsMap, baseFlags);
3755  };
3756 
3757 
3758 struct SCSeqidCompare
3759 {
3760  inline
3761  bool operator()(const CSeq_id* left, const CSeq_id* right) const
3762  {
3763  return *left < *right;
3764  };
3765 };
3766 
3768  ILineReader& reader,
3769  CSeq_entry& entry,
3770  const TFlags flags,
3771  ILineErrorListener* pMessageListener,
3772  ITableFilter *filter
3773  )
3774  {
// Reads every feature table available from `reader` and attaches each
// resulting annotation to `entry`:
//  - when `entry` is a single sequence, every annot goes onto that sequence;
//  - otherwise an annot with an empty ftable is dropped, and a non-empty one
//    goes onto the Bioseq whose id matches the id of its first feature's
//    location, or onto the set itself (with a warning) when none matches.
// NOTE(review): the signature line is missing from this listing.
3775  // let's use map to speedup matching on very large files, see SQD-1847
3776  map<const CSeq_id*, CRef<CBioseq>, SCSeqidCompare> seq_map;
3777 
// Index every id of every Bioseq in the entry.
3778  for (CTypeIterator<CBioseq> seqit(entry); seqit; ++seqit) {
3779  ITERATE (CBioseq::TId, seq_id, seqit->GetId()) {
3780  seq_map[seq_id->GetPointer()].Reset(&*seqit);
3781  }
3782  }
3783 
3784  CFeatureTableReader_Imp ftable_reader(&reader, 0, pMessageListener);
3785  while ( !reader.AtEOF() ) {
3786  auto annot = x_ReadFeatureTable(ftable_reader, flags, filter);
3787  //CRef<CSeq_annot> annot = ReadSequinFeatureTable(reader, flags, pMessageListener, filter);
3788  if (entry.IsSeq()) { // only one place to go
3789  entry.SetSeq().SetAnnot().push_back(annot);
3790  continue;
3791  }
3792  _ASSERT(annot->GetData().IsFtable());
3793  if (annot->GetData().GetFtable().empty()) {
3794  continue;
3795  }
3796  // otherwise, take the first feature, which should be representative
3797  const CSeq_feat& feat = *annot->GetData().GetFtable().front();
3798  const CSeq_id* feat_id = feat.GetLocation().GetId();
3799  CBioseq* seq = nullptr;
3800  _ASSERT(feat_id); // we expect a uniform sequence ID
// NOTE(review): if _ASSERT is compiled out (presumably in release builds), a
// null feat_id would be dereferenced by the map comparator and again in the
// warning below.  Also, operator[] inserts an empty entry for an id that is
// not found — confirm both are acceptable.
3801  seq = seq_map[feat_id].GetPointer();
3802  if (seq) { // found a match
3803  seq->SetAnnot().push_back(annot);
3804  } else { // just package on the set
3805  ERR_POST_X(6, Warning
3806  << "ReadSequinFeatureTables: unable to find match for "
3807  << feat_id->AsFastaString());
3808  entry.SetSet().SetAnnot().push_back(annot);
3809  }
3810  }
3811  }
3812 
3813 
3815  const string& feat,
3816  CSeq_loc& location,
3817  const TFlags flags,
3818  ILineErrorListener* pMessageListener,
3819  unsigned int line_number,
3820  string *seq_id,
3821  ITableFilter *filter
3822  )
3823  {
// Public wrapper: builds a temporary implementation object without a line
// reader (messages use `line_number`) and delegates to its CreateSeqFeat.
// A null `seq_id` is passed on as an empty string.
// NOTE(review): the signature line is missing from this listing.
3824  CFeatureTableReader_Imp impl(nullptr, line_number, pMessageListener);
3825  return impl.CreateSeqFeat (feat, location, flags, (seq_id ? *seq_id : string() ), filter);
3826  }
3827 
3828 
3830  CRef<CSeq_feat> sfp,
3831  const string& feat_name,
3832  const string& qual,
3833  const string& val,
3835  ILineErrorListener* pMessageListener,
3836  int line_number,
3837  const string &seq_id
3838  )
3839 
3840  {
// Public wrapper: builds a temporary implementation object without a line
// reader (messages use `line_number`) and delegates to its AddFeatQual.
// NOTE(review): the signature line and the line declaring the `flags`
// parameter are missing from this listing.
3841  CFeatureTableReader_Imp impl(nullptr, line_number, pMessageListener);
3842  impl.AddFeatQual (sfp, feat_name, qual, val, flags, seq_id) ;
3843  }
3844 
3845  bool
3847  const CTempString& line_arg,
3848  CTempStringEx& out_seqid,
3849  CTempStringEx& out_annotname )
3850  {
// Public wrapper: forwards to CFeatureTableReader_Imp::ParseInitialFeatureLine
// and returns its result unchanged.
// NOTE(review): the line naming this function is missing from the listing.
3851  return CFeatureTableReader_Imp::ParseInitialFeatureLine(line_arg, out_seqid, out_annotname);
3852  }
3853 
3854 
3856 
3857 END_objects_SCOPE
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
static unsigned int line_num
Definition: attributes.c:11
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:69
AutoPtr –.
Definition: ncbimisc.hpp:401
CAnnot_descr –.
Definition: Annot_descr.hpp:66
CAnnotdesc –.
Definition: Annotdesc.hpp:66
bool AddFeat(const CSeq_feat &new_cds)
CCdregion –.
Definition: Cdregion.hpp:66
Definition: Dbtag.hpp:53
bool x_AddQualifierToFeature(CRef< CSeq_feat > sfp, const string &feat_name, const string &qual, const string &val, const TFlags flags)
Definition: readfeat.cpp:2342
static void PutProgress(const CTempString &seq_id, const unsigned int line_number, ILineErrorListener *pListener)
Definition: readfeat.cpp:3091
CSeq_annot::C_Data::TFtable TFtable
Definition: readfeat.cpp:297
bool x_TryToParseOffset(const CTempString &sLine, Int4 &out_offset)
Definition: readfeat.cpp:865
void AddFeatQual(CRef< CSeq_feat > sfp, const string &feat_name, const string &qual, const string &val, const TFlags flags, const string &seq_id)
Definition: readfeat.cpp:3503
bool x_AddQualifierToImp(CRef< CSeq_feat > sfp, CSeqFeatData &sfdata, EQual qtype, const string &qual, const string &val)
Definition: readfeat.cpp:1826
static bool ParseInitialFeatureLine(const CTempString &line_arg, CTempStringEx &out_seqid, CTempStringEx &out_annotname)
Definition: readfeat.cpp:3539
bool x_AddNoteToFeature(CRef< CSeq_feat > sfp, const string &note)
Definition: readfeat.cpp:2302
CConstRef< CSeq_feat > TFeatConstRef
Definition: readfeat.cpp:418
void x_TokenizeStrict(const CTempString &line, vector< string > &out_tokens)
Definition: readfeat.cpp:1173
bool x_AddQualifierToGene(CSeqFeatData &sfdata, EQual qtype, const string &val)
Definition: readfeat.cpp:1292
ILineReader *const GetLineReaderPtr(void)
Definition: readfeat.cpp:334
string x_TrnaToAaString(const string &val)
Definition: readfeat.cpp:1496
bool x_AddIntervalToFeature(CTempString strFeatureName, CRef< CSeq_feat > &sfp, const SFeatLocInfo &loc_info)
Definition: readfeat.cpp:2792
unsigned int m_LineNumber
Definition: readfeat.cpp:501
void x_ProcessQualifier(const string &qual_name, const string &qual_val, const string &feat_name, CRef< CSeq_feat > feat, TFlags flags)
Definition: readfeat.cpp:3208
void x_InitId(const CTempString &seq_id, const TFlags flags)
Definition: readfeat.cpp:3478
multimap< CSeqFeatData::E_Choice, SFeatAndLineNum > TChoiceToFeatMap
Definition: readfeat.cpp:444
ILineReader * m_reader
Definition: readfeat.cpp:500
bool x_AddQualifierToRna(CRef< CSeq_feat > sfp, EQual qtype, const string &val)
Definition: readfeat.cpp:1665
void x_ResetFeat(CRef< CSeq_feat > &feat, bool &curr_feat_intervals_done)
Definition: readfeat.cpp:3108
CRef< CSeq_annot > ReadSequinFeatureTable(const CTempString &seqid, const CTempString &annotname, const TFlags flags, ITableFilter *filter)
Definition: readfeat.cpp:3252
bool x_ParseTrnaExtString(CTrna_ext &ext_trna, const string &str)
Definition: readfeat.cpp:1517
void x_CreateGenesFromCDSs(CRef< CSeq_annot > sap, TChoiceToFeatMap &choiceToFeatMap, const TFlags flags)
Definition: readfeat.cpp:2080
long x_StringToLongNoThrow(CTempString strToConvert, CTempString strFeatureName, CTempString strQualifierName, ILineError::EProblem eProblem=ILineError::eProblem_Unset)
Definition: readfeat.cpp:1620
ILineErrorListener * m_pMessageListener
Definition: readfeat.cpp:502
void x_FinishFeature(CRef< CSeq_feat > &feat, TFtable &ftable)
Definition: readfeat.cpp:3159
CFeatureTableReader_Imp(ILineReader *reader, unsigned int line_num, ILineErrorListener *pMessageListener)
Definition: readfeat.cpp:854
bool x_AddGBQualToFeature(CRef< CSeq_feat > sfp, const string &qual, const string &val)
Definition: readfeat.cpp:2043
bool x_IsWebComment(CTempString line)
Definition: readfeat.cpp:2739
CRef< CSeq_id > m_seq_id
Definition: readfeat.cpp:499
CFeature_table_reader::TFlags TFlags
Definition: readfeat.cpp:296
ILineErrorListener *const GetErrorListenerPtr(void)
Definition: readfeat.cpp:338
bool x_StringIsJustQuotes(const string &str)
Definition: readfeat.cpp:1428
void x_GetPointStrand(const CSeq_feat &feat, CSeq_interval::TStrand &strand) const
Definition: readfeat.cpp:3116
CRef< CSeq_feat > CreateSeqFeat(const string &feat, CSeq_loc &location, const TFlags flags, const string &seq_id, ITableFilter *filter)
Definition: readfeat.cpp:3455
SIZE_TYPE x_MatchingParenPos(const string &str, SIZE_TYPE open_paren_pos)
Definition: readfeat.cpp:1585
bool x_AddQualifierToCdregion(CRef< CSeq_feat > sfp, CSeqFeatData &sfdata, EQual qtype, const string &val)
Definition: readfeat.cpp:1332
void x_TokenizeLenient(const CTempString &line, vector< string > &out_tokens)
Definition: readfeat.cpp:1213
void x_UpdatePointStrand(CSeq_feat &feat, CSeq_interval::TStrand strand) const
Definition: readfeat.cpp:3133
unordered_set< string > m_ProcessedProteinIds
Definition: readfeat.cpp:504
bool x_ParseFeatureTableLine(const CTempString &line, SFeatLocInfo &loc_info, string &feat, string &qual, string &val, Int4 offset)
Definition: readfeat.cpp:911
bool x_AddQualifierToBioSrc(CSeqFeatData &sfdata, const string &feat_name, EOrgRef rtype, const string &val)
Definition: readfeat.cpp:1941
CFeatureTableReader_Imp & operator=(const CFeatureTableReader_Imp &value)
bool x_AddCodons(const string &val, CTrna_ext &trna_ext) const
Definition: readfeat.cpp:1792
bool x_SetupSeqFeat(CRef< CSeq_feat > sfp, const string &feat, const TFlags flags, ITableFilter *filter)
Definition: readfeat.cpp:2892
unordered_set< string > m_ProcessedTranscriptIds
Definition: readfeat.cpp:503
void x_ProcessMsg(int line_num, ILineError::EProblem eProblem, EDiagSev eSeverity, const std::string &strFeatureName=kEmptyStr, const std::string &strQualifierName=kEmptyStr, const std::string &strQualifierValue=kEmptyStr, const std::string &strErrorMessage=kEmptyStr, const ILineError::TVecOfLines &vecOfOtherLines=ILineError::TVecOfLines())
void x_ProcessMsg(ILineError::EProblem eProblem, EDiagSev eSeverity, const std::string &strFeatureName=kEmptyStr, const std::string &strQualifierName=kEmptyStr, const std::string &strQualifierValue=kEmptyStr, const std::string &strErrorMessage=kEmptyStr, const ILineError::TVecOfLines &vecOfOtherLines=ILineError::TVecOfLines())
CFeatureTableReader_Imp(const CFeatureTableReader_Imp &value)
CFeature_table_reader(TReaderFlags fReaderFlags=0)
Definition: readfeat.cpp:3581
CRef< CSerialObject > ReadObject(ILineReader &lr, ILineErrorListener *pErrors) override
Read an object from a given line reader, render it as the most appropriate Genbank object.
Definition: readfeat.cpp:3595
long TFlags
binary OR of EFlags
Definition: readfeat.hpp:79
static bool ParseInitialFeatureLine(const CTempString &line_arg, CTempStringEx &out_seqid, CTempStringEx &out_annotname)
If line_arg is a feature line (e.g.
Definition: readfeat.cpp:3846
static void ReadSequinFeatureTables(ILineReader &reader, CSeq_entry &entry, const TFlags flags=0, ILineErrorListener *pMessageListener=nullptr, ITableFilter *filter=nullptr)
Definition: readfeat.cpp:3767
@ fSuppressBadKeyWarning
= 0x400 (Suppress 'bad key' errors; Not recommended.)
Definition: readfeat.hpp:77
@ fReportDiscouragedKey
= 0x40 (Report discouraged keys into the error container)
Definition: readfeat.hpp:73
@ fKeepBadKey
= 0x02 (As much as possible, try to use bad keys as if they were acceptable)
Definition: readfeat.hpp:68
@ fIgnoreWebComments
= 0x08 (ignore web comment lines such as lines that start with " INFO:", or consist of many equals si...
Definition: readfeat.hpp:70
@ fAllIdsAsLocal
= 0x100 (Do not attempt to parse accessions)
Definition: readfeat.hpp:75
@ fLeaveProteinIds
= 0x80 (Leave all protein_id as a qualifiers)
Definition: readfeat.hpp:74
@ fCreateGenesFromCDSs
= 0x10 (If a CDS has a gene xref, create a gene with the same intervals if one doesn't already exist....
Definition: readfeat.hpp:71
@ fPreferGenbankId
= 0x200 (Prefer Genbank accession ids)
Definition: readfeat.hpp:76
@ fTranslateBadKey
= 0x04 (yields misc_feature /standard_name="...")
Definition: readfeat.hpp:69
@ fCDSsMustBeInTheirGenes
= 0x20 (If a CDS has a gene xref, it *must* be inside of that gene)
Definition: readfeat.hpp:72
CRef< CSeq_annot > ReadSeqAnnot(ILineReader &lr, ILineErrorListener *pErrors) override
Read an object from a given line reader, render it as a single Seq-annot, if possible.
Definition: readfeat.cpp:3605
unique_ptr< CFeatureTableReader_Imp > m_pImpl
Definition: readfeat.hpp:191
static CRef< CSeq_annot > x_ReadFeatureTable(CFeatureTableReader_Imp &reader, const CTempString &seqid, const CTempString &annot_name, const TFlags flags, ITableFilter *filter)
Definition: readfeat.cpp:3639
static CRef< CSeq_feat > CreateSeqFeat(const string &feat, CSeq_loc &location, const TFlags flags=0, ILineErrorListener *pMessageListener=nullptr, unsigned int line=0, std::string *seq_id=nullptr, ITableFilter *filter=nullptr)
Definition: readfeat.cpp:3814
static void AddFeatQual(CRef< CSeq_feat > sfp, const string &feat_name, const string &qual, const string &val, const TFlags flags=0, ILineErrorListener *pMessageListener=nullptr, int line=0, const string &seq_id=std::string())
Definition: readfeat.cpp:3829
static void AddStringFlags(const list< string > &stringFlags, TFlags &baseFlags)
Definition: readfeat.cpp:3737
CRef< CSeq_annot > ReadSequinFeatureTable(const TFlags flags=0, ITableFilter *filter=nullptr, const string &seqid_prefix=kEmptyStr)
Definition: readfeat.cpp:3714
@Gb_qual.hpp User-defined methods of the data storage class.
Definition: Gb_qual.hpp:61
static const CTrans_table & GetTransTable(int id)
static int CodonToIndex(char base1, char base2, char base3)
*** Import *********************************************** * * Features imported from other databases...
Definition: Imp_feat_.hpp:77
static void GetPrefixAndRemainder(const string &inference, string &prefix, string &remainder)
Definition: Gb_qual.cpp:381
bool operator()(char c)
Definition: readfeat.cpp:1210
bool operator()(char c)
Definition: readfeat.cpp:1205
static CObjReaderLineException * Create(EDiagSev eSeverity, unsigned int uLine, const std::string &strMessage, EProblem eProblem=eProblem_GeneralParsingError, const std::string &strSeqId=string(""), const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), CObjReaderLineException::EErrCode eErrCode=eFormat, const TVecOfLines &vecOfOtherLines=TVecOfLines())
Please use this instead of the constructor because the ctor is protected.
Definition: line_error.cpp:194
@OrgMod.hpp User-defined methods of the data storage class.
Definition: OrgMod.hpp:54
Definition: Pub.hpp:56
CRNA_qual –.
Definition: RNA_qual.hpp:66
@RNA_ref.hpp User-defined methods of the data storage class.
Definition: RNA_ref.hpp:54
static void AddGeneOntologyTerm(CSeq_feat &feature, const CTempString &qual, const CTempString &val)
Definition: read_util.cpp:296
Defines and provides stubs for a general interface to a variety of file readers.
Definition: reader_base.hpp:63
long TReaderFlags
Definition: reader_base.hpp:84
TReaderFlags m_iFlags
static void xAddStringFlagsWithMap(const list< string > &stringFlags, const map< string, TReaderFlags > flagMap, TReaderFlags &baseFlags)
void SetProt(TProt &v)
void SetRegion(const TRegion &v)
void SetBiosrc(TBiosrc &v)
static bool IsDiscouragedQual(EQualifier qual)
EQualifier
List of available qualifiers for feature keys.
void SetBond(const TBond &v)
static bool CanHaveGene(ESubtype subtype)
void SetSite(const TSite &v)
static const CSiteList * GetSiteList()
void SetPub(TPub &v)
ESubtype GetSubtype(void) const
void SetImp(TImp &v)
static bool IsDiscouragedSubtype(ESubtype subtype)
static E_Choice GetTypeFromSubtype(ESubtype subtype)
void SetRna(TRna &v)
void SetCdregion(TCdregion &v)
@ eSubtype_bad
These no longer need to match the FEATDEF values in the C toolkit's objfdef.h.
@ eSubtype_transit_peptide_aa
static EQualifier GetQualifierType(CTempString qual)
convert qual string to enumerated value
static const CBondList * GetBondList()
static CTempString GetQualifierAsString(EQualifier qual)
Convert a qualifier from an enumerated value to a string representation or empty if not found.
static ESubtype SubtypeNameToValue(CTempString sName)
Turn a string into its ESubtype which is NOT necessarily related to the identifier of the enum.
static bool IsRegulatory(ESubtype subtype)
void SetGene(TGene &v)
static const vector< string > & GetRegulatoryClassList()
CSeqIdException –.
Definition: Seq_id.hpp:969
bool IsFtable(void) const
Definition: Seq_annot.cpp:177
Definition: Seq_entry.hpp:56
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
const CGene_ref * GetGeneXref(void) const
See related function in util/feature.hpp.
Definition: Seq_feat.cpp:181
void SetGeneXref(CGene_ref &value)
Definition: Seq_feat.cpp:192
void SetProtXref(CProt_ref &value)
Definition: Seq_feat.cpp:233
void SetPartialStart(bool val, ESeqLocExtremes ext)
void SetPartialStop(bool val, ESeqLocExtremes ext)
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
void SetRightOf(bool val)
Definition: Seq_point.cpp:193
void SetPartialStart(bool val, ESeqLocExtremes ext)
Definition: Seq_point.cpp:100
void SetPartialStop(bool val, ESeqLocExtremes ext)
Definition: Seq_point.cpp:116
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
Definition: static_map.hpp:105
TBase::const_iterator const_iterator
Definition: static_map.hpp:109
Simple implementation of ILineReader for i(o)streams.
CStringException –.
Definition: ncbistr.hpp:4505
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
aa this carries
Definition: Trna_ext_.hpp:96
CUser_object & AddField(const string &label, const string &value, EParseField parse=eParse_String)
add a data field to the user object that holds a given value
@ eParse_Number
Parse a real or integer number, otherwise string.
Definition: User_object.hpp:62
virtual bool PutError(const ILineError &)=0
Store error in the container, and return true if error was stored fine, and return false if the calle...
virtual void PutProgress(const string &sMessage, const Uint8 iNumDone=0, const Uint8 iNumTotal=0)=0
This is used for processing progress messages.
virtual EDiagSev Severity(void) const
Definition: line_error.hpp:370
@ eProblem_InvalidQualifier
Definition: line_error.hpp:92
@ eProblem_QualifierBadValue
Definition: line_error.hpp:68
@ eProblem_NumericQualifierValueIsNotANumber
Definition: line_error.hpp:61
@ eProblem_InternalPartialsInFeatLocation
Definition: line_error.hpp:72
@ eProblem_FeatMustBeInXrefdGene
Definition: line_error.hpp:73
@ eProblem_UnrecognizedFeatureName
Definition: line_error.hpp:58
@ eProblem_FeatureNameNotAllowed
Definition: line_error.hpp:62
@ eProblem_DuplicateIDs
Definition: line_error.hpp:94
@ eProblem_IncompleteFeature
Definition: line_error.hpp:65
@ eProblem_QualifierWithoutFeature
Definition: line_error.hpp:64
@ eProblem_FeatureBadStartAndOrStop
Definition: line_error.hpp:66
@ eProblem_NumericQualifierValueHasExtraTrailingCharacters
Definition: line_error.hpp:60
@ eProblem_UnrecognizedSquareBracketCommand
Definition: line_error.hpp:75
@ eProblem_UnrecognizedQualifierName
Definition: line_error.hpp:59
@ eProblem_BadFeatureInterval
Definition: line_error.hpp:67
@ eProblem_DiscouragedFeatureName
Definition: line_error.hpp:90
@ eProblem_NoFeatureProvidedOnIntervals
Definition: line_error.hpp:63
@ eProblem_DiscouragedQualifierName
Definition: line_error.hpp:91
virtual const std::string & ErrorMessage(void) const
Definition: line_error.hpp:174
vector< unsigned int > TVecOfLines
Definition: line_error.hpp:128
virtual EProblem Problem(void) const =0
Abstract base class for lightweight line-by-line reading.
Definition: line_reader.hpp:54
Use to give a feature filter to CFeature_table_reader.
EAction
How a given feature name should be handled.
@ eAction_Okay
Just accept the feat.
@ eAction_Disallowed
Do not accept the feat and give message eProblem_FeatureNameNotAllowed.
virtual EAction GetFeatAction(const string &feature_name) const =0
Returns how we should treat the given feature name.
Definition: map.hpp:338
const_iterator_pair equal_range(const key_type &key) const
Definition: map.hpp:296
iterator insert(const value_type &val)
Definition: map.hpp:305
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator begin() const
Definition: set.hpp:135
const_iterator_pair equal_range(const key_type &key) const
Definition: set.hpp:140
bool empty() const
Definition: set.hpp:133
const_iterator end() const
Definition: set.hpp:136
void swap(this_type &m)
Definition: set.hpp:102
static const char location[]
Definition: config.c:97
char value[7]
Definition: config.c:431
Include a standard set of the NCBI C++ Toolkit most basic headers.
static uch flags
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
const TResidue codons[4][4]
Definition: gnomon_seq.cpp:76
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define ENTREZ_ID_FROM(T, value)
Definition: ncbimisc.hpp:1098
string
Definition: cgiapp.hpp:687
#define ERR_POST_X(err_subcode, message)
Error posting with default error code and given error subcode.
Definition: ncbidiag.hpp:550
EDiagSev
Severity level for the posted diagnostics.
Definition: ncbidiag.hpp:650
@ eDiag_Error
Error message.
Definition: ncbidiag.hpp:653
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
virtual void UngetLine(void)=0
Unget current line, which must be valid.
virtual Uint8 GetLineNumber(void) const =0
Returns the current line number (counting from 1, not 0).
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
const string AsFastaString(void) const
Definition: Seq_id.cpp:2265
static SIZE_TYPE ParseIDs(CBioseq::TId &ids, const CTempString &s, TParseFlags flags=fParse_Default)
Parse a string representing one or more Seq-ids, appending the results to IDS.
Definition: Seq_id.cpp:2612
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
Definition: Seq_id.cpp:2039
@ fParse_PartialOK
Warn rather than throwing an exception when a FASTA-style ID set contains unparsable portions,...
Definition: Seq_id.hpp:80
@ fParse_AnyLocal
Treat otherwise unidentified strings as local accessions as long as they don't resemble FASTA-style I...
Definition: Seq_id.hpp:90
@ fParse_Default
By default in ParseIDs and IsValid, allow raw parsable non-numeric accessions and plausible local acc...
Definition: Seq_id.hpp:102
@ fParse_ValidLocal
Treat otherwise unidentified strings as raw accessions, provided that they pass rudimentary validatio...
Definition: Seq_id.hpp:87
@ eFasta
Tagged ID in NCBI's traditional FASTA style.
Definition: Seq_id.hpp:575
bool IsPartialStart(ESeqLocExtremes ext) const
check start or stop of location for e_Lim fuzz
Definition: Seq_loc.cpp:3222
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
void SetPnt(TPnt &v)
Definition: Seq_loc.hpp:985
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
CRef< CSeq_loc > Subtract(const CSeq_loc &other, TOpFlags flags, ISynonymMapper *syn_mapper, ILengthGetter *len_getter) const
Subtract seq-loc from this, merge/sort resulting ranges depending on flags.
Definition: Seq_loc.cpp:5087
const CSeq_loc & GetEmbeddingSeq_loc(void) const
Get the nearest seq-loc containing the current range.
Definition: Seq_loc.cpp:2573
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
void SetNull(void)
Override all setters to incorporate cache invalidation.
Definition: Seq_loc.hpp:960
bool IsPartialStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:3251
@ fSortAndMerge_All
Definition: Seq_loc.hpp:334
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
TObjectType * GetPointerOrNull(void) const THROWS_NONE
Get pointer value.
Definition: ncbiobj.hpp:1672
bool IsNull(void) const THROWS_NONE
Check if pointer is null – same effect as Empty().
Definition: ncbiobj.hpp:735
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
unsigned char Uchar
Alias for unsigned char.
Definition: ncbitype.h:95
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
const_iterator end() const
Return an iterator to the string's ending position (one past the end of the represented sequence)
Definition: tempstr.hpp:306
#define kEmptyStr
Definition: ncbistr.hpp:123
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2989
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5429
CTempStringEx substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
Definition: tempstr.hpp:1010
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3197
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5083
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
Definition: tempstr.hpp:334
void clear(void)
Clear value to an empty string.
Definition: tempstr.hpp:1003
static long StringToLong(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to long.
Definition: ncbistr.cpp:653
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5411
size_t size_type
Definition: tempstr.hpp:70
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3550
size_type length(void) const
Return the length of the represented array.
Definition: tempstr.hpp:320
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
Definition: tempstr.hpp:776
size_type find_first_not_of(const CTempString match, size_type pos=0) const
Find the first occurrence of any character not in the matching string within the current string,...
Definition: tempstr.hpp:553
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5352
size_type find(const CTempString match, size_type pos=0) const
Find the first instance of the entire matching string within the current string, beginning at an opti...
Definition: tempstr.hpp:655
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3182
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:327
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
static const size_type npos
Definition: tempstr.hpp:72
const_iterator begin() const
Return an iterator to the string's starting position.
Definition: tempstr.hpp:299
@ fAllowTrailingSymbols
Ignore trailing non-numerics characters.
Definition: ncbistr.hpp:298
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
@ eTrunc_End
Truncate trailing spaces only.
Definition: ncbistr.hpp:2241
@ eTrunc_Begin
Truncate leading spaces only.
Definition: ncbistr.hpp:2240
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
void SetSubtype(TSubtype value)
Assign a value to Subtype data member.
Definition: SubSource_.hpp:319
list< CRef< CSubSource > > TSubtype
Definition: BioSource_.hpp:145
void SetGenome(TGenome value)
Assign a value to Genome data member.
Definition: BioSource_.hpp:428
void SetOrg(TOrg &value)
Assign a value to Org data member.
Definition: BioSource_.cpp:108
void SetName(const TName &value)
Assign a value to Name data member.
Definition: SubSource_.hpp:359
EGenome
biological context
Definition: BioSource_.hpp:97
TSubtype & SetSubtype(void)
Assign a value to Subtype data member.
Definition: BioSource_.hpp:545
@ eSubtype_collection_date
DD-MMM-YYYY format.
Definition: SubSource_.hpp:114
@ eSubtype_fwd_primer_seq
sequence (possibly more than one; semicolon-separated)
Definition: SubSource_.hpp:117
@ eSubtype_lat_lon
+/- decimal degrees
Definition: SubSource_.hpp:113
@ eSubtype_collected_by
name of person who collected the sample
Definition: SubSource_.hpp:115
@ eSubtype_rev_primer_seq
sequence (possibly more than one; semicolon-separated)
Definition: SubSource_.hpp:118
@ eSubtype_environmental_sample
Definition: SubSource_.hpp:111
@ eSubtype_endogenous_virus_name
Definition: SubSource_.hpp:109
@ eSubtype_identified_by
name of person who identified the sample
Definition: SubSource_.hpp:116
TSyn & SetSyn(void)
Assign a value to Syn data member.
Definition: Gene_ref_.hpp:774
void SetAllele(const TAllele &value)
Assign a value to Allele data member.
Definition: Gene_ref_.hpp:561
bool IsSetLocus_tag(void) const
systematic gene name (e.g., MI0001, ORF0069) Check if a value has been assigned to Locus_tag data mem...
Definition: Gene_ref_.hpp:781
bool IsSetLocus(void) const
Official gene symbol Check if a value has been assigned to Locus data member.
Definition: Gene_ref_.hpp:493
void SetLocus(const TLocus &value)
Assign a value to Locus data member.
Definition: Gene_ref_.hpp:514
void SetLocus_tag(const TLocus_tag &value)
Assign a value to Locus_tag data member.
Definition: Gene_ref_.hpp:802
void SetMaploc(const TMaploc &value)
Assign a value to Maploc data member.
Definition: Gene_ref_.hpp:655
const TLocus_tag & GetLocus_tag(void) const
Get the Locus_tag member data.
Definition: Gene_ref_.hpp:793
list< string > TSyn
Definition: Gene_ref_.hpp:102
const TLocus & GetLocus(void) const
Get the Locus member data.
Definition: Gene_ref_.hpp:505
void SetDesc(const TDesc &value)
Assign a value to Desc data member.
Definition: Gene_ref_.hpp:608
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
void SetTag(TTag &value)
Assign a value to Tag data member.
Definition: Dbtag_.cpp:66
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
TStr & SetStr(void)
Select the variant.
Definition: Object_id_.hpp:304
void SetType(TType &value)
Assign a value to Type data member.
void SetDb(const TDb &value)
Assign a value to Db data member.
Definition: Dbtag_.hpp:229
TId & SetId(void)
Select the variant.
Definition: Object_id_.hpp:277
void SetDiv(const TDiv &value)
Assign a value to Div data member.
Definition: OrgName_.hpp:1014
void SetSubtype(TSubtype value)
Assign a value to Subtype data member.
Definition: OrgMod_.hpp:316
void SetTaxname(const TTaxname &value)
Assign a value to Taxname data member.
Definition: Org_ref_.hpp:381
list< CRef< COrgMod > > TMod
Definition: OrgName_.hpp:332
void SetGcode(TGcode value)
Assign a value to Gcode data member.
Definition: OrgName_.hpp:927
void SetMgcode(TMgcode value)
Assign a value to Mgcode data member.
Definition: OrgName_.hpp:974
TMod & SetMod(void)
Assign a value to Mod data member.
Definition: OrgName_.hpp:845
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
Definition: Org_ref_.cpp:87
void SetSubname(const TSubname &value)
Assign a value to Subname data member.
Definition: OrgMod_.hpp:356
void SetLineage(const TLineage &value)
Assign a value to Lineage data member.
Definition: OrgName_.hpp:873
@ eSubtype_biotype
Definition: OrgMod_.hpp:97
@ eSubtype_subgroup
Definition: OrgMod_.hpp:99
@ eSubtype_gb_acronym
used by taxonomy database
Definition: OrgMod_.hpp:115
@ eSubtype_gb_synonym
used by taxonomy database
Definition: OrgMod_.hpp:117
@ eSubtype_substrain
Definition: