NCBI C++ ToolKit
source_mod_parser.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: source_mod_parser.cpp 101909 2024-03-01 12:11:21Z stakhovv $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Aaron Ucko, Jonathan Kans, Vasuki Gobi, Michael Kornbluh
27 *
28 * File Description:
29 * Parser for source modifiers, as found in (Sequin-targeted) FASTA files.
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include <sstream>
37 
38 #include <objtools/readers/source_mod_parser.hpp>
40 
41 #include <corelib/ncbiutil.hpp>
42 #include <util/static_map.hpp>
43 #include <serial/enumvalues.hpp>
44 
49 #include <objects/pub/Pub.hpp>
51 #include <objects/seq/Bioseq.hpp>
52 #include <objects/seq/Pubdesc.hpp>
54 #include <objects/seq/Seq_data.hpp>
56 #include <objects/seq/Seq_inst.hpp>
57 #include <objects/seq/Seqdesc.hpp>
69 
71 
74 
75 namespace
76 {
// Unary predicate over CRef<CSubSource>: operator() returns true when the
// referenced CSubSource has its subtype set and that subtype equals the one
// captured by the constructor (m_st).
// NOTE(review): the declaration of m_st is not visible in this listing;
// presumably a CSubSource::TSubtype member in the private section -- confirm.
77  class equal_subtype
78  {
79  public:
80  equal_subtype(CSubSource::TSubtype st) : m_st(st){};
81  bool operator()(const CRef<CSubSource>& st) const
82  {
    // IsSetSubtype() is tested first so GetSubtype() is only read when set.
83  return st->IsSetSubtype() && (st->GetSubtype() == m_st);
84  }
85  private:
87  };
88 
89 #ifdef STATIC_SMOD
90 # error "STATIC_SMOD already defined"
91 #endif
92 
93  // The macro makes sure that the var's name matches its key.
94  // Due to kKeyCanonicalizationTable, it's okay to use '_' for '-'
95  // because it will match both.
96 
97 
// STATIC_SMOD(key) expands to three constants in the enclosing (anonymous)
// namespace:
//   s_Mod_s_<key> : the key token stringized into a char array,
//   s_Mod_n_<key> : the length of that string, excluding the terminating NUL,
//   s_Mod_<key>   : a CTempString constructed from that array and length.
// The expansion does not end in ';' -- each use site supplies its own.
98 #define STATIC_SMOD(key_str) \
99  const char s_Mod_s_##key_str[] = #key_str; \
100  const size_t s_Mod_n_##key_str = sizeof(#key_str)-1; \
101  const CTempString s_Mod_##key_str(s_Mod_s_##key_str, s_Mod_n_##key_str)
102 
103 
104  // For CBioseq
105  STATIC_SMOD(topology);
106  STATIC_SMOD(top);
107  STATIC_SMOD(molecule);
108  STATIC_SMOD(mol);
109  STATIC_SMOD(moltype);
110  STATIC_SMOD(mol_type);
111  STATIC_SMOD(strand);
112  STATIC_SMOD(comment);
113 
114  // For CBioSource
115  STATIC_SMOD(organism);
116  STATIC_SMOD(org);
117  STATIC_SMOD(taxname);
118  STATIC_SMOD(taxid);
121  STATIC_SMOD(sub_clone);
122  STATIC_SMOD(lat_long);
123  STATIC_SMOD(latitude_longitude);
124  STATIC_SMOD(fwd_primer_seq);
125  STATIC_SMOD(fwd_pcr_primer_seq);
126  STATIC_SMOD(rev_primer_seq);
127  STATIC_SMOD(rev_pcr_primer_seq);
128  STATIC_SMOD(fwd_primer_name);
129  STATIC_SMOD(fwd_pcr_primer_name);
130  STATIC_SMOD(rev_primer_name);
131  STATIC_SMOD(rev_pcr_primer_name);
132  STATIC_SMOD(dbxref);
133  STATIC_SMOD(db_xref);
134  STATIC_SMOD(division);
135  STATIC_SMOD(div);
136  STATIC_SMOD(lineage);
137  STATIC_SMOD(gcode);
138  STATIC_SMOD(mgcode);
139  STATIC_SMOD(pgcode);
140  STATIC_SMOD(note);
141  STATIC_SMOD(notes);
142  STATIC_SMOD(focus);
143 
144  // For CMolInfo
145  STATIC_SMOD(tech);
146  STATIC_SMOD(completeness);
147  STATIC_SMOD(completedness);
148 
149  // For CGene_ref
150  STATIC_SMOD(gene);
151  STATIC_SMOD(allele);
152  STATIC_SMOD(gene_syn);
153  STATIC_SMOD(gene_synonym);
154  STATIC_SMOD(locus_tag);
155 
156  // For CProt_ref
157  STATIC_SMOD(protein);
158  STATIC_SMOD(prot);
159  STATIC_SMOD(prot_desc);
160  STATIC_SMOD(protein_desc);
161  STATIC_SMOD(EC_number);
162  STATIC_SMOD(activity);
163  STATIC_SMOD(function);
164 
165  // For CGB_block
166  STATIC_SMOD(secondary_accession);
167  STATIC_SMOD(secondary_accessions);
168  STATIC_SMOD(keyword);
169  STATIC_SMOD(keywords);
170 
171  STATIC_SMOD(biosample);
172  STATIC_SMOD(bioproject);
173  // For TPA Mods (CUser_object)
174  STATIC_SMOD(primary);
175  STATIC_SMOD(primary_accessions);
176  // For SRA (Sequence Read Archive) CUser_object
177  STATIC_SMOD(SRA);
178 
179  // For Genome Project DB Mods (CUser_object)
180  STATIC_SMOD(project);
181  STATIC_SMOD(projects);
182 
183  // For Pub Mods (CSeq_descr)
184  STATIC_SMOD(PubMed);
185  STATIC_SMOD(PMID);
186 
187 
188 #undef STATIC_SMOD
189 
191 
192  // Builds a new map from SMod keys to values of the enum type TEnum.
  //
  // Entries come from two sources:
  //  1. name/value pairs visited through `it` (it->first is the name,
  //     it->second the numeric value), except names found in skip_enum_names;
  //     NOTE(review): the loop header is not visible in this listing --
  //     presumably it walks the values of `etv`; confirm.
  //  2. every pair in extra_enum_names_to_vals (aliases not in the enum).
  //
  // Both insertions use emplace(); a false `.second` in the result means the
  // key was already present in the map. NOTE(review): the statement that
  // reports that condition is only partially visible here.
  //
  // Returns a raw pointer obtained via unique_ptr::release(), so ownership of
  // the map passes to the caller.
193  template<typename TEnum,
194  typename TSModEnumMap = map<CSourceModParser::SMod, TEnum>,
195  typename TEnumNameToValMap = map<string, TEnum>>
196  TSModEnumMap * s_InitSmodToEnumMap(
197  const CEnumeratedTypeValues* etv,
198  // names to skip
199  const TSModNameSet & skip_enum_names,
200  // extra values to add that aren't in the enum
201  const TEnumNameToValMap & extra_enum_names_to_vals )
202  {
  // Held in a unique_ptr while the map is being filled; released on return.
203  unique_ptr<TSModEnumMap> smod_enum_map(new TSModEnumMap);
204 
206  const string & enum_name = it->first;
207  const TEnum enum_val = static_cast<TEnum>(it->second);
208  if( skip_enum_names.find(enum_name.c_str()) !=
209  skip_enum_names.end() )
210  {
211  // skip this tag
212  continue;
213  }
214  auto emplace_result =
215  smod_enum_map->emplace(
216  CSourceModParser::SMod(enum_name), enum_val);
217  // emplace must succeed
218  if( ! emplace_result.second) {
220  "s_InitSmodToEnumMap " << enum_name);
221  }
222  }
223 
  // Second pass: add the caller-supplied extra names.
224  for(auto extra_smod_to_enum : extra_enum_names_to_vals) {
225  auto emplace_result =
226  smod_enum_map->emplace(
227  CSourceModParser::SMod(extra_smod_to_enum.first),
228  extra_smod_to_enum.second);
229  // emplace must succeed
230  if( ! emplace_result.second) {
232  "s_InitSmodToEnumMap " << extra_smod_to_enum.first);
233  }
234  }
235 
236  return smod_enum_map.release();
237  }
238 
239  typedef map<CSourceModParser::SMod, COrgMod::ESubtype> TSModOrgSubtypeMap;
240 
241  TSModOrgSubtypeMap * s_InitSModOrgSubtypeMap(void)
242  {
243  const TSModNameSet kDeprecatedOrgSubtypes{
244  "dosage", "old-lineage", "old-name"};
245  const map<const char*, COrgMod::ESubtype> extra_smod_to_enum_names {
246  { "subspecies", COrgMod::eSubtype_sub_species },
247  { "host", COrgMod::eSubtype_nat_host },
248  { "specific-host", COrgMod::eSubtype_nat_host },
249  };
250 
251  return s_InitSmodToEnumMap<COrgMod::ESubtype>(
252  COrgMod::ENUM_METHOD_NAME(ESubtype)(),
253  kDeprecatedOrgSubtypes,
254  extra_smod_to_enum_names
255  );
256  }
257 
258  // The subtype SMods are loaded from the names of the enum
259  // and they map to ESubtype enum values so we can't just use STATIC_SMOD
260  CSafeStatic<TSModOrgSubtypeMap> kSModOrgSubtypeMap(s_InitSModOrgSubtypeMap,
261  nullptr);
262 
263  typedef map<CSourceModParser::SMod,
264  CSubSource::ESubtype> TSModSubSrcSubtype;
265 
266  TSModSubSrcSubtype * s_InitSModSubSrcSubtypeMap(void)
267  {
268  // some are skipped because they're handled specially and some are
269  // skipped because they're deprecated
270  TSModNameSet skip_enum_names {
271  // skip because handled specially elsewhere
272  "fwd_primer_seq", "rev_primer_seq",
273  "fwd_primer_name", "rev_primer_name",
274  "fwd_PCR_primer_seq", "rev_PCR_primer_seq",
275  "fwd_PCR_primer_name", "rev_PCR_primer_name",
276  // skip because deprecated
277  "transposon_name",
278  "plastid_name",
279  "insertion_seq_name",
280  };
281  const map<string, CSubSource::ESubtype> extra_smod_to_enum_names {
282  { "sub-clone", CSubSource::eSubtype_subclone },
283  { "lat-long", CSubSource::eSubtype_lat_lon },
284  { "latitude-longitude", CSubSource::eSubtype_lat_lon },
285  };
286  return s_InitSmodToEnumMap<CSubSource::ESubtype>(
287  CSubSource::ENUM_METHOD_NAME(ESubtype)(),
288  skip_enum_names,
289  extra_smod_to_enum_names);
290  }
291 
292  CSafeStatic<TSModSubSrcSubtype> kSModSubSrcSubtypeMap(
293  s_InitSModSubSrcSubtypeMap, nullptr);
294 
295  bool x_FindBrackets(const CTempString& str, size_t& start, size_t& stop, size_t& eq_pos)
296  {
297  size_t i = start;
298  eq_pos = CTempString::npos;
299 
300  const char* s = str.data() + start;
301 
302  int nested_brackets = -1;
303  while (i < str.size())
304  {
305  switch (*s)
306  {
307  case '[':
308  nested_brackets++;
309  if (nested_brackets == 0)
310  {
311  start = i;
312  }
313  break;
314  case '=':
315  if (nested_brackets >= 0)
316  if (eq_pos == CTempString::npos)
317  eq_pos = i;
318  break;
319  case ']':
320  if (nested_brackets == 0)
321  {
322  stop = i;
323  if (eq_pos == CTempString::npos)
324  eq_pos = i;
325  return true;
326  }
327  else
328  if (nested_brackets < 0)
329  return false;
330  else
331  {
332  nested_brackets--;
333  }
334  }
335  i++; s++;
336  }
337  return false;
338  };
339 
340  void x_AppendIfNonEmpty(string& s, const CTempString& o)
341  {
342  if (!o.empty())
343  {
344  if (!s.empty())
345  s.push_back(' ');
346  s.append(o.data(), o.length());
347  }
348  }
349 
350 };
351 
352 
354 
355 // Byte translation table for canonicalizing modifier keys, indexed by
// the unsigned character value. Every byte maps to itself except:
//   'A'..'Z' -> 'a'..'z'   (ASCII letters to lowercase)
//   ' '      -> '-'        (space to hyphen)
//   '_'      -> '-'        (underscore to hyphen)
// The array size is 257 because the 256 table entries are written as a
// string literal, which contributes one terminating NUL.
356 const unsigned char CSourceModParser::kKeyCanonicalizationTable[257] =
357  "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
358  "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F"
359  "-!\"#$%&'()*+,-./0123456789:;<=>?"
360  "@abcdefghijklmnopqrstuvwxyz[\\]^-"
361  "`abcdefghijklmnopqrstuvwxyz{|}~\x7F"
362  "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F"
363  "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F"
364  "\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF"
365  "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"
366  "\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF"
367  "\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF"
368  "\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF"
369  "\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF";
370 
371 
// Pointer-like wrapper around an object of type _T, built on CAutoAddDesc
// (inherited as protected). It exposes operator-> and operator* returning
// _T* / _T&.
// NOTE(review): several member and constructor declarations are not visible
// in this listing; presumably the wrapped object is fetched lazily from a
// descriptor via _getfromdesc() -- confirm against the full header.
372 template<class _T>
373 class CAutoInitDesc : protected CAutoAddDesc
374 {
375 public:
    // Construct for a descriptor of kind `which` belonging to `bioseq`.
377  CAutoInitDesc(CBioseq& bioseq, CSeqdesc::E_Choice which);
    // Construct around an already existing object.
379  CAutoInitDesc(_T& obj);
380  _T* operator->();
381  _T& operator*();
382 protected:
    // Specialized per _T elsewhere in the file.
384  void _getfromdesc();
387 };
388 
390 {
391 public:
393  :m_bioseq(seq), m_id(id)
394  {
395  }
396  bool IsInitialised() const
397  {
398  return !m_dblink.Empty();
399  }
400 
402  {
403  if (m_dblink)
404  return *m_dblink;
405 
406  for (auto& d : m_bioseq.SetDescr().Set())
407  {
408  if (d->IsUser() && d->GetUser().IsDBLink())
409  {
410  for (auto& u : d->SetUser().SetData())
411  {
412  if (u->IsSetLabel() && u->GetLabel().IsStr() &&
413  NStr::EqualCase(u->GetLabel().GetStr(), m_id))
414  {
415  m_dblink = u;
416  return *m_dblink;
417  }
418  }
419  }
420  }
421  if (m_dblink.Empty())
422  {
424  m_user_obj->SetUser().SetType().SetStr() = "DBLink";
426  m_dblink->SetLabel().SetStr() = m_id;
427  m_user_obj->SetUser().SetData().push_back(m_dblink);
428  m_bioseq.SetDescr().Set().push_back(m_user_obj);
429  }
430 
431  return *m_dblink;
432  }
433 protected:
438 };
439 
441 
442 template<class _T>
443 inline
445  CAutoAddDesc(descr, which),
446  m_ptr(nullptr)
447 {
448 }
449 
450 template<class _T>
451 inline
453  CAutoAddDesc(*fake_descr, which),
454  m_ptr(nullptr),
455  m_bioseq(&bioseq)
456 {
457  m_descr.Reset();
458 }
459 
460 template<class _T>
461 inline
463  CAutoAddDesc(*fake_descr, which),
464  m_ptr(nullptr),
465  m_bioset(&bioset)
466 
467 {
468  m_descr.Reset();
469 }
470 
471 template<class _T>
472 inline
474  CAutoAddDesc(*fake_descr, CSeqdesc::e_not_set), m_ptr(&obj)
475 {
476  m_descr.Reset();
477 }
478 
479 
480 template<class _T>
481 inline
483 {
484  return * operator->();
485 }
486 
487 template<class _T>
488 inline
490 {
491  if (! m_ptr &&
492  m_which != CSeqdesc::e_not_set)
493  {
494  if (m_descr.Empty())
495  {
496  if (!m_bioseq.Empty())
497  m_descr = &m_bioseq->SetDescr();
498  else
499  if (!m_bioset.Empty())
500  m_descr = &m_bioset->SetDescr();
501  }
502  _getfromdesc();
503  }
504 
505  return m_ptr;
506 }
507 
508 template<>
510 {
511  m_ptr = &Set().SetSource();
512 }
513 
514 template<>
516 {
517  m_ptr = &Set().SetMolinfo();
518 }
519 
520 template<>
522 {
523  m_ptr = &Set().SetGenbank();
524 }
525 
526 
528  CConstRef<CSeq_id> seqid,
529  size_t iMaxModsToParse )
530 {
531  SMod mod;
532  string stripped_title;
533  size_t pos = 0;
534 
535  m_Mods.clear();
536 
537  mod.seqid = seqid;
538 
539  size_t iModsFoundSoFar = 0;
540  for (; (pos < title.size()) && (iModsFoundSoFar < iMaxModsToParse);
541  ++iModsFoundSoFar )
542  {
543  size_t lb_pos, end_pos, eq_pos;
544  lb_pos = pos;
545  if (x_FindBrackets(title, lb_pos, end_pos, eq_pos))
546  {
547  CTempString skipped = NStr::TruncateSpaces_Unsafe(title.substr(pos, lb_pos - pos));
548 
549  if (eq_pos < end_pos) {
550  CTempString key = NStr::TruncateSpaces_Unsafe(title.substr(lb_pos+1, eq_pos - lb_pos - 1));
551  CTempString value = NStr::TruncateSpaces_Unsafe(title.substr(eq_pos + 1, end_pos - eq_pos - 1));
552 
553  mod.key = key;
554  mod.value = value;
555  mod.pos = lb_pos;
556  mod.used = false;
557  m_Mods.emplace(mod);
558  }
559 
560  x_AppendIfNonEmpty(stripped_title, skipped);
561 
562  pos = end_pos + 1;
563  }
564  else
565  { // rest of the title is unparsed
567  x_AppendIfNonEmpty(stripped_title, rest);
568  break;
569  }
570  }
571 
572  return stripped_title;
573 }
574 
576 {
577  ApplyMods(seq);
578  // Although the logic below reuses some existing objects if
579  // present, it always creates new features and descriptors.
580 
581  {{
583  if (location.Empty() && !best_id.Empty())
584  {
585  CRef<CSeq_loc> loc(new CSeq_loc);
586  loc->SetWhole(*best_id);
587  location = loc;
588  }
589 
590  if (location)
591  {
593  bool had_ftable = false;
594 
595  if (seq.IsSetAnnot()) {
597  if ((*it)->GetData().IsFtable()) {
598  ftable.Set(*it);
599  had_ftable = true;
600  break;
601  }
602  }
603  }
604 
605  // CGene_ref only on nucleotide seqs
606  if( ! FIELD_CHAIN_OF_2_IS_SET(seq, Inst, Mol) || seq.IsNa() ) {
608  x_ApplyMods(gene);
609  if (gene.IsInitialized()) {
610  CRef<CSeq_feat> feat(new CSeq_feat);
611  feat->SetData().SetGene(*gene);
612  feat->SetLocation().Assign(*location);
613  ftable->SetData().SetFtable().push_back(feat);
614  }
615  }
616 
617  // only add Prot_ref if amino acid (or at least not nucleic acid)
618  // (Yes, the FIELD_CHAIN_OF_2_IS_SET is necessary because IsAa()
619  // can throw an exception if mol isn't set)
620  if( ! FIELD_CHAIN_OF_2_IS_SET(seq, Inst, Mol) || seq.IsAa() ) {
622  x_ApplyMods(prot);
623  if ( prot.IsInitialized() ) {
624  CRef<CSeq_feat> feat(new CSeq_feat);
625  feat->SetData().SetProt(*prot);
626  feat->SetLocation().Assign(*location);
627  ftable->SetData().SetFtable().push_back(feat);
628  }
629  }
630 
631  if ( !had_ftable && ftable.IsInitialized() ) {
632  seq.SetAnnot().push_back(CRef<CSeq_annot>(&*ftable));
633  }
634  }
635  }}
636 
637  if (seq.GetInst().IsSetHist()) {
638  ApplyMods(seq.SetInst().SetHist());
639  } else {
641  x_ApplyMods(hist);
642  if (hist.IsInitialized()) {
643  seq.SetInst().SetHist(*hist);
644  }
645  }
646 
647  {{
648  //CSeq_descr* descr = nullptr;
649  if (
650  seq.GetParentSet() && seq.GetParentSet()->IsSetClass() &&
652  {
653  CBioseq_set& bioset = *(CBioseq_set*)(seq.GetParentSet().GetPointerOrNull());
654  //descr = &bioset.SetDescr();
656  x_ApplyMods(bsrc, organism);
657  }
658  else
659  {
660  //descr = &seq.SetDescr();
662  x_ApplyMods(bsrc, organism);
663  }
664  //CAutoInitDesc<CBioSource> bsrc(*descr, CSeqdesc::e_Source);
665  //x_ApplyMods(bsrc, organism);
666  }}
667 
668  {{
670  x_ApplyMods(mi);
671  }}
672 
673  {{
675  x_ApplyMods(gbb);
676  }}
677 
678  {{
680  x_ApplyTPAMods(tpa);
681  if (tpa.IsInitialized()) {
682  CRef<CSeqdesc> desc(new CSeqdesc);
683  desc->SetUser(*tpa);
684  seq.SetDescr().Set().push_back(desc);
685  }
686  }}
687 
688  x_ApplyDBLinkMods(seq);
689 
690  {{
693  if (gpdb.IsInitialized()) {
694  CRef<CSeqdesc> desc(new CSeqdesc);
695  desc->SetUser(*gpdb);
696  seq.SetDescr().Set().push_back(desc);
697  }
698  }}
699 
700  {{
701  ApplyPubMods(seq);
702  }}
703 
704  TMods unusedMods = GetMods(fUnusedMods);
705  for (TMods::const_iterator unused = unusedMods.begin(); unused != unusedMods.end(); ++unused) {
707  }
708 };
709 
// Value record for one molecule-type keyword: the CMolInfo biomol value and
// the CSeq_inst mol value to apply, plus a flag saying whether the keyword
// should be advertised to the user.
// NOTE(review): the constructor's first line and the member declarations
// (m_eBiomol, m_eMol, m_eShown) are not visible in this listing.
710 struct SMolTypeInfo {
711 
712  // is it shown to the user as a possibility or just silently accepted?
713  enum EShown {
714  eShown_Yes, // Yes, show to user in error messages, etc.
715  eShown_No // No, don't show the user, but silently accept it if the user gives it to us
716  };
717 
    // Stores the three arguments in the corresponding m_e* members.
719  EShown eShown,
720  CMolInfo::TBiomol eBiomol,
721  CSeq_inst::EMol eMol ) :
722  m_eBiomol(eBiomol), m_eMol(eMol), m_eShown(eShown)
723  { }
724 
728 };
730 static const TBiomolMapEntry sc_BiomolArray[] = {
731  // careful with the sort: remember that the key is canonicalized first
748 };
749 typedef CStaticPairArrayMap<const char*, SMolTypeInfo,
752 
754 {
755  const SMod* mod = nullptr;
756 
757  // top[ology]
758  if ((mod = FindMod(s_Mod_topology, s_Mod_top)) != nullptr) {
759  if (NStr::EqualNocase(mod->value, "linear")) {
760  seq.SetInst().SetTopology(CSeq_inst::eTopology_linear);
761  } else if (NStr::EqualNocase(mod->value, "circular")) {
762  seq.SetInst().SetTopology(CSeq_inst::eTopology_circular);
763  } else {
765  }
766  }
767 
768  // molecule information is not set for proteins at this time
769  // (Yes, the FIELD_CHAIN_OF_2_IS_SET is necessary because IsNa()
770  // can throw an exception if mol isn't set)
771  if( ! FIELD_CHAIN_OF_2_IS_SET(seq, Inst, Mol) || seq.IsNa() ) {
772  bool bMolSetViaMolMod = false;
773 
774  // mol[ecule]
775  if ((mod = FindMod(s_Mod_molecule, s_Mod_mol)) != nullptr) {
776  if (NStr::EqualNocase(mod->value, "dna")) {
777  seq.SetInst().SetMol( CSeq_inst::eMol_dna );
778  bMolSetViaMolMod = true;
779  } else if (NStr::EqualNocase(mod->value, "rna")) {
780  seq.SetInst().SetMol( CSeq_inst::eMol_rna );
781  bMolSetViaMolMod = true;
782  } else {
784  }
785  }
786 
787  // if mol/molecule not set right, we can use moltype instead
788 
789  // mol[-]type
790  if( ! bMolSetViaMolMod ) {
791  if ((mod = FindMod(s_Mod_moltype, s_Mod_mol_type)) != nullptr) {
792  TBiomolMap::const_iterator it = sc_BiomolMap.find(mod->value.c_str());
793  if (it == sc_BiomolMap.end()) {
795  } else {
796  // moltype sets biomol and inst.mol
797  seq.SetInst().SetMol(it->second.m_eMol);
798  }
799  }
800  }
801  }
802 
803  // strand
804  if ((mod = FindMod(s_Mod_strand)) != nullptr) {
805  if (NStr::EqualNocase(mod->value, "single")) {
806  seq.SetInst().SetStrand( CSeq_inst::eStrand_ss );
807  } else if (NStr::EqualNocase(mod->value, "double")) {
808  seq.SetInst().SetStrand( CSeq_inst::eStrand_ds );
809  } else if (NStr::EqualNocase(mod->value, "mixed")) {
810  seq.SetInst().SetStrand( CSeq_inst::eStrand_mixed );
811  } else {
813  }
814  }
815 
816  // comment
817  if ((mod = FindMod(s_Mod_comment)) != nullptr) {
818  CRef<CSeqdesc> desc(new CSeqdesc);
819  desc->SetComment( mod->value );
820  seq.SetDescr().Set().push_back(desc);
821  }
822 }
823 
824 
825 static void s_AddPrimers(const pair<string, string>& primer_info, CPCRPrimerSet& primer_set)
826 {
827  vector<string> names;
828  NStr::Split(primer_info.first, ":", names, NStr::fSplit_Tokenize);
829  vector<string> seqs;
830  NStr::Split(primer_info.second, ":", seqs, NStr::fSplit_Tokenize);
831 
832  const auto num_names = names.size();
833  const auto num_seqs = seqs.size();
834  const auto num_primers = max(num_names, num_seqs);
835 
836  for(size_t i=0; i<num_primers; ++i) {
837  auto primer = Ref(new CPCRPrimer());
838 
839  if (i<num_names && !NStr::IsBlank(names[i])) {
840  primer->SetName().Set(names[i]);
841  }
842  if (i<num_seqs && !NStr::IsBlank(seqs[i])) {
843  primer->SetSeq().Set(seqs[i]);
844  }
845  primer_set.Set().push_back(primer);
846  }
847 }
848 
849 
850 static void s_GetPrimerInfo(const CSourceModParser::SMod* pNamesMod,
851  const CSourceModParser::SMod* pSeqsMod,
852  vector<pair<string, string>>& reaction_info)
853 {
854  reaction_info.clear();
855  vector<string> names;
856  if (pNamesMod) {
857  NStr::Split(pNamesMod->value, ",", names, NStr::fSplit_Tokenize);
858  }
859 
860  vector<string> seqs;
861  if (pSeqsMod) {
862  NStr::Split(pSeqsMod->value, ",", seqs, NStr::fSplit_Tokenize);
863  if (seqs.size()>1) {
864  if (seqs.front().front() == '(') {
865  seqs.front().erase(0,1);
866  }
867  if (seqs.back().back() == ')') {
868  seqs.back().erase(seqs.back().size()-1, 1);
869  }
870  }
871  }
872 
873  const auto num_names = names.size();
874  const auto num_seqs = seqs.size();
875  const auto num_reactions = max(num_names, num_seqs);
876 
877  for (int i=0; i<num_reactions; ++i) {
878  const string name = (i<num_names) ? names[i] : "";
879  const string seq = (i<num_seqs) ? seqs[i] : "";
880  reaction_info.push_back(make_pair(name, seq));
881  }
882 }
883 
884 
886 {
887  using TNameSeqPair = pair<string, string>;
888 
889  const SMod* pNameMod = nullptr;
890  const SMod* pSeqMod = nullptr;
891 
892  pNameMod = FindMod(s_Mod_fwd_primer_name, s_Mod_fwd_pcr_primer_name);
893  pSeqMod = FindMod(s_Mod_fwd_primer_seq, s_Mod_fwd_pcr_primer_seq);
894  vector<TNameSeqPair> fwd_primer_info;
895  s_GetPrimerInfo(pNameMod, pSeqMod, fwd_primer_info);
896 
897 
898  pNameMod = FindMod(s_Mod_rev_primer_name, s_Mod_rev_pcr_primer_name);
899  pSeqMod = FindMod(s_Mod_rev_primer_seq, s_Mod_rev_pcr_primer_seq);
900  vector<TNameSeqPair> rev_primer_info;
901  s_GetPrimerInfo(pNameMod, pSeqMod, rev_primer_info);
902 
903  if (fwd_primer_info.empty() &&
904  rev_primer_info.empty()) {
905  return;
906  }
907 
908  auto num_fwd_primer_info = fwd_primer_info.size();
909  auto num_rev_primer_info = rev_primer_info.size();
910 
911  if (num_fwd_primer_info == num_rev_primer_info) {
912  for (auto i=0; i<num_fwd_primer_info; ++i) {
913  CRef<CPCRReaction> pcr_reaction(new CPCRReaction());
914  s_AddPrimers(fwd_primer_info[i], pcr_reaction->SetForward());
915  s_AddPrimers(rev_primer_info[i], pcr_reaction->SetReverse());
916  pcr_reaction_set->Set().push_back(pcr_reaction);
917  }
918  }
919  else
920  if (num_fwd_primer_info > num_rev_primer_info) {
921  auto diff = num_fwd_primer_info - num_rev_primer_info;
922  for (int i=0; i<diff; ++i) {
923  CRef<CPCRReaction> pcr_reaction(new CPCRReaction());
924  s_AddPrimers(fwd_primer_info[i], pcr_reaction->SetForward());
925  pcr_reaction_set->Set().push_back(pcr_reaction);
926  }
927 
928  for (int i=diff; i<num_fwd_primer_info; ++i) {
929  CRef<CPCRReaction> pcr_reaction(new CPCRReaction());
930  s_AddPrimers(fwd_primer_info[i], pcr_reaction->SetForward());
931  s_AddPrimers(rev_primer_info[i-diff], pcr_reaction->SetReverse());
932  pcr_reaction_set->Set().push_back(pcr_reaction);
933  }
934  }
935  else
936  if (num_fwd_primer_info < num_rev_primer_info) {
937  for (int i=0; i<num_fwd_primer_info; ++i) {
938  CRef<CPCRReaction> pcr_reaction(new CPCRReaction());
939  s_AddPrimers(fwd_primer_info[i], pcr_reaction->SetForward());
940  s_AddPrimers(rev_primer_info[i], pcr_reaction->SetReverse());
941  pcr_reaction_set->Set().push_back(pcr_reaction);
942  }
943 
944  for (int i=num_fwd_primer_info; i<num_rev_primer_info; ++i) {
945  CRef<CPCRReaction> pcr_reaction(new CPCRReaction());
946  s_AddPrimers(rev_primer_info[i], pcr_reaction->SetReverse());
947  pcr_reaction_set->Set().push_back(pcr_reaction);
948  }
949  }
950 }
951 
952 
954  CTempString organism)
955 {
956  const SMod* mod = nullptr;
957  bool reset_taxid = false;
958 
959  // org[anism]
960  if (organism.empty())
961  {
962  if ((mod = FindMod(s_Mod_organism, s_Mod_org)) != nullptr) {
963  organism = mod->value;
964  }
965  else
966  if ((mod = FindMod(s_Mod_taxname)) != nullptr) {
967  organism = mod->value;
968  }
969  }
970 
971  if ( !organism.empty())
972  {
973  if (!(bsrc->GetOrg().IsSetTaxname() && NStr::EqualNocase(bsrc->GetOrg().GetTaxname(), organism)))
974  {
975  if (bsrc->GetOrg().IsSetTaxname())
976  {
977  bsrc->ResetOrg();
978 // bsrc->ResetSubtype();
979  }
980  bsrc->SetOrg().SetTaxname(organism);
981  reset_taxid = true;
982  }
983  }
984 
985  // location
986  if ((mod = FindMod(s_Mod_location)) != nullptr) {
987  if (NStr::EqualNocase(mod->value, "mitochondrial")) {
988  bsrc->SetGenome(CBioSource::eGenome_mitochondrion);
989  } else if (NStr::EqualNocase(mod->value, "provirus")) {
990  bsrc->SetGenome(CBioSource::eGenome_proviral);
991  } else if (NStr::EqualNocase(mod->value, "extrachromosomal")) {
992  bsrc->SetGenome(CBioSource::eGenome_extrachrom);
993  } else if (NStr::EqualNocase(mod->value, "insertion sequence")) {
994  bsrc->SetGenome(CBioSource::eGenome_insertion_seq);
995  } else {
996  try {
997  bsrc->SetGenome(CBioSource::ENUM_METHOD_NAME(EGenome)()
998  ->FindValue(mod->value));
999  } catch (CSerialException&) {
1001  }
1002  }
1003  }
1004 
1005  // origin
1006  if ((mod = FindMod(s_Mod_origin)) != nullptr) {
1007  try {
1008  // also check for special cases that don't match the enum name
1009  if( NStr::EqualNocase(mod->value, "natural mutant") ) {
1010  bsrc->SetOrigin( CBioSource::eOrigin_natmut );
1011  } else if( NStr::EqualNocase(mod->value, "mutant") ) {
1012  bsrc->SetOrigin( CBioSource::eOrigin_mut );
1013  } else {
1014  bsrc->SetOrigin(CBioSource::ENUM_METHOD_NAME(EOrigin)()
1015  ->FindValue(mod->value));
1016  }
1017  } catch (CSerialException&) {
1019  }
1020  }
1021 
1022  // handle orgmods
1023  for(const auto & smod_orgsubtype : kSModOrgSubtypeMap.Get()) {
1024  const SMod & smod = smod_orgsubtype.first;
1025  const COrgMod::ESubtype e_subtype = smod_orgsubtype.second;
1026  if ((mod = FindMod(smod.key)) != nullptr) {
1027  CRef<COrgMod> org_mod(new COrgMod);
1028  org_mod->SetSubtype(e_subtype);
1029  org_mod->SetSubname(mod->value);
1030  bsrc->SetOrg().SetOrgname().SetMod().push_back(org_mod);
1031  reset_taxid = true;
1032  }
1033  }
1034 
1035  // handle subsources
1036  for( const auto & smod_subsrcsubtype : kSModSubSrcSubtypeMap.Get() ) {
1037  const SMod & smod = smod_subsrcsubtype.first;
1038  const CSubSource::ESubtype e_subtype = smod_subsrcsubtype.second;
1039  if ((mod = FindMod(smod.key)) != nullptr) {
1040  auto& subtype = bsrc->SetSubtype();
1041  CRef<CSubSource> subsource(new CSubSource);
1042  subsource->SetSubtype(e_subtype);
1043 
1044  if( CSubSource::NeedsNoText(e_subtype) ) {
1045  subsource->SetName(kEmptyStr);
1046  } else {
1047  subsource->SetName(mod->value);
1048  }
1049 
1050  if (!CSubSource::IsMultipleValuesAllowed(e_subtype))
1051  {
1052  // since only one of this e_subtype is allowed, we erase any
1053  // that are already in the subtype list.
1054  // (Unfortunately, we cannot just use bsrc->RemoveSubSource
1055  // because it will ResetSubtype if subtype ends up empty)
1056  subtype.erase(
1057  remove_if(subtype.begin(), subtype.end(),
1058  equal_subtype(e_subtype)),
1059  subtype.end());
1060  }
1061 
1062  subtype.push_back(subsource);
1063  }
1064  }
1065 
1066  // handle PCR Primers
1067  {{
1068  CAutoInitRef<CPCRReactionSet> pcr_reaction_set;
1069  x_AddPCRPrimers(pcr_reaction_set);
1070  if (pcr_reaction_set.IsInitialized()) {
1071  if (!bsrc->IsSetPcr_primers()) {
1072  bsrc->SetPcr_primers(*pcr_reaction_set);
1073  }
1074  else {
1075  bsrc->SetPcr_primers().Set().splice(
1076  bsrc->SetPcr_primers().Set().end(),
1077  pcr_reaction_set->Set());
1078  }
1079  }
1080  }}
1081 
1082 
1083  // db_xref
1084  TModsRange db_xref_mods_range = FindAllMods( s_Mod_db_xref, s_Mod_dbxref );
1085  for( TModsCI db_xref_iter = db_xref_mods_range.first;
1086  db_xref_iter != db_xref_mods_range.second;
1087  ++db_xref_iter ) {
1088  CRef< CDbtag > new_db( new CDbtag );
1089 
1090  const CTempString db_xref_str = db_xref_iter->value;
1091  CRef<CObject_id> object_id(new CObject_id);
1092 
1093  size_t colon_location = db_xref_str.find(":");
1094  if (colon_location == string::npos) {
1095  // no colon: it's just tag, and db is unknown
1096  new_db->SetDb() = "?";
1097  db_xref_str.Copy(object_id->SetStr(), 0, CTempString::npos);
1098  } else {
1099  // there's a colon, so db and tag are both known
1100  db_xref_str.Copy(new_db->SetDb(), 0, colon_location);
1101  db_xref_str.Copy(object_id->SetStr(), colon_location + 1, CTempString::npos);
1102  }
1103 
1104  new_db->SetTag( *object_id );
1105 
1106  bsrc->SetOrg().SetDb().push_back( new_db );
1107  }
1108 
1109  // div[ision]
1110  if ((mod = FindMod(s_Mod_division, s_Mod_div)) != nullptr) {
1111  bsrc->SetOrg().SetOrgname().SetDiv( mod->value );
1112  }
1113 
1114  // lineage
1115  if ((mod = FindMod(s_Mod_lineage)) != nullptr) {
1116  bsrc->SetOrg().SetOrgname().SetLineage( mod->value );
1117  }
1118 
1119  // gcode
1120  if ((mod = FindMod(s_Mod_gcode)) != nullptr) {
1121  bsrc->SetOrg().SetOrgname().SetGcode( NStr::StringToInt(mod->value, NStr::fConvErr_NoThrow) );
1122  }
1123 
1124  // mgcode
1125  if ((mod = FindMod(s_Mod_mgcode)) != nullptr) {
1126  bsrc->SetOrg().SetOrgname().SetMgcode( NStr::StringToInt(mod->value, NStr::fConvErr_NoThrow) );
1127  }
1128 
1129  // pgcode
1130  if ((mod = FindMod(s_Mod_pgcode)) != nullptr) {
1131  bsrc->SetOrg().SetOrgname().SetPgcode( NStr::StringToInt(mod->value, NStr::fConvErr_NoThrow) );
1132  }
1133 
1134  // note[s]
1135  TModsRange mods[2];
1136  mods[0] = FindAllMods(s_Mod_note);
1137  mods[1] = FindAllMods(s_Mod_notes);
1138  for (size_t i = 0; i < 2; i++)
1139  {
1140  for (TModsCI it = mods[i].first; it != mods[i].second; it++)
1141  {
1142  CRef< CSubSource > new_subsource(new CSubSource);
1143  new_subsource->SetSubtype(CSubSource::eSubtype_other);
1144  new_subsource->SetName(it->value);
1145  bsrc->SetSubtype().push_back(new_subsource);
1146  }
1147  }
1148 
1149  // focus
1150  if ((mod = FindMod(s_Mod_focus)) != nullptr) {
1151  if( NStr::EqualNocase( mod->value, "TRUE" ) ) {
1152  bsrc->SetIs_focus();
1153  }
1154  }
1155 
1156 
1157  if ((mod = FindMod(s_Mod_taxid)) != nullptr) {
1158  bsrc->SetOrg().SetTaxId( NStr::StringToNumeric<TTaxId>(mod->value, NStr::fConvErr_NoThrow) );
1159  }
1160  else
1161  if (reset_taxid && bsrc->IsSetOrgname() && bsrc->GetOrg().GetTaxId() != ZERO_TAX_ID) {
1162  bsrc->SetOrg().SetTaxId(ZERO_TAX_ID);
1163  }
1164 }
1165 
// Table pairing the accepted text of a sequencing-technique value with the
// matching CMolInfo::eTech_* constant.
// Entries are listed in case-insensitive alphabetical order of the key.
// NOTE(review): presumably this array backs sc_TechMap (used for the "tech"
// modifier) and the ordering is required by that map -- the map definition
// is not fully visible here; confirm before inserting or reordering entries.
1167 static const TTechMapEntry sc_TechArray[] = {
1168  { "?", CMolInfo::eTech_unknown },
1169  { "barcode", CMolInfo::eTech_barcode },
1170  { "both", CMolInfo::eTech_both },
1171  { "composite-wgs-htgs", CMolInfo::eTech_composite_wgs_htgs },
1172  { "concept-trans", CMolInfo::eTech_concept_trans },
1173  { "concept-trans-a", CMolInfo::eTech_concept_trans_a },
1174  { "derived", CMolInfo::eTech_derived },
1175  { "EST", CMolInfo::eTech_est },
1176  { "fli cDNA", CMolInfo::eTech_fli_cdna },
1177  { "genetic map", CMolInfo::eTech_genemap },
1178  { "htc", CMolInfo::eTech_htc },
1179  { "htgs 0", CMolInfo::eTech_htgs_0 },
1180  { "htgs 1", CMolInfo::eTech_htgs_1 },
1181  { "htgs 2", CMolInfo::eTech_htgs_2 },
1182  { "htgs 3", CMolInfo::eTech_htgs_3 },
1183  { "physical map", CMolInfo::eTech_physmap },
1184  { "seq-pept", CMolInfo::eTech_seq_pept },
1185  { "seq-pept-homol", CMolInfo::eTech_seq_pept_homol },
1186  { "seq-pept-overlap", CMolInfo::eTech_seq_pept_overlap },
1187  { "standard", CMolInfo::eTech_standard },
1188  { "STS", CMolInfo::eTech_sts },
1189  { "survey", CMolInfo::eTech_survey },
1190  { "targeted", CMolInfo::eTech_targeted },
1191  { "tsa", CMolInfo::eTech_tsa },
1192  { "wgs", CMolInfo::eTech_wgs }
1193 };
1194 typedef CStaticPairArrayMap<const char*, CMolInfo::TTech,
1197 
1200  { "complete", CMolInfo::eCompleteness_complete },
1201  { "has-left", CMolInfo::eCompleteness_has_left },
1202  { "has-right", CMolInfo::eCompleteness_has_right },
1203  { "no-ends", CMolInfo::eCompleteness_no_ends },
1204  { "no-left", CMolInfo::eCompleteness_no_left },
1205  { "no-right", CMolInfo::eCompleteness_no_right },
1206  { "partial", CMolInfo::eCompleteness_partial }
1207 };
1208 typedef CStaticPairArrayMap<const char*, CMolInfo::TCompleteness,
1211 
1213 {
1214  const SMod* mod = nullptr;
1215 
1216  // mol[-]type
1217  if ((mod = FindMod(s_Mod_moltype, s_Mod_mol_type)) != nullptr) {
1218  TBiomolMap::const_iterator it = sc_BiomolMap.find(mod->value.c_str());
1219  if (it == sc_BiomolMap.end()) {
1220  // construct the possible bad values by hand
1222  } else {
1223  // moltype sets biomol and inst.mol
1224  mi->SetBiomol(it->second.m_eBiomol);
1225  }
1226  }
1227 
1228  // tech
1229  if ((mod = FindMod(s_Mod_tech)) != nullptr) {
1230  TTechMap::const_iterator it = sc_TechMap.find(mod->value.c_str());
1231  if (it == sc_TechMap.end()) {
1233  } else {
1234  mi->SetTech(it->second);
1235  }
1236  }
1237 
1238  // complete[d]ness
1239  if ((mod = FindMod(s_Mod_completeness, s_Mod_completedness)) != nullptr) {
1240  TTechMap::const_iterator it = sc_CompletenessMap.find(mod->value.c_str());
1241  if (it == sc_CompletenessMap.end()) {
1243  } else {
1244  mi->SetCompleteness(it->second);
1245  }
1246  }
1247 }
1248 
1250 {
1251  const SMod* mod = nullptr;
1252 
1253  // gene
1254  if ((mod = FindMod(s_Mod_gene)) != nullptr) {
1255  gene->SetLocus(mod->value);
1256  }
1257 
1258  // allele
1259  if ((mod = FindMod(s_Mod_allele)) != nullptr) {
1260  gene->SetAllele( mod->value );
1261  }
1262 
1263  // gene_syn[onym]
1264  if ((mod = FindMod(s_Mod_gene_syn, s_Mod_gene_synonym)) != nullptr) {
1265  gene->SetSyn().push_back( mod->value );
1266  }
1267 
1268  // locus_tag
1269  if ((mod = FindMod(s_Mod_locus_tag)) != nullptr) {
1270  gene->SetLocus_tag( mod->value );
1271  }
1272 }
1273 
1274 
1276 {
1277  const SMod* mod = nullptr;
1278 
1279  // prot[ein]
1280  if ((mod = FindMod(s_Mod_protein, s_Mod_prot)) != nullptr) {
1281  prot->SetName().push_back(mod->value);
1282  }
1283 
1284  // prot[ein]_desc
1285  if ((mod = FindMod(s_Mod_prot_desc, s_Mod_protein_desc)) != nullptr) {
1286  prot->SetDesc( mod->value );
1287  }
1288 
1289  // EC_number
1290  if ((mod = FindMod(s_Mod_EC_number)) != nullptr) {
1291  prot->SetEc().push_back( mod->value );
1292  }
1293 
1294  // activity/function
1295  if ((mod = FindMod(s_Mod_activity, s_Mod_function)) != nullptr) {
1296  prot->SetActivity().push_back( mod->value );
1297  }
1298 }
1299 
1300 
1302 {
1303  const SMod* mod = nullptr;
1304 
1305  // secondary-accession[s]
1306  if ((mod = FindMod(s_Mod_secondary_accession,
1307  s_Mod_secondary_accessions)) != nullptr)
1308  {
1309  list<CTempString> ranges;
1310  NStr::Split(mod->value, ",", ranges, NStr::fSplit_MergeDelimiters);
1311  ITERATE (list<CTempString>, it, ranges) {
1312  string s = NStr::TruncateSpaces_Unsafe(*it);
1313  try {
1314  SSeqIdRange range(s);
1315  ITERATE (SSeqIdRange, it2, range) {
1316  gbb->SetExtra_accessions().push_back(*it2);
1317  }
1318  } catch (CSeqIdException&) {
1319  gbb->SetExtra_accessions().push_back(s);
1320  }
1321  }
1322  }
1323 
1324  // keyword[s]
1325  if ((mod = FindMod(s_Mod_keyword, s_Mod_keywords)) != nullptr) {
1326  list<string> keywordList;
1327  NStr::Split(mod->value, ",;", keywordList, NStr::fSplit_MergeDelimiters);
1328  // trim every string and push it into the real keyword list
1329  NON_CONST_ITERATE( list<string>, keyword_iter, keywordList ) {
1330  NStr::TruncateSpacesInPlace( *keyword_iter );
1331  gbb->SetKeywords().push_back( *keyword_iter );
1332  }
1333  }
1334 }
1335 
1336 
1338 {
1339  const SMod* mod = nullptr;
1340 
1341  // secondary-accession[s]
1342  if ((mod = FindMod(s_Mod_secondary_accession,
1343  s_Mod_secondary_accessions)) != nullptr)
1344  {
1345  list<CTempString> ranges;
1346  NStr::Split(mod->value, ",", ranges, NStr::fSplit_MergeDelimiters);
1347  ITERATE (list<CTempString>, it, ranges) {
1348  string s = NStr::TruncateSpaces_Unsafe(*it);
1349  try {
1350  SSeqIdRange range(s);
1351  ITERATE (SSeqIdRange, it2, range) {
1352  hist->SetReplaces().SetIds().push_back(it2.GetID());
1353  }
1354  } catch (CSeqIdException&) {
1355  NStr::ReplaceInPlace(s, "ref_seq|", "ref|", 0, 1);
1356  hist->SetReplaces().SetIds()
1357  .push_back(CRef<CSeq_id>(new CSeq_id(s)));
1358  }
1359  }
1360  }
1361 }
1362 
1363 // Note: It's untested.
1364 //
1365 // This code is currently unused, but I'm leaving it here in case
1366 // at some point in the future someone decides that we do want it.
1367 //
1368 // We're not using this because it would introduce a whole new
1369 // dependency just for a single keyword.
1370 //
1371 //void CSourceModParser::x_ApplyMods(CAutoInitRef<CSubmit_block>& sb) {
1372 //
1373 // // hup
1374 // if ((mod = FindMod("hup")) != nullptr) {
1375 // sb->SetHup( false );
1376 // sb->ResetReldate();
1377 // if( ! mod->value.empty() ) {
1378 // if( NStr::EqualNocase( mod->value, "y" ) ) {
1379 // sb->SetHup( true );
1380 // // by default, release in a year
1381 // CDate releaseDate( CTime(CTime::eCurrent) );
1382 // _ASSERT(releaseDate.IsStd());
1383 // releaseDate.GetStd().SetYear( releaseDate.GetStd().GetYear() + 1 );
1384 // sb->SetReldate( releaseDate );
1385 // } else {
1386 // // parse string as "m/d/y" (or with "-" instead of "/" )
1387 // try {
1388 // CTime hupTime( NStr::Replace( mod->value, "-", "/" ), "M/D/Y" );
1389 // sb->SetReldate( CDate(hupTime) );
1390 // sb->SetHup( true );
1391 // } catch( const CException & e) {
1392 // // couldn't parse date
1393 // x_HandleBadModValue(*mod);
1394 // }
1395 // }
1396 // }
1397 // }
1398 //}
1399 
1400 
1401 static
1402 void s_PopulateUserObject(CUser_object& uo, const string& type,
1404 {
1405  if (uo.GetType().Which() == CObject_id::e_not_set) {
1406  uo.SetType().SetStr(type);
1407  } else if ( !uo.GetType().IsStr() || uo.GetType().GetStr() != type) {
1408  // warn first?
1409  return;
1410  }
1411 
1412  swap(uo.SetData(), data);
1413 }
1414 
1415 
1417 {
1418  const SMod* mod = nullptr;
1419 
1420  // primary[-accessions]
1421  if ((mod = FindMod(s_Mod_primary, s_Mod_primary_accessions)) != nullptr) {
1423  list<CTempString> accns;
1424  NStr::Split(mod->value, ",", accns, NStr::fSplit_MergeDelimiters);
1425  ITERATE (list<CTempString>, it, accns) {
1426  CRef<CUser_field> field(new CUser_field), subfield(new CUser_field);
1427  field->SetLabel().SetId(0);
1428  subfield->SetLabel().SetStr("accession");
1429  subfield->SetData().SetStr(CUtf8::AsUTF8(*it, eEncoding_UTF8));
1430  field->SetData().SetFields().push_back(subfield);
1431  data.push_back(field);
1432  }
1433 
1434  if ( !data.empty() ) {
1435  s_PopulateUserObject(*tpa, "TpaAssembly", data);
1436  }
1437  }
1438 }
1439 
1440 
1442 {
1443  CConstRef<CBioseq_set> pParentSet = bioseq.GetParentSet();
1444  CSeq_descr& descriptors = (pParentSet &&
1445  pParentSet->GetClass() == CBioseq_set::eClass_nuc_prot) ?
1446 
1447  (const_cast<CBioseq_set&>(*pParentSet)).SetDescr() :
1448  bioseq.SetDescr();
1449 
1450 
1451  for (auto pDesc : descriptors.Set()) {
1452  if (pDesc->IsUser() && pDesc->GetUser().IsDBLink()) {
1453  return pDesc;
1454  }
1455  }
1456 
1457  auto pDBLinkDesc = Ref(new CSeqdesc());
1458  pDBLinkDesc->SetUser().SetObjectType(CUser_object::eObjectType_DBLink);
1459  descriptors.Set().push_back(pDBLinkDesc);
1460  return pDBLinkDesc;
1461 }
1462 
1463 
1464 static void s_SetDBLinkFieldVals(const string& label,
1465  const list<CTempString>& vals,
1466  CSeqdesc& dblink_desc)
1467 {
1468  if (vals.empty()) {
1469  return;
1470  }
1471 
1472  auto& user_obj = dblink_desc.SetUser();
1473  CRef<CUser_field> pField;
1474  if (user_obj.IsSetData()) {
1475  for (auto pUserField : user_obj.SetData()) {
1476  if (pUserField->IsSetLabel() &&
1477  pUserField->GetLabel().IsStr() &&
1478  NStr::EqualNocase(pUserField->GetLabel().GetStr(), label)) {
1479  pField = pUserField;
1480  break;
1481  }
1482  }
1483  }
1484 
1485  if (!pField) {
1486  pField = Ref(new CUser_field());
1487  pField->SetLabel().SetStr() = label;
1488  user_obj.SetData().push_back(pField);
1489  }
1490 
1491  pField->SetData().SetStrs().clear(); // RW-518 - clear any preexisting entries
1492  for (const auto& val : vals) {
1493  pField->SetData().SetStrs().push_back(val);
1494  }
1495  pField->SetNum(pField->GetData().GetStrs().size());
1496 }
1497 
1498 
1499 static void s_SetDBLinkField(const string& label,
1500  const string& vals,
1501  CRef<CSeqdesc>& pDBLinkDesc,
1502  CBioseq& bioseq)
1503 {
1504  list<CTempString> value_list;
1505  NStr::Split(vals, ",", value_list, NStr::fSplit_MergeDelimiters);
1506  for (auto& val : value_list) {
1508  }
1509  value_list.remove_if([](const CTempString& val){ return val.empty(); });
1510  if (value_list.empty()) { // nothing to do
1511  return;
1512  }
1513 
1514  if (!pDBLinkDesc) {
1515  pDBLinkDesc = s_SetDBLinkDesc(bioseq);
1516  }
1517 
1519  value_list,
1520  *pDBLinkDesc);
1521 }
1522 
1523 
1525 {
1526  CRef<CSeqdesc> pDBLinkDesc;
1527  const SMod* mod = nullptr;
1528  if ((mod = FindMod(s_Mod_SRA)) != nullptr) {
1529  s_SetDBLinkField("Sequence Read Archive", mod->value, pDBLinkDesc, bioseq);
1530  }
1531 
1532  if ((mod = FindMod(s_Mod_bioproject)) != nullptr) {
1533  s_SetDBLinkField("BioProject", mod->value, pDBLinkDesc, bioseq);
1534  }
1535 
1536  if ((mod = FindMod(s_Mod_biosample)) != nullptr) {
1537  s_SetDBLinkField("BioSample", mod->value, pDBLinkDesc, bioseq);
1538  }
1539 }
1540 
1541 
1542 
1543 void
1545 {
1546  const SMod* mod = nullptr;
1547 
1548  // project[s]
1549  if ((mod = FindMod(s_Mod_project, s_Mod_projects)) != nullptr) {
1551  list<CTempString> ids;
1552  NStr::Split(mod->value, ",;", ids, NStr::fSplit_MergeDelimiters);
1553  ITERATE (list<CTempString>, it, ids) {
1554  unsigned int id = NStr::StringToUInt(*it, NStr::fConvErr_NoThrow);
1555  if (id > 0) {
1556  CRef<CUser_field> field(new CUser_field),
1557  subfield(new CUser_field);
1558  field->SetLabel().SetId(0);
1559  subfield->SetLabel().SetStr("ProjectID");
1560  subfield->SetData().SetInt(id);
1561  field->SetData().SetFields().push_back(subfield);
1562  subfield.Reset(new CUser_field);
1563  subfield->SetLabel().SetStr("ParentID");
1564  subfield->SetData().SetInt(0);
1565  field->SetData().SetFields().push_back(subfield);
1566  data.push_back(field);
1567  }
1568  }
1569 
1570  if ( !data.empty() ) {
1571  s_PopulateUserObject(*gpdb, "GenomeProjectsDB", data);
1572  }
1573  }
1574 }
1575 
1576 
1577 static
1579 {
1580  for (CSourceModParser::TModsCI it = range.first;
1581  it != range.second; ++it) {
1582  TEntrezId pmid = NStr::StringToNumeric<TEntrezId>(it->value, NStr::fConvErr_NoThrow);
1583  CRef<CPub> pub(new CPub);
1584  pub->SetPmid().Set(pmid);
1585  CRef<CSeqdesc> pubdesc(new CSeqdesc);
1586  pubdesc->SetPub().SetPub().Set().push_back(pub);
1587  bioseq.SetDescr().Set().push_back(pubdesc);
1588  }
1589 }
1590 
1591 
1593 {
1594  // find PubMed IDs
1595  s_ApplyPubMods(seq, FindAllMods(s_Mod_PubMed));
1596  s_ApplyPubMods(seq, FindAllMods(s_Mod_PMID));
1597 }
1598 
1600  const SMod & badMod,
1601  const string & sAllowedValues )
1602  : runtime_error(x_CalculateErrorString(badMod, sAllowedValues)),
1603  m_BadMod(badMod), m_sAllowedValues(sAllowedValues)
1604 {
1605  // no further work required
1606 }
1607 
1609  const SMod & badMod,
1610  const string & sAllowedValues )
1611 {
1612  stringstream str_strm;
1613  str_strm << "Bad modifier value at seqid '"
1614  << ( badMod.seqid ? badMod.seqid->AsFastaString() : "UNKNOWN")
1615  << "'. '" << badMod.key << "' cannot have value '" << badMod.value
1616  << "'. Accepted values are [" << sAllowedValues << "]";
1617  return str_strm.str();
1618 }
1619 
1621  const SMod& unkMod )
1622  : runtime_error(x_CalculateErrorString(unkMod)), m_UnkMod(unkMod)
1623 {
1624 }
1625 
1627  const SMod& unkMod)
1628 {
1629  stringstream str_strm;
1630  str_strm << "Bad modifier key at seqid '"
1631  << ( unkMod.seqid ? unkMod.seqid->AsFastaString() : "UNKNOWN")
1632  << "'. '" << unkMod.key << "' is not a recognized modifier key";
1633  return str_strm.str();
1634 }
1635 
1636 
1638 {
1639  if (which == fAllMods) {
1640  // if caller gave this they probably should prefer calling GetAllMods
1641  // to avoid the struct copy.
1642  return m_Mods;
1643  } else {
1644  TMods ret;
1645 
1646  ITERATE (TMods, it, m_Mods) {
1647  if (which == (it->used ? fUsedMods : fUnusedMods)) {
1648  ret.insert(ret.end(), *it);
1649  }
1650  }
1651 
1652  return ret;
1653  }
1654 }
1655 
1657  //const SMod & smod, const SMod & alt_smod)
1658  const CTempString& key, const CTempString& alt_key)
1659 {
1660  // check against m_pModFilter, if any
1661  if( m_pModFilter ) {
1662  if( ! (*m_pModFilter)(key) || ! (*m_pModFilter)(alt_key) ) {
1663  return nullptr;
1664  }
1665  }
1666 
1667  SMod mod;
1668 
1669  for (int tries = 0; tries < 2; ++tries) {
1670  const CTempString & modkey = ( tries == 0 ? key : alt_key );
1671  if( modkey.empty() ) {
1672  continue;
1673  }
1674  mod.key = modkey;
1675 
1676  TModsCI it = m_Mods.lower_bound(mod);
1677  if (it != m_Mods.end() && EqualKeys(it->key, modkey)) {
1678  // set iterators are const since changing an object could affect
1679  // its order in the set. However, in this case we know that
1680  // changing the `used` field won't affect the order so we know
1681  // that a const_cast to change it is safe to do.
1682  const_cast<SMod&>(*it).used = true;
1683  return &*it;
1684  }
1685  }
1686 
1687  return nullptr;
1688 }
1689 
1690 
1693 {
1694  SMod smod(key);
1695  return FindAllMods(smod);
1696 }
1697 
1700 {
1701  SMod smod(key);
1702  SMod alt_smod(alt_key);
1703  return FindAllMods(smod, alt_smod);
1704 }
1705 
1707 CSourceModParser::FindAllMods(const SMod & smod, const SMod & alt_smod)
1708 {
1709  TModsRange r;
1710  r.first = m_Mods.lower_bound(smod);
1711  if (r.first == m_Mods.end() || !EqualKeys(r.first->key, smod.key)) {
1712  r.first = m_Mods.lower_bound(alt_smod);
1713  }
1714  for (r.second = r.first;
1715  r.second != m_Mods.end() && (EqualKeys(r.second->key, smod.key) || EqualKeys(r.second->key, alt_smod.key));
1716  ++r.second)
1717  {
1718  // set iterators are const since changing an object could affect
1719  // its order in the set. However, in this case we know that
1720  // changing the `used` field won't affect the order so we know
1721  // that a const_cast to change it is safe to do.
1722  const_cast<SMod&>(*r.second).used = true;
1723  }
1724  return r;
1725 }
1726 
1727 
1728 void CSourceModParser::GetLabel(string* s, TWhichMods which) const
1729 {
1730  // Possible (flag-conditional?) behavior changes:
1731  // - leave off spaces between modifiers
1732  // - sort by position rather than key
1733  _ASSERT(s);
1734 
1735  string delim = s->empty() ? kEmptyStr : " ";
1736 
1737  ITERATE (TMods, it, m_Mods) {
1738  if ((which & (it->used ? fUsedMods : fUnusedMods)) != 0) {
1739  *s += delim + '[' + it->key + '=' + it->value + ']';
1740  delim = " ";
1741  }
1742  }
1743 }
1744 
1745 // static
1746 const set<string> &
1748 {
1749  // since this has a lock, do NOT grab any other locks
1750  // inside here.
1751  static CMutex mutex;
1752  CMutexGuard guard(mutex);
1753 
1754  typedef map< string, set<string>, CSourceModParser::PKeyCompare> TMapModToValidValues;
1755  static TMapModToValidValues s_mapModToValidValues;
1756 
1757  // see if value is already calculated to try to save time
1758  TMapModToValidValues::const_iterator find_iter =
1759  s_mapModToValidValues.find(mod);
1760  if( find_iter != s_mapModToValidValues.end() ) {
1761  return find_iter->second;
1762  }
1763 
1764  // does canonical comparison, which goes a little beyond case-insensitivity
1766 
1767  // not cached, so we need to calculate it ourselves
1768  set<string> & set_valid_values = s_mapModToValidValues[mod];
1769  if( key_equal(mod, "topology") || key_equal(mod, "top") ) {
1770  set_valid_values.insert("linear");
1771  set_valid_values.insert("circular");
1772  } else if( key_equal(mod, "molecule") || key_equal(mod, "mol") ) {
1773  set_valid_values.insert("rna");
1774  set_valid_values.insert("dna");
1775  } else if( key_equal(mod, "moltype") || key_equal(mod, "mol-type") ) {
1776  // construct the possible bad values by hand
1777  ITERATE( TBiomolMap, map_iter, sc_BiomolMap ) {
1778  if( map_iter->second.m_eShown == SMolTypeInfo::eShown_Yes ) {
1779  set_valid_values.insert(map_iter->first);
1780  }
1781  }
1782  } else if( key_equal(mod, "strand") ) {
1783  set_valid_values.insert("single");
1784  set_valid_values.insert("double");
1785  set_valid_values.insert("mixed");
1786  } else if( key_equal(mod, "location") ) {
1787  set_valid_values.insert("mitochondrial");
1788  set_valid_values.insert("provirus");
1789  set_valid_values.insert("extrachromosomal");
1790  set_valid_values.insert("insertion sequence");
1791  } else if( key_equal(mod, "origin") ) {
1792  set_valid_values.insert("natural mutant");
1793  set_valid_values.insert("mutant");
1794  ITERATE(CEnumeratedTypeValues::TValues, enum_iter, CBioSource::ENUM_METHOD_NAME(EOrigin)()->GetValues()) {
1795  set_valid_values.insert( enum_iter->first );
1796  }
1797  } else if( key_equal(mod, "tech") ) {
1798  ITERATE(TTechMap, tech_it, sc_TechMap) {
1799  set_valid_values.insert(tech_it->first);
1800  }
1801  } else if( key_equal(mod, "completeness") || key_equal(mod, "completedness") ) {
1802  ITERATE( TCompletenessMap, comp_it, sc_CompletenessMap ) {
1803  set_valid_values.insert(comp_it->first);
1804  }
1805  } else {
1806  set_valid_values.insert("ERROR TRYING TO DETERMINE ALLOWED VALUES");
1807  }
1808 
1809  return set_valid_values;
1810 }
1811 
1812 // static
1813 const string &
1815 {
1816  // do not grab any other locks while in here (except the lock in
1817  // GetModAllowedValues)
1818  static CMutex mutex;
1819  CMutexGuard guard(mutex);
1820 
1821  typedef map<string, string> TMapModNameToStringOfAllAllowedValues;
1822  static TMapModNameToStringOfAllAllowedValues mapModNameToStringOfAllAllowedValues;
1823 
1824  // see if we've already cached the value
1825  TMapModNameToStringOfAllAllowedValues::const_iterator find_iter =
1826  mapModNameToStringOfAllAllowedValues.find(mod);
1827  if( find_iter != mapModNameToStringOfAllAllowedValues.end() ) {
1828  return find_iter->second;
1829  }
1830 
1831  // not loaded, so we need to calculate it
1832  string & sAllValuesAsOneString =
1833  mapModNameToStringOfAllAllowedValues[mod];
1834  const set<string> & setAllowedValues = GetModAllowedValues(mod);
1835  ITERATE( set<string>, value_it, setAllowedValues ) {
1836  if( ! sAllValuesAsOneString.empty() ) {
1837  sAllValuesAsOneString += ", ";
1838  }
1839  sAllValuesAsOneString += "'" + *value_it + "'";
1840  }
1841 
1842  return sAllValuesAsOneString;
1843 }
1844 
1846  const SMod& mod)
1847 {
1848  m_BadMods.insert(mod);
1849 
1851  return;
1852  }
1853 
1854  const string & sAllAllowedValues = GetModAllowedValuesAsOneString(mod.key);
1855 
1856  CBadModError badModError(mod, sAllAllowedValues);
1857 
1858  switch( m_HandleBadMod ) {
1859  case eHandleBadMod_Throw:
1860  throw badModError;
1862  cerr << badModError.what() << endl;
1863  break;
1867  eDiag_Warning,
1868  m_LineNumber,
1869  badModError.what(),
1871  x_ProcessError(*pErr);
1872  break;
1873  }
1874  default:
1875  _TROUBLE;
1876  }
1877 }
1878 
1880  const SMod& mod)
1881 {
1883  return;
1884  }
1885  if (m_pModFilter && !m_pModFilter->operator()(mod.key)) {
1886  return;
1887  }
1888  CUnkModError unkModError(mod);
1889 
1890  switch( m_HandleBadMod ) {
1891  case eHandleBadMod_Throw:
1892  throw unkModError;
1894  cerr << unkModError.what() << endl;
1895  break;
1899  eDiag_Warning,
1900  m_LineNumber,
1901  unkModError.what(),
1903  x_ProcessError(*pErr);
1904  break;
1905  }
1906  default:
1907  _TROUBLE;
1908  }
1909 }
1910 
1913 {
1914  if (!m_pErrorListener) {
1915  err.Throw();
1916  }
1917  if (!m_pErrorListener->PutError(err)) {
1921  0,
1922  "Error allowance exceeded",
1924  pErr->Throw();
1925  }
1926 }
1927 
1929 {
1930  CAutoInitDesc<CBioSource> ref(bsrc);
1931  x_ApplyMods(ref, organism);
1932 }
1933 
1934 
1936 {
1937  CAutoInitDesc<CMolInfo> ref(mi);
1938  x_ApplyMods(ref);
1939 }
1940 
1941 
1943 {
1944  CAutoInitDesc<CGB_block> ref(gbb);
1945  x_ApplyMods(ref);
1946 }
1947 
1949 {
1951  {
1952  // set iterators are const since changing an object could affect
1953  // its order in the set. However, in this case we know that
1954  // changing the `used` field won't affect the order so we know
1955  // that a const_cast to change it is safe to do.
1956  const_cast<SMod&>(*it).used = false;
1957  }
1958 }
1959 
1961 {
1962  SMod newmod(NStr::TruncateSpaces_Unsafe(name));
1964  newmod.used = false;
1965 
1966  return m_Mods.insert(newmod).second;
1967 }
1968 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:69
AutoPtr –.
Definition: ncbimisc.hpp:401
CRef< CSeq_descr > m_descr
Definition: Seq_descr.hpp:108
CSeqdesc & Set(bool skip_lookup=false)
Definition: Seq_descr.cpp:93
CSourceModParser.
CRef< CBioseq > m_bioseq
void _getfromdesc()
CAutoInitDesc(CSeq_descr &descr, CSeqdesc::E_Choice which)
CRef< CBioseq_set > m_bioset
bool IsInitialized(void) const
CAutoInitRef<>::
void Set(T *object)
Initialize with an existing object.
CConstRef< CBioseq_set > GetParentSet(void) const
Definition: Bioseq_set.cpp:312
bool IsNa(void) const
Definition: Bioseq.cpp:345
bool IsAa(void) const
Definition: Bioseq.cpp:350
Definition: Dbtag.hpp:53
CMutex –.
Definition: ncbimtx.hpp:749
void Throw(void) const
this function to throw this object.
Definition: line_error.cpp:440
static CObjReaderLineException * Create(EDiagSev eSeverity, unsigned int uLine, const std::string &strMessage, EProblem eProblem=eProblem_GeneralParsingError, const std::string &strSeqId=string(""), const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), CObjReaderLineException::EErrCode eErrCode=eFormat, const TVecOfLines &vecOfOtherLines=TVecOfLines())
Please use this instead of the constructor because the ctor is protected.
Definition: line_error.cpp:417
@OrgMod.hpp User-defined methods of the data storage class.
Definition: OrgMod.hpp:54
CPCRPrimerSet –.
CPCRPrimer –.
Definition: PCRPrimer.hpp:66
CPCRReaction –.
Definition: PCRReaction.hpp:66
Definition: Pub.hpp:56
CSafeStaticRef<>::
CSafeStatic<>::
CSeqIdException –.
Definition: Seq_id.hpp:1001
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
Root class for all serialization exceptions.
Definition: exception.hpp:50
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
Definition: static_map.hpp:105
TBase::const_iterator const_iterator
Definition: static_map.hpp:109
static bool IsMultipleValuesAllowed(TSubtype)
Definition: SubSource.cpp:208
static bool NeedsNoText(const TSubtype &subtype)
Definition: SubSource.cpp:233
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
virtual bool PutError(const ILineError &)=0
Store error in the container, and return true if error was stored fine, and return false if the calle...
@ eProblem_GeneralParsingError
Definition: line_error.hpp:105
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator begin() const
Definition: set.hpp:135
void clear()
Definition: set.hpp:153
const_iterator end() const
Definition: set.hpp:136
const_iterator lower_bound(const key_type &key) const
Definition: set.hpp:138
parent_type::const_iterator const_iterator
Definition: set.hpp:79
static bool key_equal(const KEY_T *a, const KEY_T *b)
Definition: dbpivot.c:472
static const struct name_t names[]
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static const char * str(char *buf, int n)
Definition: stats.c:84
static const char location[]
Definition: config.c:97
char data[12]
Definition: iconv.c:80
SStrictId_Entrez::TId TEntrezId
TEntrezId type for entrez ids which require the same strictness as TGi.
Definition: ncbimisc.hpp:1041
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
@ eDiag_Critical
Critical error message.
Definition: ncbidiag.hpp:654
#define NCBI_USER_THROW_FMT(message)
Throw a "user exception" with message processed as output to ostream.
Definition: ncbiexpt.hpp:724
list< pair< string, TEnumValueType > > TValues
Definition: enumvalues.hpp:54
const TValues & GetValues(void) const
Get the list of name-value pairs.
Definition: enumvalues.hpp:98
TPrim & Set(void)
Definition: serialbase.hpp:351
#define ENUM_METHOD_NAME(EnumName)
Definition: serialbase.hpp:994
void SetAllUnused()
Set all mods to unused.
static const unsigned char kKeyCanonicalizationTable[257]
pair< TModsCI, TModsCI > TModsRange
std::string x_CalculateErrorString(const SMod &badMod, const string &sAllowedValues)
ILineErrorListener * m_pErrorListener
void ApplyMods(CBioSource &bsrc, CTempString organism=kEmptyStr)
const SMod * FindMod(const CTempString &key, const CTempString &alt_key=CTempString())
If a modifier with either key is present, mark it as used and return it; otherwise,...
bool AddMods(const CTempString &name, const CTempString &value)
void x_ApplyDBLinkMods(CBioseq &bioseq)
void x_ProcessError(CObjReaderLineException &err)
TModsRange FindAllMods(const CTempString &key, const CTempString &alt_key)
Return all modifiers with the given key (e.g., db_xref), marking them as used along the way.
void x_AddPCRPrimers(CAutoInitRef< CPCRReactionSet > &pcr_reaction_set)
static const set< string > & GetModAllowedValues(const string &mod)
Given a mod name (e.g.
EHandleBadMod m_HandleBadMod
void ApplyPubMods(CBioseq &seq)
CRef< CModFilter > m_pModFilter
CConstRef< CSeq_id > seqid
static bool EqualKeys(const CTempString &lhs, const CTempString &rhs)
void GetLabel(string *s, TWhichMods which=fAllMods) const
Append a representation of the specified modifiers to s, with a space in between if s is not empty an...
CBadModError(const SMod &badMod, const std::string &sAllowedValues)
std::string x_CalculateErrorString(const SMod &unkMod)
void x_HandleUnkModValue(const SMod &mod)
static CSafeStatic< CSourceModParser::SMod > kEmptyMod
Used for passing an empty mod to some funcs without having to constantly recreate an empty one.
void x_HandleBadModValue(const SMod &mod)
static const string & GetModAllowedValuesAsOneString(const string &mod)
Same as GetModAllowedValues, but returns one string with all the values.
void x_ApplyTPAMods(CAutoInitRef< CUser_object > &tpa)
void ApplyAllMods(CBioseq &seq, CTempString organism=kEmptyStr, CConstRef< CSeq_loc > location=CConstRef< CSeq_loc >())
Apply previously extracted modifiers to the given object, marking all relevant ones as used.
void x_ApplyGenomeProjectsDBMods(CAutoInitRef< CUser_object > &gpdb)
string ParseTitle(const CTempString &title, CConstRef< CSeq_id > seqid, size_t iMaxModsToParse=std::numeric_limits< size_t >::max())
Extract and store bracketed modifiers from a title string, returning a stripped version (which may we...
void x_ApplyMods(CAutoInitDesc< CBioSource > &bsrc, CTempString organism)
TMods::const_iterator TModsCI
TMods GetMods(TWhichMods which=fAllMods) const
Return all modifiers matching the given criteria (if any) without affecting their status (used vs.
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
static int BestRank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:774
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
TObjectType * GetPointerOrNull(void) const THROWS_NONE
Get pointer value.
Definition: ncbiobj.hpp:1672
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static CTempString TruncateSpaces_Unsafe(const CTempString str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3191
#define kEmptyStr
Definition: ncbistr.hpp:123
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3201
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
Definition: ncbistr.hpp:5325
const char * data(void) const
Return a pointer to the array represented.
Definition: tempstr.hpp:313
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
Definition: tempstr.hpp:334
static CStringUTF8 AsUTF8(const CTempString &src, EEncoding encoding, EValidate validate=eNoValidate)
Convert into UTF8 from a C/C++ string.
Definition: ncbistr.hpp:3889
size_type length(void) const
Return the length of the represented array.
Definition: tempstr.hpp:320
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
Definition: ncbistr.cpp:642
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
Definition: tempstr.hpp:776
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
size_type find(const CTempString match, size_type pos=0) const
Find the first instance of the entire matching string within the current string, beginning at an opti...
Definition: tempstr.hpp:655
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3405
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:327
void Copy(string &dst, size_type pos, size_type len) const
Copy a substring into a string. Somewhat similar to basic_string::assign().
Definition: tempstr.hpp:525
static const size_type npos
Definition: tempstr.hpp:72
@ eEncoding_UTF8
Definition: ncbistr.hpp:201
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
@ fSplit_MergeDelimiters
Merge adjacent delimiters.
Definition: ncbistr.hpp:2498
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
Definition: ncbiutil.hpp:250
static const char label[]
void SetSubtype(TSubtype value)
Assign a value to Subtype data member.
Definition: SubSource_.hpp:319
TSubtype GetSubtype(void) const
Get the Subtype member data.
Definition: SubSource_.hpp:310
void SetForward(TForward &value)
Assign a value to Forward data member.
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: SubSource_.hpp:291
void SetReverse(TReverse &value)
Assign a value to Reverse data member.
void SetName(const TName &value)
Assign a value to Name data member.
Definition: SubSource_.hpp:359
Tdata & Set(void)
Assign a value to data member.
@ eSubtype_lat_lon
+/- decimal degrees
Definition: SubSource_.hpp:113
@ eOrigin_mut
artificially mutagenized
Definition: BioSource_.hpp:132
@ eOrigin_natmut
naturally occurring mutant
Definition: BioSource_.hpp:131
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
void SetTag(TTag &value)
Assign a value to Tag data member.
Definition: Dbtag_.cpp:66
const TStrs & GetStrs(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
E_Choice Which(void) const
Which variant is currently selected.
Definition: Object_id_.hpp:235
TData & SetData(void)
Assign a value to Data data member.
void SetNum(TNum value)
Assign a value to Num data member.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
void SetLabel(TLabel &value)
Assign a value to Label data member.
TStr & SetStr(void)
Select the variant.
Definition: Object_id_.hpp:304
void SetType(TType &value)
Assign a value to Type data member.
void SetData(TData &value)
Assign a value to Data data member.
const TType & GetType(void) const
Get the Type member data.
void SetDb(const TDb &value)
Assign a value to Db data member.
Definition: Dbtag_.hpp:229
vector< CRef< CUser_field > > TData
@ e_not_set
No variant selected.
Definition: Object_id_.hpp:89
void SetSubtype(TSubtype value)
Assign a value to Subtype data member.
Definition: OrgMod_.hpp:316
void SetSubname(const TSubname &value)
Assign a value to Subname data member.
Definition: OrgMod_.hpp:356
@ eSubtype_sub_species
Definition: OrgMod_.hpp:105
@ eSubtype_nat_host
natural host of this specimen
Definition: OrgMod_.hpp:104
TPmid & SetPmid(void)
Select the variant.
Definition: Pub_.hpp:690
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
bool IsSetClass(void) const
Check if a value has been assigned to Class data member.
TClass GetClass(void) const
Get the Class member data.
@ eClass_nuc_prot
nuc acid and coded proteins
Definition: Bioseq_set_.hpp:99
void SetPub(TPub &value)
Assign a value to Pub data member.
Definition: Pubdesc_.cpp:72
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
TPub & SetPub(void)
Select the variant.
Definition: Seqdesc_.cpp:362
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
Definition: Bioseq_.hpp:372
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
bool IsSetHist(void) const
Sequence history. Check if a value has been assigned to Hist data member.
Definition: Seq_inst_.hpp:847
TComment & SetComment(void)
Select the variant.
Definition: Seqdesc_.hpp:1065
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
E_Choice
Choice variants.
Definition: Seqdesc_.hpp:109
EMol
molecule class in living organism
Definition: Seq_inst_.hpp:108
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
TUser & SetUser(void)
Select the variant.
Definition: Seqdesc_.cpp:390
Tdata & Set(void)
Assign a value to data member.
Definition: Seq_descr_.hpp:172
list< CRef< CSeq_annot > > TAnnot
Definition: Bioseq_.hpp:97
@ eCompleteness_has_left
5' or NH3 end present
Definition: MolInfo_.hpp:161
@ eCompleteness_complete
complete biological entity
Definition: MolInfo_.hpp:156
@ eCompleteness_has_right
3' or COOH end present
Definition: MolInfo_.hpp:162
@ eCompleteness_no_left
missing 5' or NH3 end
Definition: MolInfo_.hpp:158
@ eCompleteness_partial
partial but no details given
Definition: MolInfo_.hpp:157
@ eCompleteness_no_right
missing 3' or COOH end
Definition: MolInfo_.hpp:159
@ eCompleteness_no_ends
missing both ends
Definition: MolInfo_.hpp:160
@ eTech_htgs_2
ordered High Throughput sequence contig
Definition: MolInfo_.hpp:138
@ eTech_physmap
from physical mapping techniques
Definition: MolInfo_.hpp:129
@ eTech_htc
high throughput cDNA
Definition: MolInfo_.hpp:142
@ eTech_both
concept transl. w/ partial pept. seq.
Definition: MolInfo_.hpp:133
@ eTech_targeted
targeted locus sets/studies
Definition: MolInfo_.hpp:147
@ eTech_seq_pept_homol
sequenced peptide, ordered by homology
Definition: MolInfo_.hpp:135
@ eTech_composite_wgs_htgs
composite of WGS and HTGS
Definition: MolInfo_.hpp:145
@ eTech_sts
Sequence Tagged Site.
Definition: MolInfo_.hpp:126
@ eTech_htgs_3
finished High Throughput sequence
Definition: MolInfo_.hpp:139
@ eTech_seq_pept_overlap
sequenced peptide, ordered by overlap
Definition: MolInfo_.hpp:134
@ eTech_htgs_1
unordered High Throughput sequence contig
Definition: MolInfo_.hpp:137
@ eTech_concept_trans
conceptual translation
Definition: MolInfo_.hpp:131
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ eTech_standard
standard sequencing
Definition: MolInfo_.hpp:124
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eTech_seq_pept
peptide was sequenced
Definition: MolInfo_.hpp:132
@ eTech_survey
one-pass genomic sequence
Definition: MolInfo_.hpp:127
@ eTech_barcode
barcode of life project
Definition: MolInfo_.hpp:144
@ eTech_htgs_0
single genomic reads for coordination
Definition: MolInfo_.hpp:141
@ eTech_derived
derived from other data, not a primary entity
Definition: MolInfo_.hpp:130
@ eTech_fli_cdna
full length insert cDNA
Definition: MolInfo_.hpp:140
@ eTech_est
Expressed Sequence Tag.
Definition: MolInfo_.hpp:125
@ eTech_concept_trans_a
conceptual transl. supplied by author
Definition: MolInfo_.hpp:136
@ eTech_genemap
from genetic mapping techniques
Definition: MolInfo_.hpp:128
@ eBiomol_pre_RNA
precursor RNA of any sort really
Definition: MolInfo_.hpp:102
@ eBiomol_cRNA
viral RNA genome copy intermediate
Definition: MolInfo_.hpp:111
@ eBiomol_transcribed_RNA
transcribed RNA other than existing classes
Definition: MolInfo_.hpp:113
@ eBiomol_other_genetic
other genetic material
Definition: MolInfo_.hpp:109
@ e_Genbank
GenBank specific info.
Definition: Seqdesc_.hpp:121
@ e_Molinfo
info on the molecule and techniques
Definition: Seqdesc_.hpp:134
@ e_not_set
No variant selected.
Definition: Seqdesc_.hpp:110
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
@ eStrand_ds
double strand
Definition: Seq_inst_.hpp:136
@ eStrand_ss
single strand
Definition: Seq_inst_.hpp:135
@ e_not_set
int i
static const CS_INT unused
Definition: long_binary.c:20
range(_Ty, _Ty) -> range< _Ty >
const struct ncbi::grid::netcache::search::fields::KEY key
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
#define nullptr
Definition: ncbimisc.hpp:45
Useful/utility classes and methods.
T max(T x_, T y_)
Int mod(Int i, Int j)
Definition: njn_integer.hpp:67
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static const GLdouble origin[]
#define FIELD_CHAIN_OF_2_IS_SET(Var, Fld1, Fld2)
FIELD_CHAIN_OF_2_IS_SET.
SStaticPair< const char *, SMolTypeInfo > TBiomolMapEntry
static void s_SetDBLinkField(const string &label, const string &vals, CRef< CSeqdesc > &pDBLinkDesc, CBioseq &bioseq)
DEFINE_STATIC_ARRAY_MAP(TBiomolMap, sc_BiomolMap, sc_BiomolArray)
CStaticPairArrayMap< const char *, CMolInfo::TCompleteness, CSourceModParser::PKeyCompare > TCompletenessMap
static const TTechMapEntry sc_TechArray[]
static void s_GetPrimerInfo(const CSourceModParser::SMod *pNamesMod, const CSourceModParser::SMod *pSeqsMod, vector< pair< string, string >> &reaction_info)
static void s_AddPrimers(const pair< string, string > &primer_info, CPCRPrimerSet &primer_set)
static void s_PopulateUserObject(CUser_object &uo, const string &type, CUser_object::TData &data)
static CRef< CSeqdesc > s_SetDBLinkDesc(CBioseq &bioseq)
static const TBiomolMapEntry sc_BiomolArray[]
SStaticPair< const char *, CMolInfo::TTech > TTechMapEntry
static void s_ApplyPubMods(CBioseq &bioseq, const CSourceModParser::TModsRange &range)
static void s_SetDBLinkFieldVals(const string &label, const list< CTempString > &vals, CSeqdesc &dblink_desc)
static const TCompletenessMapEntry sc_CompletenessArray[]
SStaticPair< const char *, CMolInfo::TCompleteness > TCompletenessMapEntry
CSafeStaticRef< CSeq_descr > fake_descr
CStaticPairArrayMap< const char *, SMolTypeInfo, CSourceModParser::PKeyCompare > TBiomolMap
CStaticPairArrayMap< const char *, CMolInfo::TTech, CSourceModParser::PKeyCompare > TTechMap
#define STATIC_SMOD(key_str)
CSeq_inst::EMol m_eMol
SMolTypeInfo(EShown eShown, CMolInfo::TBiomol eBiomol, CSeq_inst::EMol eMol)
CMolInfo::TBiomol m_eBiomol
SSeqIdRange –.
Definition: Seq_id.hpp:895
Template structure SStaticPair is a simplified replacement of STL pair<>. Main reason of introducing this...
Definition: static_set.hpp:60
Definition: type.c:6
#define _TROUBLE
#define _ASSERT
#define ftable
Definition: utilfeat.h:37
Modified on Wed Jun 19 17:07:43 2024 by modify_doxy.py rev. 669887