NCBI C++ ToolKit
autodef_mod_combo.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: autodef_mod_combo.cpp 93413 2021-04-09 19:12:01Z stakhovv $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Colleen Bollin
27 *
28 * File Description:
29 * Generate unique definition lines for a set of sequences using organism
30 * descriptions and feature clauses.
31 */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbimisc.hpp>
35 #include <objmgr/seqdesc_ci.hpp>
36 #include <objmgr/bioseq_ci.hpp>
37 #include <objmgr/feat_ci.hpp>
38 #include <objmgr/util/feature.hpp>
39 
41 #include <objects/seq/Seqdesc.hpp>
42 #include <objects/seq/Bioseq.hpp>
43 
44 #include <serial/iterator.hpp>
45 
47 
50 
52  m_MaxModifiers(-99),
    // NOTE(review): -99 looks like a "no limit configured" sentinel for the modifier cap - confirm where m_MaxModifiers is consumed.
53  m_AllowModAtEndOfTaxname(false),
54  m_KeepCountryText(false),
55  m_ExcludeSpOrgs(true),
56  m_ExcludeCfOrgs(false),
57  m_ExcludeNrOrgs(false),
58  m_ExcludeAffOrgs(false),
59  m_KeepParen(true),
60  m_KeepAfterSemicolon(false),
61  m_HIVCloneIsolateRule(CAutoDefOptions::eWantBoth)
62 
63 {
    // Start with no requested qualifiers, no source groups and no recorded modifiers.
64  m_SubSources.clear();
65  m_OrgMods.clear();
66 
67  m_GroupList.clear();
68  m_Modifiers.clear();
69 }
70 
71 
73 {
    // Builds this combo from *orig: the containers are emptied, refilled through
    // orig's accessors, and then every option flag is copied over.
74  _ASSERT (orig);
75  m_SubSources.clear();
76  m_OrgMods.clear();
77 
78  m_GroupList.clear();
79  m_Modifiers.clear();
80 
    // Each group is duplicated into a newly allocated CAutoDefSourceGroup
    // instead of storing orig's group object itself.
81  for (auto it : orig->GetGroupList()) {
82  m_GroupList.emplace_back (new CAutoDefSourceGroup(*it));
83  }
    // NOTE(review): the header of the loop that feeds the next statement is not
    // visible in this listing; presumably it walks orig->GetModifiers() - confirm.
85  m_Modifiers.push_back (CAutoDefSourceModifierInfo(*it));
86  }
87 
    // Requested subsource and orgmod types are copied element by element, in order.
88  unsigned int k;
89  for (k = 0; k < orig->GetNumSubSources(); k++) {
90  m_SubSources.push_back(orig->GetSubSource(k));
91  }
92 
93  for (k = 0; k < orig->GetNumOrgMods(); k++) {
94  m_OrgMods.push_back(orig->GetOrgMod(k));
95  }
96 
    // Scalar options are taken verbatim from orig's getters.
97  m_UseModifierLabels = orig->GetUseModifierLabels();
98  m_KeepCountryText = orig->GetKeepCountryText();
99  m_ExcludeSpOrgs = orig->GetExcludeSpOrgs();
100  m_ExcludeCfOrgs = orig->GetExcludeCfOrgs();
101  m_ExcludeNrOrgs = orig->GetExcludeNrOrgs();
102  m_ExcludeAffOrgs = orig->GetExcludeAffOrgs();
103  m_KeepParen = orig->GetKeepParen();
104  m_KeepAfterSemicolon = orig->GetKeepAfterSemicolon();
105  m_HIVCloneIsolateRule = orig->GetHIVCloneIsolateRule();
106  m_AllowModAtEndOfTaxname = orig->GetAllowModAtEndOfTaxname();
107  m_MaxModifiers = orig->GetMaxModifiers();
108 }
109 
110 
112 {
    // Empty body: no explicit work is performed here.
    // NOTE(review): the signature is not visible in this listing; presumably the destructor - confirm.
113 }
114 
115 
117 {
    // Loads settings from a CAutoDefOptions object into this combo.
    // NOTE(review): several assignments between the two below and the qualifier
    // loops are not visible in this listing.
118  m_UseModifierLabels = options.GetUseLabels();
119  m_MaxModifiers = options.GetMaxMods();
129 
    // Every configured subsource is registered through AddQual with the org-mod
    // flag off; the trailing 'true' makes AddQual record the qualifier even when
    // it does not split any group.
130  const CAutoDefOptions::TSubSources& subsrcs = options.GetSubSources();
131  ITERATE(CAutoDefOptions::TSubSources, it, subsrcs) {
132  AddQual(false, *it, true);
133  }
    // Same for every configured orgmod, with the org-mod flag on.
134  const CAutoDefOptions::TOrgMods& orgmods = options.GetOrgMods();
135  ITERATE(CAutoDefOptions::TOrgMods, it, orgmods) {
136  AddQual(true, *it, true);
137  }
138 }
139 
140 
142 {
    // Writes this combo's settings into 'options'.
    // NOTE(review): most of the setter calls and the header of the loop below are
    // not visible in this listing.
144  options.SetMaxMods(m_MaxModifiers);
154 
155  // add subsources and orgmods
    // Each visited entry is routed to AddOrgMod or AddSubSource depending on
    // its IsOrgMod() flag.
157  if (it->IsOrgMod()) {
158  options.AddOrgMod(it->GetSubtype());
159  } else {
160  options.AddSubSource(it->GetSubtype());
161  }
162  }
163 }
164 
165 
167 {
168  return m_GroupList.size();
169 }
170 
171 
173 {
174  return m_SubSources.size();
175 }
176 
177 
179 {
180  _ASSERT (index < m_SubSources.size());
181 
182  return m_SubSources[index];
183 }
184 
185 
187 {
188  return m_OrgMods.size();
189 }
190 
191 
193 {
194  _ASSERT (index < m_OrgMods.size());
195 
196  return m_OrgMods[index];
197 }
198 
199 
201 {
202  for (unsigned int k = 0; k < m_SubSources.size(); k++) {
203  if (m_SubSources[k] == st) {
204  return true;
205  }
206  }
207  return false;
208 }
209 
210 
212 {
213  for (unsigned int k = 0; k < m_OrgMods.size(); k++) {
214  if (m_OrgMods[k] == st) {
215  return true;
216  }
217  }
218  return false;
219 }
220 
221 
// Wraps the biosource and its feature clauses in a CAutoDefSourceDescription and
// files it under every existing group whose first member compares equal to it.
// If no group accepts it, the description is placed in a group of its own.
222 void CAutoDefModifierCombo::AddSource(const CBioSource& bs, const string& feature_clauses)
223 {
224  CRef<CAutoDefSourceDescription> src(new CAutoDefSourceDescription(bs, feature_clauses));
225  bool found = false;
226 
    // Only the first description of each non-empty group is compared. There is
    // no break, so the scan continues after a match.
227  for (auto it : m_GroupList) {
228  if (it->GetSrcList().size() > 0
229  && src->Compare (**(it->GetSrcList().begin())) == 0) {
230  it->AddSource (src);
231  found = true;
232  }
233  }
234  if (!found) {
    // NOTE(review): the declaration of 'g' is not visible in this listing;
    // presumably a freshly created CAutoDefSourceGroup - confirm.
236  g->AddSource (src);
237  m_GroupList.push_back (g);
238  }
239 }
240 
241 
242 void CAutoDefModifierCombo::AddSubsource(CSubSource::ESubtype st, bool even_if_not_uniquifying)
243 {
244  AddQual(false, st, even_if_not_uniquifying);
245 }
246 
247 
248 void CAutoDefModifierCombo::AddOrgMod(COrgMod::ESubtype st, bool even_if_not_uniquifying)
249 {
250  AddQual (true, st, even_if_not_uniquifying);
251 }
252 
253 
255 {
    // Picks the label text that precedes a subsource value. A few subtypes have
    // a fixed label regardless of m_UseModifierLabels.
    // NOTE(review): the first 'if' of this chain and the statement inside the
    // m_UseModifierLabels branch are not visible in this listing.
256  string label;
258  label = "endogenous virus";
259  } else if (st == CSubSource::eSubtype_transgenic) {
260  label = "transgenic";
261  } else if (st == CSubSource::eSubtype_plasmid_name) {
262  label = "plasmid";
263  } else if (st == CSubSource::eSubtype_country) {
264  label = "from";
265  } else if (st == CSubSource::eSubtype_segment) {
266  label = "segment";
267  } else if (m_UseModifierLabels) {
269  }
    // A non-blank label is returned with one leading space so that it can be
    // appended directly to the description.
270  if (!NStr::IsBlank(label)) {
271  label = " " + label;
272  }
273  return label;
274 }
275 
276 
278 {
    // Picks the label text that precedes an orgmod value; nat_host always
    // uses "from".
    // NOTE(review): the statement inside the m_UseModifierLabels branch is not
    // visible in this listing.
279  string label;
280  if (st == COrgMod::eSubtype_nat_host) {
281  label = "from";
282  } else if (m_UseModifierLabels) {
284  }
    // A non-blank label is returned with one leading space so that it can be
    // appended directly to the description.
285  if (!NStr::IsBlank(label)) {
286  label = " " + label;
287  }
288  return label;
289 }
290 
291 
292 /* This function fixes HIV abbreviations, removes items in parentheses,
293  * and trims spaces around the taxonomy name.
294  */
296 {
297  if (NStr::Equal(tax_name, "Human immunodeficiency virus type 1", NStr::eNocase)
298  || NStr::Equal(tax_name, "Human immunodeficiency virus 1", NStr::eNocase)) {
299  tax_name = "HIV-1";
300  } else if (NStr::Equal(tax_name, "Human immunodeficiency virus type 2", NStr::eNocase)
301  || NStr::Equal(tax_name, "Human immunodeficiency virus 2", NStr::eNocase)) {
302  tax_name = "HIV-2";
303  } else if (!m_KeepParen) {
304  string::size_type pos = NStr::Find(tax_name, "(");
305  if (pos != NCBI_NS_STD::string::npos) {
306  tax_name = tax_name.substr(0, pos);
307  NStr::TruncateSpacesInPlace(tax_name);
308  }
309  }
310 }
311 
313 {
    // Reports whether the biosource carries at least one subsource of the
    // given subtype. A source with no subsource list cannot match.
314  if (!src.IsSetSubtype()) {
315  return false;
316  }
    // NOTE(review): the loop header is not visible in this listing; presumably
    // it iterates src.GetSubtype() - confirm.
318  if ((*it)->IsSetSubtype() && (*it)->GetSubtype() == subtype) {
319  return true;
320  }
321  }
322  return false;
323 }
324 
325 
327 {
328  if (!src.IsSetOrg() || !src.GetOrg().IsSetOrgname() || !src.GetOrg().GetOrgname().IsSetMod()) {
329  return false;
330  }
331  ITERATE(COrgName::TMod, it, src.GetOrg().GetOrgname().GetMod()) {
332  if ((*it)->IsSetSubtype() && (*it)->GetSubtype() == subtype) {
333  return true;
334  }
335  }
336  return false;
337 
338 }
339 
340 
// Appends label and value of every subsource of type 'st' found on 'bsrc' to
// 'source_description'. Returns true when at least one such subsource was seen.
341 bool CAutoDefModifierCombo::x_AddSubsourceString (string &source_description, const CBioSource& bsrc, CSubSource::ESubtype st)
342 {
343  bool used = false;
344 
345  if (!bsrc.IsSetSubtype()) {
346  return false;
347  }
348  ITERATE (CBioSource::TSubtype, subSrcI, bsrc.GetSubtype()) {
349  if ((*subSrcI)->IsSetSubtype() && (*subSrcI)->GetSubtype() == st) {
    // The label is appended before the value is examined, so it is emitted
    // even when the value ends up blank.
350  source_description += x_GetSubSourceLabel (st);
351 
352  string val = (*subSrcI)->GetName();
353  // truncate value at first semicolon
354  if (!m_KeepAfterSemicolon) {
355  string::size_type pos = NStr::Find(val, ";");
356  if (pos != NCBI_NS_STD::string::npos) {
357  val = val.substr(0, pos);
358  }
359  }
360 
361  // if country and not keeping text after colon, truncate after colon
    // NOTE(review): the first half of the condition below is not visible in
    // this listing; per the comment above it tests for the country subtype.
363  && ! m_KeepCountryText) {
364  string::size_type pos = NStr::Find(val, ":");
365  if (pos != NCBI_NS_STD::string::npos) {
366  val = val.substr(0, pos);
367  }
    // A plasmid literally called "unnamed" (any case) contributes no value text.
368  } else if (st == CSubSource::eSubtype_plasmid_name && NStr::EqualNocase(val, "unnamed")) {
369  val.clear();
370  }
371  if (!NStr::IsBlank(val)) {
372  source_description += " " + val;
373  }
    // No break: every matching subsource on the biosource is appended.
374  used = true;
375  }
376  }
377  return used;
378 }
379 
380 bool CAutoDefModifierCombo::IsModifierInString(const string& find_this, const string& find_in, bool ignore_at_end)
381 {
382  size_t pos = NStr::Find(find_in, find_this);
383  if (pos == string::npos) {
384  return false;
385  }
386 
387  bool keep_looking = false;
388  // need to make sure it's a whole word and not a partial match
389  if (pos != 0 && find_in.c_str()[pos - 1] != '(' && find_in.c_str()[pos - 1] != ' ') {
390  // not whole word
391  keep_looking = true;
392  } else if (find_in.c_str()[pos + find_this.length()] != ')' &&
393  find_in.c_str()[pos + find_this.length()] != ' ' &&
394  find_in.c_str()[pos + find_this.length()] != 0) {
395  // not whole word
396  keep_looking = true;
397  }
398 
399  bool at_end = (pos == find_in.length() - find_this.length());
400 
401  if (keep_looking) {
402  if (at_end) {
403  return false;
404  } else {
405  return IsModifierInString(find_this, find_in.substr(pos + 1), ignore_at_end);
406  }
407  } else if (at_end && ignore_at_end) {
408  return false;
409  } else {
410  return true;
411  }
412 }
413 
414 
// Appends label and value of the first usable orgmod of type 'st' found on
// 'bsrc' to 'source_description'. Returns true when something was appended.
415 bool CAutoDefModifierCombo::x_AddOrgModString (string &source_description, const CBioSource& bsrc, COrgMod::ESubtype st)
416 {
417  bool used = false;
418 
419  if (!bsrc.IsSetOrg() || !bsrc.GetOrg().IsSetOrgname() || !bsrc.GetOrg().GetOrgname().IsSetMod()) {
420  return false;
421  }
422 
423  ITERATE (COrgName::TMod, modI, bsrc.GetOrg().GetOrgname().GetMod()) {
424  if ((*modI)->IsSetSubtype() && (*modI)->GetSubtype() == st) {
425 
426  string val = (*modI)->GetSubname();
427  // truncate value at first semicolon
428  if (!m_KeepAfterSemicolon) {
429  string::size_type pos = NStr::Find(val, ";");
430  if (pos != NCBI_NS_STD::string::npos) {
431  val = val.substr(0, pos);
432  }
433  }
434 
    // 9 == strlen("personal:"); the prefix is dropped from specimen vouchers.
435  if (st == COrgMod::eSubtype_specimen_voucher && NStr::StartsWith (val, "personal:")) {
436  val = val.substr(9);
437  }
438  // If modifier is one of the following types and the value already appears in the tax Name,
439  // don't use in the organism description
    // NOTE(review): several subtypes of this condition and its closing test are
    // not visible in this listing.
440  if ((st == COrgMod::eSubtype_strain
443  || st == COrgMod::eSubtype_forma
447  || st == COrgMod::eSubtype_isolate)
449  // can't use this
450  } else {
451  source_description += x_GetOrgModLabel(st);
452 
453  source_description += " ";
454  source_description += val;
455  used = true;
    // Only the first usable modifier of this type is emitted.
456  break;
457  }
458  }
459  }
460  return used;
461 }
462 
463 
465 {
466  bool has_tricky = false;
467 
468  for (unsigned int k = 0; k < m_GroupList.size() && !has_tricky; k++) {
469  has_tricky = m_GroupList[k]->HasTrickyHIV();
470  }
471  return has_tricky;
472 }
473 
474 
// For HIV-1 / HIV-2 sources, adds country plus a selection of clone, isolate
// and strain to the extra qualifier sets. Sources whose cleaned-up taxname is
// anything else are left untouched.
475 void CAutoDefModifierCombo::x_AddHIVModifiers(TExtraOrgMods& extra_orgmods, TExtraSubSrcs& extra_subsrcs, const CBioSource& bsrc)
476 {
477  bool src_has_clone = false;
478  bool src_has_isolate = false;
479  bool src_has_strain = false;
480 
481  if (!bsrc.IsSetOrg() || !bsrc.GetOrg().IsSetTaxname()) {
482  return;
483  }
    // x_CleanUpTaxName maps the long HIV names to "HIV-1" / "HIV-2", so the
    // comparison below is made against the abbreviations.
484  string source_description = bsrc.GetOrg().GetTaxname();
485  x_CleanUpTaxName(source_description);
486  if (!NStr::Equal(source_description, "HIV-1") &&
487  !NStr::Equal(source_description, "HIV-2")) {
488  return;
489  }
490 
    // Country is always requested for HIV sources.
491  if (extra_subsrcs.find(CSubSource::eSubtype_country) == extra_subsrcs.end()) {
492  extra_subsrcs.insert(TExtraSubSrc(CSubSource::eSubtype_country, true));
493  }
494 
495  src_has_clone = x_BioSourceHasSubSrc(bsrc, CSubSource::eSubtype_clone);
496  src_has_isolate = x_BioSourceHasOrgMod(bsrc, COrgMod::eSubtype_isolate);
497  src_has_strain = x_BioSourceHasOrgMod(bsrc, COrgMod::eSubtype_strain);
498 
    // If clone, isolate or strain is already among the requested qualifiers
    // and present on this source, nothing more needs to be added.
499  if ((HasSubSource (CSubSource::eSubtype_clone) && src_has_clone) ||
500  (HasOrgMod (COrgMod::eSubtype_isolate) && src_has_isolate) ||
501  (HasOrgMod (COrgMod::eSubtype_strain) && src_has_strain)) {
502  // no additional changes - isolate and clone rule taken care of
503  } else {
504  bool use_isolate = false;
    // NOTE(review): the middle of the next condition is not visible in this
    // listing; presumably it consults m_HIVCloneIsolateRule - confirm.
505  if ( ! HasOrgMod (COrgMod::eSubtype_isolate) && src_has_isolate
508  || !src_has_clone)) {
509  if (extra_orgmods.find(COrgMod::eSubtype_isolate) == extra_orgmods.end()) {
510  extra_orgmods.insert(TExtraOrgMod(COrgMod::eSubtype_isolate, true));
511  use_isolate = true;
512  }
513  }
    // Strain is only considered when isolate was not added just above.
514  if (!HasOrgMod(COrgMod::eSubtype_strain) && src_has_strain &&
515  !use_isolate) {
516  if (extra_orgmods.find(COrgMod::eSubtype_strain) == extra_orgmods.end()) {
517  extra_orgmods.insert(TExtraOrgMod(COrgMod::eSubtype_strain, true));
518  }
519  }
    // NOTE(review): the middle of the next condition is not visible in this
    // listing either.
520  if (! HasSubSource(CSubSource::eSubtype_clone) && src_has_clone
523  || (!src_has_isolate && !src_has_strain))) {
524  if (extra_subsrcs.find(CSubSource::eSubtype_clone) == extra_subsrcs.end()) {
525  extra_subsrcs.insert(TExtraSubSrc(CSubSource::eSubtype_clone, true));
526  }
527  }
528  }
529 }
530 
531 
533 {
534  if (NStr::StartsWith(taxname, "Influenza A virus", NStr::eNocase)) {
535  return eInfluenzaA;
536  } else if (NStr::StartsWith(taxname, "Influenza B virus", NStr::eNocase)) {
537  return eInfluenzaB;
538  } else if (NStr::StartsWith(taxname, "Influenza C virus", NStr::eNocase)) {
539  return eInfluenzaC;
540  } else if (NStr::StartsWith(taxname, "Influenza D virus", NStr::eNocase)) {
541  return eInfluenzaD;
542  } else {
543  return eNotInfluenza;
544  }
545 }
546 
547 
549 {
550  if (influenza_type == eNotInfluenza) {
551  return false;
552  } else if (subtype == CSubSource::eSubtype_clone || subtype == CSubSource::eSubtype_segment) {
553  return true;
554  } else {
555  return false;
556  }
557 }
558 
559 
561 {
562  if (influenza_type == eNotInfluenza) {
563  return false;
564  } else if (subtype == COrgMod::eSubtype_strain) {
565  return true;
566  } else if (subtype == COrgMod::eSubtype_serotype && influenza_type == eInfluenzaA) {
567  return true;
568  } else {
569  return false;
570  }
571 
572 }
573 
574 
576 {
577  switch (influenza_type) {
578  case eInfluenzaA:
579  if (extra_orgmods.find(COrgMod::eSubtype_strain) == extra_orgmods.end()) {
580  extra_orgmods.insert(TExtraOrgMod(COrgMod::eSubtype_strain, true));
581  }
582  if (extra_orgmods.find(COrgMod::eSubtype_serotype) == extra_orgmods.end()) {
583  extra_orgmods.insert(TExtraOrgMod(COrgMod::eSubtype_serotype, true));
584  }
585  if (extra_subsrcs.find(CSubSource::eSubtype_clone) == extra_subsrcs.end()) {
586  extra_subsrcs.insert(TExtraSubSrc(CSubSource::eSubtype_clone, true));
587  }
588  if (extra_subsrcs.find(CSubSource::eSubtype_segment) == extra_subsrcs.end()) {
589  extra_subsrcs.insert(TExtraSubSrc(CSubSource::eSubtype_segment, true));
590  }
591  break;
592  case eInfluenzaB:
593  case eInfluenzaC:
594  case eInfluenzaD:
595  if (extra_orgmods.find(COrgMod::eSubtype_strain) == extra_orgmods.end()) {
596  extra_orgmods.insert(TExtraOrgMod(COrgMod::eSubtype_strain, true));
597  }
598  if (extra_subsrcs.find(CSubSource::eSubtype_clone) == extra_subsrcs.end()) {
599  extra_subsrcs.insert(TExtraSubSrc(CSubSource::eSubtype_clone, true));
600  }
601  if (extra_subsrcs.find(CSubSource::eSubtype_segment) == extra_subsrcs.end()) {
602  extra_subsrcs.insert(TExtraSubSrc(CSubSource::eSubtype_segment, true));
603  }
604  break;
605  case eNotInfluenza:
606  break;
607  }
608 }
609 
610 
611 
613 {
614  bool default_exclude = true;
615 
616  for (unsigned int k = 0; k < m_GroupList.size() && default_exclude; k++) {
617  default_exclude = m_GroupList[k]->GetDefaultExcludeSp ();
618  }
619  return default_exclude;
620 }
621 
622 
624 {
    // Makes sure the always-required subsources are present in extra_subsrcs.
    // NOTE(review): the statements inside the first three 'if' blocks are not
    // visible in this listing; presumably each inserts the subtype it just
    // looked up, as the segment case at the bottom does - confirm.
625  if (extra_subsrcs.find(CSubSource::eSubtype_transgenic) == extra_subsrcs.end()) {
627  }
628 
629  if (extra_subsrcs.find(CSubSource::eSubtype_plasmid_name) == extra_subsrcs.end()) {
631  }
632 
633  if (extra_subsrcs.find(CSubSource::eSubtype_endogenous_virus_name) == extra_subsrcs.end()) {
635  }
636 
    // Segment is required only when the taxname starts with "Influenza "
    // (case-sensitive match, trailing space included).
637  if (extra_subsrcs.find(CSubSource::eSubtype_segment) == extra_subsrcs.end() &&
638  bsrc.IsSetOrg() && bsrc.GetOrg().IsSetTaxname() && NStr::StartsWith(bsrc.GetOrg().GetTaxname(), "Influenza ")) {
639  extra_subsrcs.insert(TExtraSubSrc(CSubSource::eSubtype_segment, true));
640  }
641 }
642 
643 
645 {
646  if (!bsrc.IsSetOrg() || !bsrc.GetOrg().IsSetOrgname() || !bsrc.GetOrg().GetOrgname().IsSetMod()) {
647  return false;
648  }
649 
650  ITERATE(COrgName::TMod, it, bsrc.GetOrg().GetOrgname().GetMod()) {
651  if ((*it)->IsSetSubtype() && (*it)->GetSubtype() == COrgMod::eSubtype_other &&
652  (*it)->IsSetSubname() && NStr::FindNoCase((*it)->GetSubname(), "type strain of") != string::npos) {
653  return true;
654  }
655  }
656  return false;
657 }
658 
659 
661 {
662  if (x_HasTypeStrainComment(bsrc)) {
663  if (extra_orgmods.find(COrgMod::eSubtype_strain) == extra_orgmods.end()) {
664  extra_orgmods.insert(TExtraOrgMod(COrgMod::eSubtype_strain, true));
665  }
666  }
667 }
668 
669 
// One entry of the preferred-qualifier table: a subtype value plus a flag that
// tells which enumeration the value belongs to (COrgMod when is_orgmod is
// true, CSubSource otherwise).
// NOTE(review): the closing line of this typedef is not visible in this
// listing; the uses below name the type SPreferredQual.
670 typedef struct {
671  size_t subtype;
672  bool is_orgmod;
674 
// Qualifiers in the order in which they are appended to a source description
// (this table is walked front to back when the description is built).
// NOTE(review): several rows are not visible in this listing.
675 static const SPreferredQual s_PreferredList[] = {
678  { COrgMod::eSubtype_strain, true },
679  { COrgMod::eSubtype_isolate, true },
680  { COrgMod::eSubtype_cultivar, true },
682  { COrgMod::eSubtype_ecotype, true },
684  { COrgMod::eSubtype_breed, true},
685 
687  { COrgMod::eSubtype_biotype, true },
688  { COrgMod::eSubtype_biovar, true },
689  { COrgMod::eSubtype_chemovar, true },
690  { COrgMod::eSubtype_pathovar, true },
691  { COrgMod::eSubtype_serogroup, true },
692  { COrgMod::eSubtype_serovar, true },
693  { COrgMod::eSubtype_substrain, true },
694 
697  { CSubSource::eSubtype_clone, false } ,
698  { CSubSource::eSubtype_haplotype, false } ,
699 
700  { CSubSource::eSubtype_cell_line, false } ,
702  { CSubSource::eSubtype_country, false } ,
703  { CSubSource::eSubtype_dev_stage, false } ,
704  { CSubSource::eSubtype_genotype, false } ,
706 
708  { CSubSource::eSubtype_map, false } ,
710  { CSubSource::eSubtype_segment, false } ,
711  { CSubSource::eSubtype_subclone, false } ,
712  { CSubSource::eSubtype_other, false } ,
713  { COrgMod::eSubtype_other, true } ,
714 };
715 
716 
// Number of rows in s_PreferredList, computed from the array size.
717 static const size_t kNumPreferred = sizeof(s_PreferredList) / sizeof (SPreferredQual);
718 
720 {
721  size_t k;
722 
723  // first, set up modifier list with blanks
724  modifier_list.clear();
725 
726  for (k = 0; k < kNumPreferred; k++) {
727  if (s_PreferredList[k].is_orgmod) {
728  modifier_list.push_back(CAutoDefAvailableModifier((COrgMod::ESubtype)s_PreferredList[k].subtype, true));
729  } else {
730  modifier_list.push_back(CAutoDefAvailableModifier((CSubSource::ESubtype)s_PreferredList[k].subtype, false));
731  }
732  }
733 
734  for (k = 0; k < m_GroupList.size(); k++) {
735  m_GroupList[k]->GetAvailableModifiers(modifier_list);
736  }
737 }
738 
739 
741 {
742  size_t k;
743  for (k = 0; k < kNumPreferred; k++) {
744  if (!s_PreferredList[k].is_orgmod && (CSubSource::ESubtype)s_PreferredList[k].subtype == subtype) {
745  return true;
746  }
747  }
748  return false;
749 }
750 
751 
753 {
754  size_t k;
755  for (k = 0; k < kNumPreferred; k++) {
756  if (s_PreferredList[k].is_orgmod && (COrgMod::ESubtype)s_PreferredList[k].subtype == subtype) {
757  return true;
758  }
759  }
760  return false;
761 }
762 
763 
// Reports whether a qualifier is required by default.
// Orgmods are never required by default; for subsources the answer depends on
// the subtype.
764 bool CAutoDefModifierCombo::IsModifierRequiredByDefault(bool is_orgmod, int subtype)
765 {
766  bool rval = false;
767 if (is_orgmod) {
768  rval = false;
769 } else {
    // NOTE(review): the first part of this condition is not visible in this
    // listing; only the transgenic test can be seen.
772  subtype == CSubSource::eSubtype_transgenic) {
773  rval = true;
774  } else {
775  rval = false;
776  }
777 }
778 return rval;
779 }
780 
781 
783 {
    // Builds the organism description for one biosource: cleaned-up taxname,
    // then influenza-specific text, then qualifier values in preferred order,
    // then any maxicircle/minicircle note.
    // NOTE(review): two declarations after source_description are not visible
    // in this listing; 'orgmods' and 'subsrcs' used below presumably come from
    // them - confirm.
784  unsigned int k;
785  string source_description;
788  bool no_extras = false;
789 
790  /* start with tax name */
    // The taxname is read without an IsSetOrg()/IsSetTaxname() check here.
791  source_description += bsrc.GetOrg().GetTaxname();
792  x_CleanUpTaxName(source_description);
793 
794  EInfluenzaType influenza_type = GetInfluenzaType(source_description);
795 
796  x_AddRequiredSubSourceModifiers(orgmods, subsrcs, bsrc);
797 
798  x_AddHIVModifiers(orgmods, subsrcs, bsrc);
799  x_AddInfluenzaModifiers(orgmods, subsrcs, influenza_type);
800  x_AddTypeStrainModifiers(orgmods, subsrcs, bsrc);
801 
802  /* should this organism be excluded? */
    // " sp. " triggers the exclusion unless the two characters before it start
    // with "f." (the " sp. " must be at least two characters into the string
    // for that test to apply).
803  if (m_ExcludeSpOrgs) {
804  string::size_type pos = NStr::Find(source_description, " sp. ");
805  if (pos != NCBI_NS_STD::string::npos
806  && (pos < 2 || !NStr::StartsWith(source_description.substr(pos - 2), "f."))) {
807  no_extras = true;
808  // but add plasmid name anyway
    // NOTE(review): the statement inside this 'if' is not visible in this listing.
809  if (subsrcs.find(CSubSource::eSubtype_plasmid_name) == subsrcs.end()) {
811  }
812  }
813  }
814  if (m_ExcludeCfOrgs) {
815  string::size_type pos = NStr::Find(source_description, " cf. ");
816  if (pos != NCBI_NS_STD::string::npos) {
817  no_extras = true;
818  }
819  }
820  if (m_ExcludeNrOrgs) {
821  string::size_type pos = NStr::Find(source_description, " nr. ");
822  if (pos != NCBI_NS_STD::string::npos) {
823  no_extras = true;
824  }
825  }
826  if (m_ExcludeAffOrgs) {
827  string::size_type pos = NStr::Find(source_description, " aff. ");
828  if (pos != NCBI_NS_STD::string::npos) {
829  no_extras = true;
830  }
831  }
832 
    // The requested qualifiers (and the "Mutant " prefix) are applied only when
    // none of the exclusions above fired.
833  if (!no_extras) {
834  if (bsrc.CanGetOrigin() && bsrc.GetOrigin() == CBioSource::eOrigin_mut) {
835  source_description = "Mutant " + source_description;
836  }
837 
838  // add requested orgmods
    // This loop-local k shadows the function-level k declared at the top.
839  for (unsigned int k = 0; k < m_OrgMods.size(); k++) {
840  if (orgmods.find(m_OrgMods[k]) == orgmods.end()) {
841  orgmods.insert(TExtraOrgMod(m_OrgMods[k], true));
842  }
843  }
844 
845  // add requested subsources
846  for (unsigned int k = 0; k < m_SubSources.size(); k++) {
847  if (subsrcs.find(m_SubSources[k]) == subsrcs.end()) {
848  subsrcs.insert(TExtraSubSrc(m_SubSources[k], true));
849  }
850  }
851  }
852 
853  // special handling for influenza
854  if (influenza_type != eNotInfluenza) {
    // NOTE(review): the opening of the condition below is not visible in this listing.
856  (influenza_type == eInfluenzaA && x_BioSourceHasOrgMod(bsrc, COrgMod::eSubtype_serotype)))) {
    // Builds " (<strain>)" and, for influenza A with a non-blank serotype,
    // " (<strain>(<serotype>))". Only the first non-blank strain and serotype
    // are used.
857  string paren = " (";
858  for (auto& it : bsrc.GetOrg().GetOrgname().GetMod()) {
859  if (it->IsSetSubtype() && it->GetSubtype() == COrgMod::eSubtype_strain &&
860  it->IsSetSubname() && !NStr::IsBlank(it->GetSubname())) {
861  paren += it->GetSubname();
862  break;
863  }
864  }
865  if (influenza_type == eInfluenzaA) {
866  string serotype;
867  for (auto& it : bsrc.GetOrg().GetOrgname().GetMod()) {
868  if (it->IsSetSubtype() && it->GetSubtype() == COrgMod::eSubtype_serotype &&
869  it->IsSetSubname() && !NStr::IsBlank(it->GetSubname())) {
870  serotype = it->GetSubname();
871  break;
872  }
873  }
874  if (!NStr::IsBlank(serotype)) {
875  paren += "(" + serotype + ")";
876  }
877  }
878  paren += ")";
    // The parenthetical is skipped when the description already ends with it.
879  if (!NStr::EndsWith(source_description, paren)) {
880  source_description += paren;
881  }
882  }
    // NOTE(review): the condition guarding the next statement is not visible
    // in this listing.
884  source_description += " clone";
885  }
886  x_AddSubsourceString(source_description, bsrc, CSubSource::eSubtype_clone);
887  x_AddSubsourceString(source_description, bsrc, CSubSource::eSubtype_segment);
888  }
889 
    // Remaining qualifiers are appended in s_PreferredList order, skipping
    // those that received the influenza-specific treatment above.
    // NOTE(review): the declarations of 'st' in both branches are not visible
    // in this listing.
890  for (k = 0; k < kNumPreferred; k++) {
891  if (s_PreferredList[k].is_orgmod) {
893  if (orgmods.find(st) != orgmods.end() &&
894  !x_SpecialHandlingForInfluenza(influenza_type, st)) {
895  x_AddOrgModString(source_description, bsrc, st);
896  }
897  } else {
899  if (subsrcs.find(st) != subsrcs.end() &&
900  !x_SpecialHandlingForInfluenza(influenza_type, st)) {
901  x_AddSubsourceString(source_description, bsrc, st);
902  }
903  }
904  }
905 
906  // add maxicircle/minicircle
907  x_AddMinicircle(source_description, bsrc);
908 
909  return source_description;
910 }
911 
912 
// Splits 'note_text' on ';' and appends every piece that mentions
// "maxicircle" or "minicircle" (case-sensitive search) to
// 'source_description', each preceded by a space. Returns true when at least
// one piece was appended.
913 bool CAutoDefModifierCombo::x_AddMinicircle(string& source_description, const string& note_text)
914 {
915  bool any_change = false;
916  vector<CTempString> tokens;
917  NStr::Split(note_text, ";", tokens, NStr::fSplit_Tokenize);
918  ITERATE(vector<CTempString>, t, tokens) {
919  if (NStr::Find(*t, "maxicircle") != string::npos ||
920  NStr::Find(*t, "minicircle") != string::npos) {
921  string add = *t;
    // NOTE(review): one statement between the copy above and the append below
    // is not visible in this listing; presumably it trims 'add' - confirm.
923  source_description += " " + add;
924  any_change = true;
925  }
926  }
927  return any_change;
928 }
929 
930 
// Feeds the text of every "other" subsource and every "other" orgmod of 'bsrc'
// to the string overload of x_AddMinicircle. Returns true when any of those
// calls changed 'source_description'.
931 bool CAutoDefModifierCombo::x_AddMinicircle(string& source_description, const CBioSource& bsrc)
932 {
933  bool any_change = false;
934  if (bsrc.IsSetSubtype()) {
    // NOTE(review): the loop header is not visible in this listing; presumably
    // it iterates bsrc.GetSubtype() - confirm.
936  if ((*it)->IsSetSubtype() && (*it)->IsSetName() &&
937  (*it)->GetSubtype() == CSubSource::eSubtype_other) {
938  any_change |= x_AddMinicircle(source_description, (*it)->GetName());
939  }
940  }
941  }
942  if (bsrc.IsSetOrg() && bsrc.GetOrg().IsSetOrgname() && bsrc.GetOrg().GetOrgname().IsSetMod()) {
943  ITERATE(COrgName::TMod, it, bsrc.GetOrg().GetOrgname().GetMod()) {
944  if ((*it)->IsSetSubtype() && (*it)->IsSetSubname() &&
945  (*it)->GetSubtype() == COrgMod::eSubtype_other) {
946  any_change |= x_AddMinicircle(source_description, (*it)->GetSubname());
947  }
948  }
949  }
950  return any_change;
951 }
952 
953 
955 {
    // Counts the visited groups that contain exactly one source description.
    // NOTE(review): the loop header is not visible in this listing; presumably
    // it walks m_GroupList - confirm.
956  unsigned int num = 0;
957 
959  if ((*it)->GetSrcList().size() == 1) {
960  num++;
961  }
962  }
963  return num;
964 }
965 
966 
968 {
    // Returns the size of the largest source list among the visited groups
    // (0 when no group is visited).
    // NOTE(review): the loop header is not visible in this listing; presumably
    // it walks m_GroupList - confirm.
969  unsigned int num = 0;
970 
972  if ((*it)->GetSrcList().size() > num) {
973  num = (*it)->GetSrcList().size();
974  }
975  }
976  return num;
977 }
978 
979 
980 /* NOTE - we want to sort combos from most unique organisms to least unique organisms */
981 /* secondary sort - most groups to least groups */
982 /* tertiary sort - fewer max orgs in group to most max orgs in group */
983 /* fourth sort - least mods to most mods */
985 {
986  int rval = 0;
987  unsigned int num_this, num_other;
988 
989  num_this = GetNumUnique();
990  num_other = other.GetNumUnique();
991  if (num_this > num_other) {
992  rval = -1;
993  } else if (num_this < num_other) {
994  rval = 1;
995  } else {
996  num_this = m_GroupList.size();
997  num_other = other.GetGroupList().size();
998  if (num_this > num_other) {
999  rval = -1;
1000  } else if (num_this < num_other) {
1001  rval = 1;
1002  } else {
1003  num_this = GetMaxInGroup ();
1004  num_other = other.GetMaxInGroup();
1005  if (num_this < num_other) {
1006  rval = -1;
1007  } else if (num_this > num_other) {
1008  rval = 1;
1009  } else {
1010  num_this = m_Modifiers.size();
1011  num_other = other.GetModifiers().size();
1012  if (num_this < num_other) {
1013  rval = -1;
1014  } else if (num_this > num_other) {
1015  rval = 1;
1016  }
1017  }
1018  }
1019  }
1020  return rval;
1021 }
1022 
1023 
1026 {
1027  return (*s1 < *s2);
1028 }
1029 
1030 
// Applies a qualifier to the visited groups and splits any group whose
// members no longer agree. Returns true when at least one group was split.
// The qualifier is recorded in m_Modifiers and in m_OrgMods / m_SubSources
// when a split happened or when 'even_if_not_uniquifying' is set.
1031 bool CAutoDefModifierCombo::AddQual (bool IsOrgMod, int subtype, bool even_if_not_uniquifying)
1032 {
    // 'added' is never read in the visible code.
1033  bool added = false, rval = false;
1034  vector <CRef<CAutoDefSourceGroup> > new_groups;
1035 
1036  new_groups.clear();
    // NOTE(review): the loop header is not visible in this listing; presumably
    // it walks m_GroupList - confirm.
1038  if ((*it)->AddQual(IsOrgMod, subtype, m_KeepAfterSemicolon)) {
1039  (*it)->SortDescriptions();
1040  auto split = (*it)->SplitGroup();
    // Keep splitting the split-off group until SplitGroup() yields nothing.
1041  while (split) {
1042  rval = true;
1043  new_groups.emplace_back(split);
1044  // further split group if necessary
1045  split = split->SplitGroup();
1046  }
1047  }
1048  }
1049 
1050  // NOTE - need to put groups from non-matching descriptions and put them in a new_groups list
1051  // in order to avoid processing them twice
1052  if (!new_groups.empty()) {
1053  m_GroupList.insert(m_GroupList.end(), new_groups.begin(), new_groups.end());
1054  rval = true;
1055  }
1056 
1057 
1058  if (rval || even_if_not_uniquifying) {
1059  m_Modifiers.push_back (CAutoDefSourceModifierInfo (IsOrgMod, subtype, ""));
    // NOTE(review): one line between the push_back above and the 'if' below is
    // not visible in this listing.
1061  if (IsOrgMod) {
1062  m_OrgMods.push_back ((COrgMod_Base::ESubtype)subtype);
1063  } else {
1064  m_SubSources.push_back ((CSubSource_Base::ESubtype)subtype);
1065  }
1066  }
1067  return rval;
1068 }
1069 
1070 
1071 
// Asks the visited groups to drop a qualifier. Returns true when at least one
// group's RemoveQual reported true.
1072 bool CAutoDefModifierCombo::RemoveQual (bool IsOrgMod, int subtype)
1073 {
1074  bool rval = false;
1075 
    // NOTE(review): the loop header is not visible in this listing; presumably
    // it walks m_GroupList - confirm.
    // The '|=' form means every visited group is called, even after a success.
1077  rval |= (*it)->RemoveQual (IsOrgMod, subtype);
1078  }
1079  return rval;
1080 }
1081 
1082 
// Produces candidate combos by adding, one at a time, each modifier that is
// present for any source of the first multi-source group that yields at least
// one candidate.
1083 vector<CRef<CAutoDefModifierCombo>> CAutoDefModifierCombo::ExpandByAnyPresent()
1084 {
    // NOTE(review): the declaration of 'mods' is not visible in this listing.
1086  vector<CRef<CAutoDefModifierCombo>> expanded;
1087 
1088  expanded.clear();
1089  for (auto it :m_GroupList) {
    // Groups holding exactly one source are skipped.
1090  if (it->GetSrcList().size() == 1) {
1091  continue;
1092  }
1093  mods = it->GetModifiersPresentForAny();
1094  for (auto mod_it : mods) {
    // NOTE(review): the creation of 'cpy' is not visible in this listing;
    // presumably a fresh copy of this combo per modifier - confirm.
    // A copy is kept only when AddQual reports that the modifier split a group.
1096  if (cpy->AddQual(mod_it.IsOrgMod(), mod_it.GetSubtype())) {
1097  expanded.emplace_back(cpy);
1098  }
1099  }
    // Stop at the first group that produced any candidate.
1100  if (!expanded.empty()) {
1101  break;
1102  }
1103  }
1104  return expanded;
1105 }
1106 
1107 
1109 {
    // Collects the feature-clause string of every source in the visited groups
    // and reports whether all of them are distinct. Fewer than two strings
    // count as unique.
1110  vector<string> clauses;
1111 
    // NOTE(review): the outer loop header is not visible in this listing;
    // presumably it walks m_GroupList - confirm.
    // src_list is a by-value copy of the group's source list.
1113  CAutoDefSourceGroup::TSourceDescriptionVector src_list = (*g)->GetSrcList();
1114  CAutoDefSourceGroup::TSourceDescriptionVector::iterator s = src_list.begin();
1115  while (s != src_list.end()) {
1116  clauses.push_back((*s)->GetFeatureClauses());
1117  ++s;
1118  }
1119  }
1120  if (clauses.size() < 2) {
1121  return true;
1122  }
    // After sorting, equal strings are adjacent, so comparing neighbours is
    // enough to detect a duplicate.
1123  sort (clauses.begin(), clauses.end());
1124  bool unique = true;
1125  vector<string>::iterator sit = clauses.begin();
1126  string prev = *sit;
1127  ++sit;
1128  while (sit != clauses.end() && unique) {
1129  if (NStr::Equal(prev, *sit)) {
1130  unique = false;
1131  } else {
1132  prev = *sit;
1133  }
1134  ++sit;
1135  }
1136  return unique;
1137 }
1138 
1139 
static const size_t kNumPreferred
static const SPreferredQual s_PreferredList[]
bool CompareAutoDefSourceGroupByStrings(CRef< CAutoDefSourceGroup > s1, CRef< CAutoDefSourceGroup > s2)
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
static string GetOrgModLabel(COrgMod::ESubtype st)
static string GetSubSourceLabel(CSubSource::ESubtype st)
static bool IsModifierRequiredByDefault(bool is_orgmod, int subtype)
static bool x_BioSourceHasOrgMod(const CBioSource &src, COrgMod::ESubtype subtype)
void x_AddTypeStrainModifiers(TExtraOrgMods &extra_orgmods, TExtraSubSrcs &extra_subsrcs, const CBioSource &bsrc)
bool HasSubSource(CSubSource::ESubtype st)
TGroupListVector m_GroupList
void AddSource(const CBioSource &bs, const string &feature_clauses="")
static bool x_HasTypeStrainComment(const CBioSource &bsrc)
void GetAvailableModifiers(CAutoDefSourceDescription::TAvailableModifierVector &modifier_list)
static bool IsUsableInDefline(CSubSource::ESubtype subtype)
void x_AddHIVModifiers(TExtraOrgMods &extra_orgmods, TExtraSubSrcs &extra_subsrcs, const CBioSource &bsrc)
vector< CRef< CAutoDefSourceGroup > > TGroupListVector
unsigned int GetNumUnique() const
static bool x_AddMinicircle(string &source_description, const CBioSource &bsrc)
CSubSource::ESubtype GetSubSource(unsigned int index)
static bool x_SpecialHandlingForInfluenza(EInfluenzaType influenza_type, CSubSource::ESubtype subtype)
void x_AddRequiredSubSourceModifiers(TExtraOrgMods &extra_orgmods, TExtraSubSrcs &extra_subsrcs, const CBioSource &bsrc)
string x_GetOrgModLabel(COrgMod::ESubtype st)
unsigned int GetMaxInGroup() const
bool HasOrgMod(COrgMod::ESubtype st)
void x_CleanUpTaxName(string &tax_name)
void AddOrgMod(COrgMod::ESubtype st, bool even_if_not_uniquifying=false)
string GetSourceDescriptionString(const CBioSource &bsrc)
bool AddQual(bool IsOrgMod, int subtype, bool even_if_not_uniquifying=false)
void AddSubsource(CSubSource::ESubtype st, bool even_if_not_uniquifying=false)
TOrgModTypeVector m_OrgMods
int Compare(const CAutoDefModifierCombo &other) const
string x_GetSubSourceLabel(CSubSource::ESubtype st)
unsigned int GetNumSubSources()
TSubSourceTypeVector m_SubSources
void InitFromOptions(const CAutoDefOptions &options)
void x_AddInfluenzaModifiers(TExtraOrgMods &extra_orgmods, TExtraSubSrcs &extra_subsrcs, EInfluenzaType influenza_type)
const TGroupListVector & GetGroupList() const
CAutoDefSourceDescription::TModifierVector m_Modifiers
static bool x_BioSourceHasSubSrc(const CBioSource &src, CSubSource::ESubtype subtype)
static EInfluenzaType GetInfluenzaType(const string &taxname)
bool x_AddOrgModString(string &source_description, const CBioSource &bsrc, COrgMod::ESubtype st)
static bool IsModifierInString(const string &find_this, const string &find_in, bool ignore_at_end)
pair< CSubSource::ESubtype, bool > TExtraSubSrc
bool x_AddSubsourceString(string &source_description, const CBioSource &bsrc, CSubSource::ESubtype st)
CAutoDefOptions::EHIVCloneIsolateRule m_HIVCloneIsolateRule
vector< CRef< CAutoDefModifierCombo > > ExpandByAnyPresent()
void InitOptions(CAutoDefOptions &options) const
const CAutoDefSourceDescription::TModifierVector & GetModifiers() const
bool RemoveQual(bool IsOrgMod, int subtype)
pair< COrgMod::ESubtype, bool > TExtraOrgMod
COrgMod::ESubtype GetOrgMod(unsigned int index)
bool GetKeepAfterSemicolon() const
THIVRule GetHIVRule() const
void AddOrgMod(COrgMod::TSubtype subtype)
void SetHIVRule(EHIVCloneIsolateRule rule)
void SetLeaveParenthetical(bool val=true)
bool GetDoNotApplyToSp() const
int GetMaxMods() const
const TOrgMods & GetOrgMods() const
void SetDoNotApplyToAff(bool val=true)
bool GetDoNotApplyToNr() const
void SetMaxMods(int val)
bool GetLeaveParenthetical() const
bool GetAllowModAtEndOfTaxname() const
void SetDoNotApplyToNr(bool val=true)
bool GetDoNotApplyToAff() const
void SetUseLabels(bool val=true)
void SetKeepAfterSemicolon(bool val=true)
vector< COrgMod::TSubtype > TOrgMods
bool GetIncludeCountryText() const
void SetDoNotApplyToCf(bool val=true)
bool GetDoNotApplyToCf() const
void SetAllowModAtEndOfTaxname(bool val=true)
void AddSubSource(CSubSource::TSubtype subtype)
void SetDoNotApplyToSp(bool val=true)
void SetIncludeCountryText(bool val=true)
const TSubSources & GetSubSources() const
bool GetUseLabels() const
vector< CSubSource::TSubtype > TSubSources
vector< CAutoDefSourceModifierInfo > TModifierVector
vector< CAutoDefAvailableModifier > TAvailableModifierVector
vector< CRef< CAutoDefSourceDescription > > TSourceDescriptionVector
CRef –.
Definition: ncbiobj.hpp:618
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3457
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2989
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5429
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3197
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2887
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5411
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5352
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5383
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
static const char label[]
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
Definition: BioSource_.hpp:539
TOrigin GetOrigin(void) const
Get the Origin member data.
Definition: BioSource_.hpp:472
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
list< CRef< CSubSource > > TSubtype
Definition: BioSource_.hpp:145
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: BioSource_.hpp:527
bool CanGetOrigin(void) const
Check if it is safe to call GetOrigin method.
Definition: BioSource_.hpp:453
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
@ eSubtype_endogenous_virus_name
Definition: SubSource_.hpp:109
@ eOrigin_mut
artificially mutagenized
Definition: BioSource_.hpp:132
const TMod & GetMod(void) const
Get the Mod member data.
Definition: OrgName_.hpp:839
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
bool IsSetMod(void) const
Check if a value has been assigned to Mod data member.
Definition: OrgName_.hpp:827
list< CRef< COrgMod > > TMod
Definition: OrgName_.hpp:332
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
bool IsSetTaxname(void) const
Preferred formal name. Check if a value has been assigned to Taxname data member.
Definition: Org_ref_.hpp:360
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
@ eSubtype_biotype
Definition: OrgMod_.hpp:97
@ eSubtype_substrain
Definition: OrgMod_.hpp:86
@ eSubtype_pathovar
Definition: OrgMod_.hpp:94
@ eSubtype_other
ASN5: old-name (254) will be added to next spec.
Definition: OrgMod_.hpp:125
@ eSubtype_sub_species
Definition: OrgMod_.hpp:105
@ eSubtype_nat_host
natural host of this specimen
Definition: OrgMod_.hpp:104
@ eSubtype_cultivar
Definition: OrgMod_.hpp:93
@ eSubtype_variety
Definition: OrgMod_.hpp:89
@ eSubtype_strain
Definition: OrgMod_.hpp:85
@ eSubtype_biovar
Definition: OrgMod_.hpp:96
@ eSubtype_serogroup
Definition: OrgMod_.hpp:91
@ eSubtype_specimen_voucher
Definition: OrgMod_.hpp:106
@ eSubtype_serotype
Definition: OrgMod_.hpp:90
@ eSubtype_chemovar
Definition: OrgMod_.hpp:95
@ eSubtype_serovar
Definition: OrgMod_.hpp:92
@ eSubtype_bio_material
Definition: OrgMod_.hpp:119
@ eSubtype_culture_collection
Definition: OrgMod_.hpp:118
@ eSubtype_ecotype
Definition: OrgMod_.hpp:110
@ eSubtype_forma_specialis
Definition: OrgMod_.hpp:109
@ eSubtype_isolate
Definition: OrgMod_.hpp:100
where both are integers</td>\n<td></td>\n</tr>\n<tr>\n<td>tse</td>\n<td>optional</td>\n<td>String</td>\n<td class=\"description\">TSE option controls what blob is orig
constexpr auto sort(_Init &&init)
EIPRangeType t
Definition: ncbi_localip.c:101
Miscellaneous common-use basic types and functionality.
void split(std::vector< std::string > *strVec, const std::string &str_, const std::string &split_)
#define _ASSERT
int g(Seg_Gsm *spe, Seq_Mtf *psm, Thd_Gsm *tdg)
Definition: thrddgri.c:44
Modified on Fri Dec 01 04:43:02 2023 by modify_doxy.py rev. 669887