NCBI C++ ToolKit
valid_biosource.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: valid_biosource.cpp 102338 2024-04-24 15:51:01Z kans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko, Mati Shomrat, ....
27  *
28  * File Description:
29  * Implementation of private parts of the validator
30  * .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <corelib/ncbiapp.hpp>
38 
43 
44 #include <serial/iterator.hpp>
45 #include <serial/enumvalues.hpp>
46 
47 //#include <objects/seqset/Bioseq_set.hpp>
48 //#include <objects/seqset/Seq_entry.hpp>
49 
50 //#include <objects/seq/Bioseq.hpp>
57 
58 #include <objmgr/seqdesc_ci.hpp>
59 #include <util/sgml_entity.hpp>
60 #include <util/strsearch.hpp>
61 
62 #include <mutex>
63 
64 #define NCBI_USE_ERRCODE_X Objtools_Validator
65 
68 BEGIN_SCOPE(validator)
69 using namespace sequence;
70 
// Message text for an invalid reply from the taxonomy service.
// It is not referenced in the part of the file visible in this listing.
71 static const string kInvalidReplyMsg = "Taxonomy service returned invalid reply";
72 
// File-local owning pointer to a CTextFsa; it starts out empty (default-constructed unique_ptr).
// NOTE(review): the code that builds and uses it is not visible in this listing -- confirm usage.
73 static unique_ptr<CTextFsa> m_SourceQualTags;
74 
75 static bool s_UnbalancedParentheses(string str)
76 {
77  if (NStr::IsBlank(str)) {
78  return false;
79  }
80 
81  int par = 0, bkt = 0;
82  string::iterator it = str.begin();
83  while (it != str.end()) {
84  if (*it == '(') {
85  ++par;
86  } else if (*it == ')') {
87  --par;
88  if (par < 0) {
89  return true;
90  }
91  } else if (*it == '[') {
92  ++bkt;
93  } else if (*it == ']') {
94  --bkt;
95  if (bkt < 0) {
96  return true;
97  }
98  }
99  ++it;
100  }
101  if (par > 0 || bkt > 0) {
102  return true;
103  } else {
104  return false;
105  }
106 }
107 
108 
109 #if 0
// Modified-base abbreviations that s_IsValidPrimerSequence accepts between
// '<' and '>' in a primer sequence.  Order and contents are unchanged.
const char* sm_ValidModifiedPrimerBases[] = {
    "ac4c",    "chm5u",   "cm",      "cmnm5s2u", "cmnm5u",  "d",
    "fm",      "gal q",   "gm",      "i",        "i6a",     "m1a",
    "m1f",     "m1g",     "m1i",     "m22g",     "m2a",     "m2g",
    "m3c",     "m4c",     "m5c",     "m6a",      "m7g",     "mam5u",
    "mam5s2u", "man q",   "mcm5s2u", "mcm5u",    "mo5u",    "ms2i6a",
    "ms2t6a",  "mt6a",    "mv",      "o5u",      "osyw",    "p",
    "q",       "s2c",     "s2t",     "s2u",      "s4u",     "t",
    "t6a",     "tm",      "um",      "yw",       "x",       "OTHER"
};
160 
161 static bool s_IsValidPrimerSequence (string str, char& bad_ch)
162 {
163  bad_ch = 0;
164  if (NStr::IsBlank(str)) {
165  return false;
166  }
167 
168  if (NStr::Find(str, ",") == string::npos) {
169  if (NStr::Find(str, "(") != string::npos
170  || NStr::Find(str, ")") != string::npos) {
171  return false;
172  }
173  } else {
174  if (!NStr::StartsWith(str, "(") || !NStr::EndsWith(str, ")")) {
175  return false;
176  }
177  }
178 
179  if (NStr::Find(str, ";") != string::npos) {
180  return false;
181  }
182 
183  const char* *list_begin = sm_ValidModifiedPrimerBases;
184  const char* *list_end = &(sm_ValidModifiedPrimerBases[sizeof(sm_ValidModifiedPrimerBases) / sizeof(const char*)]);
185 
186  size_t pos = 0;
187  string::iterator sit = str.begin();
188  while (sit != str.end()) {
189  if (*sit == '<') {
190  size_t pos2 = NStr::Find (str, ">", pos + 1);
191  if (pos2 == string::npos) {
192  bad_ch = '<';
193  return false;
194  }
195  string match = str.substr(pos + 1, pos2 - pos - 1);
196  if (find(list_begin, list_end, match) == list_end) {
197  bad_ch = '<';
198  return false;
199  }
200  sit += pos2 - pos + 1;
201  pos = pos2 + 1;
202  } else {
203  if (*sit != '(' && *sit != ')' && *sit != ',' && *sit != ':') {
204  if (!isalpha (*sit)) {
205  bad_ch = *sit;
206  return false;
207  }
208  char ch = toupper(*sit);
209  if (strchr ("ABCDGHKMNRSTVWY", ch) == NULL) {
210  bad_ch = tolower (ch);
211  return false;
212  }
213  }
214  ++sit;
215  ++pos;
216  }
217  }
218 
219  return true;
220 }
221 #endif
222 
223 
// NOTE(review): the line carrying this function's name and parameter list is missing from
// this listing; the body below reads a parameter named `src`.
// Returns true when the source's organism matches any of three tests (taxname, lineage,
// division); returns false otherwise, including when no Org-ref is set.
225 {
226  if (!src.IsSetOrg()) {
227  return false;
228  }
229  const COrg_ref& org = src.GetOrg();
230  if (org.IsSetTaxname()) {
231  const string & taxname = org.GetTaxname();
// Test 1: taxname equals "synthetic construct" or contains "vector" (both case-insensitive).
232  if (NStr::EqualNocase(taxname, "synthetic construct") ||
233  NStr::FindNoCase(taxname, "vector") != string::npos) {
234  return true;
235  }
236  }
237 
// Test 2: lineage mentions "artificial sequences" anywhere (case-insensitive).
238  if (org.IsSetLineage()) {
239  if (NStr::FindNoCase(org.GetLineage(), "artificial sequences") != string::npos) {
240  return true;
241  }
242  }
243 
// Test 3: the OrgName division is "syn" (case-insensitive).
244  if (src.GetOrg().IsSetOrgname() && src.GetOrg().GetOrgname().IsSetDiv()
245  && NStr::EqualNocase(src.GetOrg().GetOrgname().GetDiv(), "syn")) {
246  return true;
247  }
248  return false;
249 }
250 
251 
// NOTE(review): the signature line and the second line of the condition (listing line 255)
// are missing from this listing; only the IsSetOrigin() half of the test is visible.
// Returns true when the (partly missing) condition on `src` holds, false otherwise.
253 {
254  if (src.IsSetOrigin()
256  return true;
257  }
258  return false;
259 }
260 
261 
262 static string x_RepairCountryName (string countryname)
263 {
264  if (NStr::CompareNocase (countryname, "USA: Washington DC") == 0 || NStr::CompareNocase (countryname, "USA: Washington, DC") == 0) {
265  countryname = "USA: District of Columbia";
266  } else if (NStr::StartsWith (countryname, "USA:") && NStr::EndsWith (countryname, ", Puerto Rico")) {
267  countryname = "USA: Puerto Rico";
268  } else if (NStr::StartsWith (countryname, "Puerto Rico")) {
269  countryname = "USA: Puerto Rico";
270  }
271  if (NStr::StartsWith (countryname, "USA: Puerto Rico")) {
272  countryname = countryname.substr(5);
273  }
274 
275  if (NStr::StartsWith (countryname, "China: Hong Kong")) {
276  countryname = countryname.substr(7);
277  }
278 
279  return countryname;
280 }
281 
282 
// NOTE(review): the line naming this function is missing from this listing, as are the
// declarations of `errcode` and `sev` (listing lines 289 and 294) used below.
// Checks a country name against a lat_lon value and posts an error when they disagree.
//   countryname - country text; normalized with x_RepairCountryName before the check
//   lat_lon     - lat_lon text handed to CSubSource::ValidateLatLonCountry
//   obj, ctx    - passed through to PostObjErr
284 (string countryname,
285 string lat_lon,
286 const CSerialObject& obj,
287 const CSeq_entry *ctx)
288 {
290  countryname = x_RepairCountryName (countryname);
291  string error = CSubSource::ValidateLatLonCountry(countryname, lat_lon, IsLatLonCheckState(), errcode);
// A non-blank message means a problem was found; it is reported only when the
// converted error code is a known one.
292  if (!NStr::IsBlank(error)) {
293  EErrType errtype = CValidator::ConvertCode(errcode);
295  if (errtype != eErr_UNKNOWN) {
296  PostObjErr(sev, errtype, error, obj, ctx);
297  }
298  }
299 }
300 
301 
302 /* note - special case for sex because it prevents a different message from being displayed, do not list here */
308 };
309 
311 
312 
// NOTE(review): the signature line is missing from this listing; the body reads a value
// named `subtype`.  The table it scans is also not visible here.
// Linear search: returns true when `subtype` equals one of the first
// sNumUnexpectedViralSubSourceQualifiers entries of sUnexpectedViralSubSourceQualifiers.
314 {
315  int i;
316  bool rval = false;
317 
// The `!rval` term ends the loop at the first match.
318  for (i = 0; i < sNumUnexpectedViralSubSourceQualifiers && !rval; i++) {
319  if (subtype == sUnexpectedViralSubSourceQualifiers[i]) {
320  rval = true;
321  }
322  }
323  return rval;
324 }
325 
330 };
331 
333 
// NOTE(review): the signature line is missing from this listing; the body reads a value
// named `subtype`.  The table it scans is also not visible here.
// Linear search: returns true when `subtype` equals one of the first
// sNumUnexpectedViralOrgModQualifiers entries of sUnexpectedViralOrgModQualifiers.
335 {
336  int i;
337  bool rval = false;
338 
// The `!rval` term ends the loop at the first match.
339  for (i = 0; i < sNumUnexpectedViralOrgModQualifiers && !rval; i++) {
340  if (subtype == sUnexpectedViralOrgModQualifiers[i]) {
341  rval = true;
342  }
343  }
344  return rval;
345 }
346 
347 
// NOTE(review): the signature line is missing from this listing, as are the declaration of
// `genome` (listing line 353) and the `case` labels that precede the two assignments in the
// switch below (listing lines 365-375 and 378).
// Calls SetNotGood(), then sets classification flags from `bsrc` (genome location and
// lineage prefix) and returns *this.
349 {
350  SetNotGood();
351 
352  if (bsrc.IsSetGenome()) {
354  switch (genome) {
355  // case CBioSource::eGenome_genomic: break;
356  // case CBioSource::eGenome_plastid: break;
357  // case CBioSource::eGenome_macronuclear: break;
358  // case CBioSource::eGenome_extrachrom: break;
359  // case CBioSource::eGenome_plasmid: break;
360  // case CBioSource::eGenome_transposon: break;
361  // case CBioSource::eGenome_insertion_seq: break;
362  // case CBioSource::eGenome_proviral: break;
363  // case CBioSource::eGenome_virion: break;
364  // case CBioSource::eGenome_endogenous_virus: break;
// (case labels missing from this listing) these locations set the organelle flag
376  m_organelle = true;
377  break;
// (case label missing from this listing) this location sets the eukaryote flag
379  m_eukaryote = true;
380  break;
381  // case CBioSource::eGenome_chromatophore: break;
382  default:
383  break;
384  }
385  }
386  if (bsrc.IsSetLineage()) {
387  const string& lineage = bsrc.GetLineage();
// The lineage prefix selects at most one kingdom flag; any other lineage sets none here.
388  if (NStr::StartsWith(lineage, "Eukaryota"))
389  m_eukaryote = true;
390  else if (NStr::StartsWith(lineage, "Bacteria"))
391  m_bacteria = true;
392  else if (NStr::StartsWith(lineage, "Archaea"))
393  m_archaea = true;
394  //else
395  // NCBI_ASSERT(0, "Not fully implemented switch statement");
396  }
397 
398  return *this;
399 }
400 
401 static bool s_IsEukaryoteOrProkaryote(const CBioSourceKind& biosourceKind)
402 {
403  return biosourceKind.IsOrganismBacteria() ||
404  biosourceKind.IsOrganismArchaea() ||
405  biosourceKind.IsOrganismEukaryote();
406 }
407 
408 
409 static bool s_ReportUndefinedSpeciesId(const CBioseq& bioseq)
410 {
411  if (bioseq.IsSetId()) {
412  bool all_local_or_gnl = true;
413  for (auto pId : bioseq.GetId()) {
414  switch (pId->Which()) {
415  case CSeq_id::e_Genbank:
416  case CSeq_id::e_Tpd:
417  case CSeq_id::e_Tpe:
418  case CSeq_id::e_Tpg:
419  return true;
420  case CSeq_id::e_Local:
421  case CSeq_id::e_General:
422  break;
423  default:
424  all_local_or_gnl = false;
425  }
426  }
427  return all_local_or_gnl;
428  }
429 
430  return false;
431 }
432 
433 
// Tests the BioSource's genome (location) field; the genome must be set for the result to be true.
// NOTE(review): the second operand of the && (listing line 437) is missing from this
// listing, so the exact comparison cannot be read here.
434 static bool s_IsChromosome(const CBioSource& biosource)
435 {
436  return biosource.IsSetGenome() &&
438 }
439 
440 
441 static bool s_HasWGSTech(const CBioseq& bioseq) {
442 
443  if (!bioseq.IsSetDescr()) {
444  return false;
445  }
446 
447  for (auto pDesc : bioseq.GetDescr().Get()) {
448  if (pDesc->IsMolinfo()) {
449  const auto& molinfo = pDesc->GetMolinfo();
450  return (molinfo.IsSetTech() && molinfo.GetTech() == CMolInfo::eTech_wgs);
451  }
452  }
453  return false;
454 }
455 
// NOTE(review): the signature line is missing from this listing; the body reads a pointer
// named `ctx` and returns a pointer to a Bioseq (or nullptr).
// Picks the Bioseq to inspect from a Seq-entry context:
//   - null context            -> nullptr
//   - entry is a single Bioseq -> that Bioseq
//   - entry is a nuc-prot set  -> the first member Bioseq whose Inst is nucleic acid
//   - anything else            -> nullptr
457 {
458  if (!ctx) {
459  return nullptr;
460  }
461 
462  if (ctx->IsSeq()) {
463  return &(ctx->GetSeq());
464  }
465 
466  if (ctx->IsSet() &&
467  ctx->GetSet().IsSetClass() &&
468  ctx->GetSet().GetClass() == CBioseq_set::eClass_nuc_prot) {
469  const auto& bioseq_set = ctx->GetSet();
470  if (bioseq_set.IsSetSeq_set()) {
// Only direct members that are Bioseqs are considered; nested sets are not descended into.
471  for (const auto& pEntry : bioseq_set.GetSeq_set()) {
472  if (pEntry->IsSeq()) {
473  const auto& bioseq = pEntry->GetSeq();
474  if (bioseq.IsSetInst() &&
475  bioseq.GetInst().IsNa()) {
476  return &bioseq;
477  }
478  }
479  }
480  }
481  }
482  return nullptr;
483 }
484 
486 
487 {
488  if (NStr::IsBlank(str)) {
489  return false;
490  }
491  bool rval = true;
492  ITERATE(string, it, str) {
493  if (!isdigit(*it) && *it != ' ') {
494  rval = false;
495  break;
496  }
497  }
498  return rval;
499 }
500 
// NOTE(review): the line naming this function is missing from this listing, and so are many
// interior lines (mostly `case` labels, some `if` conditions and the opening of PostObjErr
// calls -- the gaps show up as jumps in the listing's line numbers).  The comments added
// here describe only what the visible lines show.
//
// Validates one BioSource and reports problems through PostObjErr.  Visible stages:
//   1. organism presence and taxname-based checks;
//   2. location (genome) checks;
//   3. a pass over the SubSource qualifiers, validating each and counting them per subtype;
//   4. indexer-only chromosome / linkage_group versus location checks;
//   5. duplicate-qualifier and qualifier-combination checks, PCR primer checks;
//   6. country versus lat_lon check;
//   7. lineage- and division-dependent checks;
//   8. OrgMod checks, then ValidateOrgRef and ValidatePCRReactionSet.
//
//   bsrc - the BioSource being validated
//   obj  - object passed through to PostObjErr with every report
//   ctx  - Seq-entry context; it is null-checked before being dereferenced below
502 (const CBioSource& bsrc,
503 const CSerialObject& obj,
504 const CSeq_entry *ctx)
505 {
506  if (!bsrc.CanGetOrg()) {
508  "No organism has been applied to this Bioseq. Other qualifiers may exist.", obj, ctx);
509  return;
510  }
511 
512  const COrg_ref& orgref = bsrc.GetOrg();
513  const bool hasTaxname = orgref.IsSetTaxname();
514 
515  // look at uncultured required modifiers
516  if (hasTaxname) {
517  const string & taxname = orgref.GetTaxname();
518  if (NStr::StartsWith(taxname, "uncultured ", NStr::eNocase)) {
519  bool is_env_sample = false;
521  {
522  if ((*it)->IsSetSubtype() && (*it)->GetSubtype() == CSubSource::eSubtype_environmental_sample) {
523  is_env_sample = true;
524  break;
525  }
526  }
527  if (!is_env_sample) {
529  "Uncultured should also have /environmental_sample",
530  obj, ctx);
531  }
532  } else if (NStr::EqualNocase(taxname, "blank sample")) {
534  "Blank sample should not be associated with any sequences",
535  obj, ctx);
536  }
537  }
538 
539  // validate legal locations.
543  "Transposon and insertion sequence are no longer legal locations",
544  obj, ctx);
545  }
546 
547  if (IsIndexerVersion()
548  && bsrc.IsSetGenome()
551  "INDEXER_ONLY - BioSource location is chromosome",
552  obj, ctx);
553  }
554 
// Coarse taxonomic classification from the lineage prefix; at most one flag ends up set.
555  bool isViral = false, isAnimal = false, isPlant = false,
556  isBacteria = false, isArchaea = false, isFungal = false,
557  isViroid = false;
558  if (bsrc.IsSetLineage()) {
559  string lineage = bsrc.GetLineage();
560  if (NStr::StartsWith(lineage, "Viruses; ", NStr::eNocase) || NStr::EqualNocase(lineage, "Viruses")) {
561  isViral = true;
562  } else if (NStr::StartsWith(lineage, "Eukaryota; Metazoa; ", NStr::eNocase)) {
563  isAnimal = true;
564  } else if (NStr::StartsWith(lineage, "Eukaryota; Viridiplantae; Streptophyta; Embryophyta; ", NStr::eNocase)
565  || NStr::StartsWith(lineage, "Eukaryota; Rhodophyta; ", NStr::eNocase)
566  || NStr::StartsWith(lineage, "Eukaryota; stramenopiles; Phaeophyceae; ", NStr::eNocase)) {
567  isPlant = true;
568  } else if (NStr::StartsWith(lineage, "Bacteria; ", NStr::eNocase)) {
569  isBacteria = true;
570  } else if (NStr::StartsWith(lineage, "Archaea; ", NStr::eNocase)) {
571  isArchaea = true;
572  } else if (NStr::StartsWith(lineage, "Eukaryota; Fungi; ", NStr::eNocase)) {
573  isFungal = true;
574  } else if (NStr::StartsWith(lineage, "Viroids;", NStr::eNocase)) {
575  isViroid = true;
576  }
577  }
578 
// State gathered while walking the SubSource qualifiers: per-subtype occurrence counts plus
// the individual values that later checks need.
579  TCount count;
580  bool chrom_conflict = false;
581  CPCRSetList pcr_set_list;
582  const CSubSource* chromosome = nullptr;
583  const CSubSource* linkage_group = nullptr;
584  string countryname;
585  string lat_lon;
586  double lat_value = 0.0, lon_value = 0.0;
587  bool is_single_cell_amplification = false;
588 
590  {
591  ValidateSubSource(**ssit, obj, ctx, isViral);
592  if (!(*ssit)->IsSetSubtype()) {
593  continue;
594  }
595 
596  if ((*ssit)->IsSetName()) {
597  string str = (*ssit)->GetName();
598  if (NStr::Equal(str, "N/A") || NStr::Equal(str, "Missing")) {
600  "Subsource name should not be " + str,
601  obj, ctx);
602  }
603  }
604 
605  CSubSource::TSubtype subtype = (*ssit)->GetSubtype();
606  count[subtype]++;
607 
// NOTE(review): the `case` labels of this switch are missing from the listing; each group
// of statements below belongs to one subtype.
608  switch (subtype) {
609 
611  countryname = (**ssit).GetName();
612  break;
613 
615  if ((*ssit)->IsSetName()) {
616  lat_lon = (*ssit)->GetName();
// The four boolean results are locals of this scope and are not examined afterwards.
617  bool format_correct = false, lat_in_range = false, lon_in_range = false, precision_correct = false;
618  CSubSource::IsCorrectLatLonFormat(lat_lon, format_correct, precision_correct,
619  lat_in_range, lon_in_range,
620  lat_value, lon_value);
621  }
622  break;
623 
625  if (!(*ssit)->IsSetName() || !CSubSource::IsAltitudeValid((*ssit)->GetName())) {
626  string val;
627  if ((*ssit)->IsSetName()) {
628  val = (*ssit)->GetName();
629  }
631  "'" + val + "' is an invalid altitude value, altitude should be provided in meters",
632  obj, ctx);
633  }
634  break;
635 
// The first value seen is remembered; a later value that differs (case-insensitively)
// marks a conflict.
637  if (chromosome) {
638  if (NStr::CompareNocase((**ssit).GetName(), chromosome->GetName()) != 0) {
639  chrom_conflict = true;
640  }
641  } else {
642  chromosome = ssit->GetPointer();
643  }
644  break;
645 
647  linkage_group = ssit->GetPointer();
648  break;
649 
651  if ((*ssit)->IsSetName()) {
652  pcr_set_list.AddFwdName((*ssit)->GetName());
653  }
654  break;
655 
657  if ((*ssit)->IsSetName()) {
658  pcr_set_list.AddRevName((*ssit)->GetName());
659  }
660  break;
661 
663  if ((*ssit)->IsSetName()) {
664  pcr_set_list.AddFwdSeq((*ssit)->GetName());
665  }
666  break;
667 
669  if ((*ssit)->IsSetName()) {
670  pcr_set_list.AddRevSeq((*ssit)->GetName());
671  }
672  break;
673 
675  {
// Severity is raised from warning to error when both IsGpipe() and IsGenomic() hold.
676  EDiagSev sev = eDiag_Warning;
677  if (IsGpipe() && IsGenomic()) {
678  sev = eDiag_Error;
679  }
680  if (isAnimal || isPlant) {
681  /* always allow /sex, but now check values */
682  const string str = (*ssit)->GetName();
685  "Invalid value (" + str + ") for /sex qualifier", obj, ctx);
686  }
687  } else if (isViral) {
688  PostObjErr(sev, eErr_SEQ_DESCR_InvalidSexQualifier,
689  "Virus has unexpected Sex qualifier", obj, ctx);
690  } else if (isBacteria || isArchaea || isFungal) {
691  PostObjErr(sev, eErr_SEQ_DESCR_InvalidSexQualifier,
692  "Unexpected use of /sex qualifier", obj, ctx);
693  } else {
694  const string str = (*ssit)->GetName();
696  // otherwise values are restricted to specific list
698  "Invalid value (" + str + ") for /sex qualifier", obj, ctx);
699  }
700  }
701  }
702  break;
703 
705  if (isAnimal || isPlant || isViral) {
707  "Unexpected use of /mating_type qualifier", obj, ctx);
708  } else if (CSubSource::IsValidSexQualifierValue((*ssit)->GetName())) {
709  // complain if one of the values that should go in /sex
711  "Unexpected use of /mating_type qualifier", obj, ctx);
712  }
713  break;
714 
716  if (!bsrc.IsSetGenome() || bsrc.GetGenome() != CBioSource::eGenome_plasmid) {
718  "Plasmid subsource but not plasmid location", obj, ctx);
719  }
720  break;
721 
723  {
724  if ((*ssit)->IsSetName()) {
726  if (bsrc.IsSetGenome()) {
727  genome = bsrc.GetGenome();
728  }
729 
730  const string& subname = ((*ssit)->GetName());
// NOTE(review): the line computing genome_from_name (listing line 731) is missing.
732  if (genome_from_name == CBioSource::eGenome_chloroplast
733  || genome_from_name == CBioSource::eGenome_chromoplast
734  || genome_from_name == CBioSource::eGenome_kinetoplast
735  || genome_from_name == CBioSource::eGenome_plastid
736  || genome_from_name == CBioSource::eGenome_apicoplast
737  || genome_from_name == CBioSource::eGenome_leucoplast
738  || genome_from_name == CBioSource::eGenome_proplastid
739  || genome_from_name == CBioSource::eGenome_chromatophore) {
740  if (genome_from_name != genome) {
741  string val_name = CBioSource::GetOrganelleByGenome(genome_from_name);
// Drops the 8-character "plastid:" prefix from the organelle name used in the message.
742  if (NStr::StartsWith(val_name, "plastid:")) {
743  val_name = val_name.substr(8);
744  }
746  "Plastid name subsource " + val_name + " but not " + val_name + " location", obj, ctx);
747  }
748  } else {
750  "Plastid name subsource contains unrecognized value", obj, ctx);
751  }
752  }
753  }
754  break;
755 
757  if ((*ssit)->IsSetName() && hasTaxname) {
758  string warning = CSubSource::CheckCellLine((*ssit)->GetName(), orgref.GetTaxname());
759  if (!NStr::IsBlank(warning)) {
761  warning, obj, ctx);
762  }
763  }
764  break;
766  if (isBacteria) {
768  "Tissue-type is inappropriate for bacteria", obj, ctx);
769  } else if (isViroid) {
771  "Viroid has unexpected tissue-type qualifier", obj, ctx);
772  }
773  break;
775  {
// Recognizes three phrasings: "single cell amplified...", "a few single cells amplified...",
// or digits/spaces followed by "single cells amplified".
776  if ((*ssit)->IsSetName()) {
777  const string& subname = ((*ssit)->GetName());
778  if (NStr::StartsWith(subname, "single cell amplified") || NStr::StartsWith(subname, "a few single cells amplified")) {
779  is_single_cell_amplification = true;
780  } else {
781  size_t pos = NStr::Find(subname, "single cells amplified");
782  if (pos != NPOS) {
783  string num = subname.substr(0, pos);
784  if (s_IsAllDigitsOrSpaces(num)) {
785  is_single_cell_amplification = true;
786  }
787  }
788  }
789  }
790  }
791  break;
792 
793  }
794 
795  if (isViral && IsUnexpectedViralSubSourceQualifier(subtype)) {
796  string subname = CSubSource::GetSubtypeName(subtype);
// Capitalizes the first letter of the subtype name for the message.
797  if (subname.length() > 0) {
798  subname[0] = toupper(subname[0]);
799  }
800  PostObjErr(eDiag_Warning,
802  "Virus has unexpected " + subname + " qualifier", obj, ctx);
803  }
804  }
805 
// Indexer-only check on WGS records: a chromosome qualifier is present but the location is
// not chromosome.  Suppressed by /map="unlocalized", or by a chromosome value of "Unknown"
// on a Bioseq carrying a text Seq-id whose accession is 8 characters long.
806  if (IsIndexerVersion() && ctx && CValidError_bioseq::IsWGS(*ctx) &&
807  chromosome && (!bsrc.IsSetGenome() || bsrc.GetGenome() != CBioSource::eGenome_chromosome)) {
808  // exception for /map="unlocalized"
809  bool suppress = false;
810  for (auto& it : bsrc.GetSubtype()) {
811  if (it->IsSetSubtype() && it->GetSubtype() == CSubSource::eSubtype_map &&
812  it->IsSetName() && NStr::Equal(it->GetName(), "unlocalized")) {
813  suppress = true;
814  break;
815  }
816  }
817  if (!suppress) {
818  if (chromosome->IsSetName() && NStr::EqualNocase(chromosome->GetName(), "Unknown")) {
819  const CSeq_entry& entry = *ctx;
820  if (entry.IsSeq()) {
821  const CBioseq& bsp = entry.GetSeq();
822  FOR_EACH_SEQID_ON_BIOSEQ(itr, bsp) {
823  const CSeq_id& sid = **itr;
824  switch (sid.Which()) {
825  case CSeq_id::e_Genbank:
826  case CSeq_id::e_Embl:
827  case CSeq_id::e_Ddbj:
828  case CSeq_id::e_Tpg:
829  case CSeq_id::e_Tpe:
830  case CSeq_id::e_Tpd:
831  {
832  const CTextseq_id* tsid = sid.GetTextseq_Id();
833  // need to check accession format
834  if (tsid && tsid->IsSetAccession()) {
835  const string& acc = tsid->GetAccession();
836  if (acc.length() == 8) {
837  suppress = true;
838  }
839  }
840  }
841  break;
842 
843  default:
844  break;
845  }
846  }
847  }
848  }
849  }
850  if (!suppress) {
851  string msg = "INDEXER_ONLY - source contains chromosome value '";
852  if (chromosome->IsSetName()) {
853  msg += chromosome->GetName();
854  }
855  msg += "' but the BioSource location is not set to chromosome";
857  msg, obj, ctx);
858  }
859  }
860 
// Same check and same suppression rules as the block above, applied to linkage_group.
861  if (IsIndexerVersion() && ctx && CValidError_bioseq::IsWGS(*ctx) &&
862  linkage_group && (!bsrc.IsSetGenome() || bsrc.GetGenome() != CBioSource::eGenome_chromosome)) {
863  // exception for /map="unlocalized"
864  bool suppress = false;
865  for (auto& it : bsrc.GetSubtype()) {
866  if (it->IsSetSubtype() && it->GetSubtype() == CSubSource::eSubtype_map &&
867  it->IsSetName() && NStr::Equal(it->GetName(), "unlocalized")) {
868  suppress = true;
869  break;
870  }
871  }
872  if (!suppress) {
873  if (linkage_group->IsSetName() && NStr::EqualNocase(linkage_group->GetName(), "Unknown")) {
874  const CSeq_entry& entry = *ctx;
875  if (entry.IsSeq()) {
876  const CBioseq& bsp = entry.GetSeq();
877  FOR_EACH_SEQID_ON_BIOSEQ(itr, bsp) {
878  const CSeq_id& sid = **itr;
879  switch (sid.Which()) {
880  case CSeq_id::e_Genbank:
881  case CSeq_id::e_Embl:
882  case CSeq_id::e_Ddbj:
883  case CSeq_id::e_Tpg:
884  case CSeq_id::e_Tpe:
885  case CSeq_id::e_Tpd:
886  {
887  const CTextseq_id* tsid = sid.GetTextseq_Id();
888  // need to check accession format
889  if (tsid && tsid->IsSetAccession()) {
890  const string& acc = tsid->GetAccession();
891  if (acc.length() == 8) {
892  suppress = true;
893  }
894  }
895  }
896  break;
897 
898  default:
899  break;
900  }
901  }
902  }
903  }
904  }
905  if (!suppress) {
906  string msg = "INDEXER_ONLY - source contains linkage_group value '";
907  if (linkage_group->IsSetName()) {
908  msg += linkage_group->GetName();
909  }
910  msg += "' but the BioSource location is not set to chromosome";
912  msg, obj, ctx);
913  }
914  }
915 
// Warn about every subtype that occurs more than once unless multiple values are allowed.
// NOTE(review): the `case` labels of the switch below are missing from the listing.
916  ITERATE(TCount, it, count)
917  {
918  if (it->second <= 1) continue;
919  if (CSubSource::IsMultipleValuesAllowed(it->first)) continue;
920  string qual = "***";
921  switch (it->first) {
923  qual = chrom_conflict ? "conflicting chromosome" : "identical chromosome"; break;
925  qual = "germline"; break;
927  qual = "rearranged"; break;
929  qual = "plasmid_name"; break;
931  qual = "segment"; break;
933  qual = "country"; break;
935  qual = "transgenic"; break;
937  qual = "environmental_sample"; break;
939  qual = "lat_lon"; break;
941  qual = "collection_date"; break;
943  qual = "collected_by"; break;
945  qual = "identified_by"; break;
947  qual = "fwd_primer_seq"; break;
949  qual = "rev_primer_seq"; break;
951  qual = "fwd_primer_name"; break;
953  qual = "rev_primer_name"; break;
955  qual = "metagenomic"; break;
957  qual = "altitude"; break;
958  default:
959  qual = CSubSource::GetSubtypeName(it->first);
960  break;
961  }
962  PostObjErr(eDiag_Warning, eErr_SEQ_DESCR_MultipleSourceQualifiers, "Multiple " + qual + " qualifiers present", obj, ctx);
963  }
964 
// Qualifier-combination checks.  NOTE(review): the `if` conditions and PostObjErr openings
// of the next five reports are missing from the listing; only the messages are visible.
967  "Germline and rearranged should not both be present", obj, ctx);
968  }
971  "Transgenic and environmental sample should not both be present", obj, ctx);
972  }
975  "Metagenomic should also have environmental sample annotated", obj, ctx);
976  }
979  "Sex and mating type should not both be present", obj, ctx);
980  }
982  EDiagSev sev = eDiag_Warning;
983  if (m_genomeSubmission) {
984  sev = eDiag_Error;
985  }
986  PostObjErr(sev, eErr_SEQ_DESCR_MissingPlasmidName,
987  "Plasmid location set but plasmid name missing. Add a plasmid source modifier with the plasmid name. Use unnamed if the name is not known.",
988  obj, ctx);
989  }
990 
// The casts to bool turn the two counts into "present / absent", so the != fires when
// exactly one of forward / reverse primer sequences is present.
991  if (static_cast<bool>(count[CSubSource::eSubtype_fwd_primer_seq]) != static_cast<bool>(count[CSubSource::eSubtype_rev_primer_seq]) &&
994  // if there are forward primers then there should also be reverse primers, and vice versa,
995  // but ignore this if there are primer names of either flavor
997  "PCR primer does not have both sequences", obj, ctx);
998  }
999 
1000  bool has_duplicate_primers = false;
1001  if (!pcr_set_list.AreSetsUnique()) {
1002  has_duplicate_primers = true;
1003  }
1005  has_duplicate_primers = true;
1006  }
1007 
1008  if (has_duplicate_primers) {
1010  "PCR primer sequence has duplicates", obj, ctx);
1011  }
1012 
1013  // check that country and lat_lon are compatible
1014  ValidateLatLonCountry(countryname, lat_lon, obj, ctx);
1015 
1016  // validates orgref in the context of lineage
1017  if (!orgref.IsSetOrgname() ||
1018  !orgref.GetOrgname().IsSetLineage() ||
1019  NStr::IsBlank(orgref.GetOrgname().GetLineage())) {
1020 
1021  if (!IsSeqSubmitParent() && IsIndexerVersion()) {
// Default severity is error; critical for RefSeq with a "taxon" db_xref; the EMBL/DDBJ
// test comes last and therefore overrides both with warning.
1022  EDiagSev sev = eDiag_Error;
1023 
1024  if (IsRefSeq()) {
1025  FOR_EACH_DBXREF_ON_ORGREF(it, orgref)
1026  {
1027  if ((*it)->IsSetDb() && NStr::EqualNocase((*it)->GetDb(), "taxon")) {
1028  sev = eDiag_Critical;
1029  break;
1030  }
1031  }
1032  }
1033  if (IsEmbl() || IsDdbj()) {
1034  sev = eDiag_Warning;
1035  }
1036  if (!IsWP()) {
1037  PostObjErr(sev, eErr_SEQ_DESCR_MissingLineage,
1038  "No lineage for this BioSource.", obj, ctx);
1039  }
1040  }
1041  } else {
1042  const COrgName& orgname = orgref.GetOrgname();
1043  const string& lineage = orgname.GetLineage();
// Certain locations are only legal for particular lineages.
// NOTE(review): the first `if` of this chain (listing line 1044) is missing.
1045  if (lineage.find("Kinetoplastida") == string::npos && lineage.find("Kinetoplastea") == string::npos) {
1047  "Only Kinetoplastida have kinetoplasts", obj, ctx);
1048  }
1049  } else if (bsrc.GetGenome() == CBioSource::eGenome_nucleomorph) {
1050  if (lineage.find("Chlorarachniophyceae") == string::npos &&
1051  lineage.find("Cryptophyceae") == string::npos) {
1052  // RW-1807 Cryptophyta changed to Cryptophyceae
1054  "Only Chlorarachniophyceae and Cryptophyceae have nucleomorphs", obj, ctx);
1055  }
1056  } else if (bsrc.GetGenome() == CBioSource::eGenome_macronuclear) {
1057  if (lineage.find("Ciliophora") == string::npos) {
1059  "Only Ciliophora have macronuclear locations", obj, ctx);
1060  }
1061  }
1062 
1063  if (orgname.IsSetDiv()) {
1064  const string& div = orgname.GetDiv();
// NOTE(review): the remainder of this condition (listing lines 1066-1069) is missing.
1065  if ((NStr::EqualCase(div, "BCT") || NStr::EqualCase(div, "VRL"))
1070  if (NStr::EqualCase(div, "BCT") && bsrc.GetGenome() == CBioSource::eGenome_extrachrom) {
1071  // it's ok
1072  } else if (NStr::EqualCase(div, "VRL") && bsrc.GetGenome() == CBioSource::eGenome_proviral) {
1073  // it's ok
1074  } else {
1076  "Bacterial or viral source should not have organelle location",
1077  obj, ctx);
1078  }
1079  } else if (NStr::EqualCase(div, "ENV") && !count[CSubSource::eSubtype_environmental_sample]) {
1081  "BioSource with ENV division is missing environmental sample subsource",
1082  obj, ctx);
1083  }
1084  }
1085 
1086  if (!count[CSubSource::eSubtype_metagenomic] && NStr::FindNoCase(lineage, "metagenomes") != string::npos) {
1088  "If metagenomes appears in lineage, BioSource should have metagenomic qualifier",
1089  obj, ctx);
1090  }
1091 
1092  }
1093 
1094  // look for conflicts in orgmods, also look for unexpected viral qualifiers
1095  bool specific_host = false;
1096  if (orgref.IsSetOrgMod()) {
1097  for (auto it : orgref.GetOrgname().GetMod())
1098  {
1099  if (!it->IsSetSubtype()) {
1100  continue;
1101  }
1102  COrgMod::TSubtype subtype = it->GetSubtype();
1103 
1104  if (subtype == COrgMod::eSubtype_nat_host) {
1105  specific_host = true;
1106  }
1107  else if (subtype == COrgMod::eSubtype_strain) {
1109  PostObjErr(eDiag_Error, eErr_SEQ_DESCR_StrainWithEnvironSample, "Strain should not be present in an environmental sample",
1110  obj, ctx);
1111  }
1112  }
1113  else if (subtype == COrgMod::eSubtype_metagenome_source) {
1114  if (!count[CSubSource::eSubtype_metagenomic]) {
1115  PostObjErr(eDiag_Error, eErr_SEQ_DESCR_MissingMetagenomicQualifier, "Metagenome source should also have metagenomic qualifier",
1116  obj, ctx);
1117  }
1118  }
1119  if (isViral && IsUnexpectedViralOrgModQualifier(subtype)) {
1120  string subname = COrgMod::GetSubtypeName(subtype);
1121  if (subname.length() > 0) {
1122  subname[0] = toupper(subname[0]);
1123  }
1125  "Virus has unexpected " + subname + " qualifier", obj, ctx);
1126  }
1127  }
1128  }
1129  if (count[CSubSource::eSubtype_environmental_sample] && !count[CSubSource::eSubtype_isolation_source] && !specific_host) {
1131  "Environmental sample should also have isolation source or specific host annotated",
1132  obj, ctx);
1133  }
1134 
// Stores the classification of this source in m_biosource_kind; it is read again just below.
1135  m_biosource_kind = bsrc;
1136 
// pBioseq is assigned inside the condition.  Because && short-circuits, the later
// dereferences of *pBioseq are reached only when the assignment produced a non-null pointer.
// When IsGenomeSubmission() is true the Bioseq-based tests are skipped entirely.
1137  const CBioseq* pBioseq=nullptr;
1138  const bool checkForUndefinedSpecies = hasTaxname &&
1139  (IsGenomeSubmission() ||
1140  (((pBioseq = s_GetNucSeqFromContext(ctx)) && s_ReportUndefinedSpeciesId(*pBioseq)) &&
1141  (s_IsChromosome(bsrc) || s_HasWGSTech(*pBioseq)) &&
1142  s_IsEukaryoteOrProkaryote(m_biosource_kind)));
1143 
1144  ValidateOrgRef(orgref, obj, ctx, checkForUndefinedSpecies, is_single_cell_amplification);
1145  if (bsrc.IsSetPcr_primers()) {
1146  ValidatePCRReactionSet(bsrc.GetPcr_primers(), obj, ctx);
1147  }
1148 
1149 }
1150 
1151 
// NOTE(review): the line naming this function and the opening of the PostObjErr call
// (listing lines 1152 and 1164) are missing from this listing.
// Builds the "primer sequence format is incorrect" message for one primer and reports it.
//   primer_kind - word inserted into the message after "PCR " (callers pass "forward" / "reverse")
//   badch       - first offending character; taken by value so it can be replaced locally
//   obj, ctx    - passed through to the report call
1153 (const string& primer_kind,
1154 char badch,
1155 const CSerialObject& obj,
1156 const CSeq_entry *ctx)
1157 {
// Anything outside printable ASCII (' ' .. '~') is shown as '?' in the message.
1158  if (badch < ' ' || badch > '~') {
1159  badch = '?';
1160  }
1161  string msg = "PCR " + primer_kind + " primer sequence format is incorrect, first bad character is '";
1162  msg += badch;
1163  msg += "'";
1165  msg, obj, ctx);
1166 }
1167 
1168 
// NOTE(review): the line naming this function and the opening of the PostObjErr call
// (listing lines 1169 and 1182) are missing from this listing.
// Runs two checks on a single PCR primer:
//   1. a primer sequence that CPCRPrimerSeq::IsValid rejects is reported through
//      x_ReportPCRSeqProblem together with the offending character;
//   2. a primer name longer than 10 characters that itself passes CPCRPrimerSeq::IsValid
//      is reported as looking like a sequence.
//   primer_kind - word used in the messages (callers pass "forward" / "reverse")
//   obj, ctx    - passed through to the report calls
1170 (const CPCRPrimer& primer,
1171 const string& primer_kind,
1172 const CSerialObject& obj,
1173 const CSeq_entry *ctx)
1174 {
1175  char badch = 0;
1176  if (primer.IsSetSeq() && !CPCRPrimerSeq::IsValid(primer.GetSeq(), badch)) {
1177  x_ReportPCRSeqProblem(primer_kind, badch, obj, ctx);
1178  }
// Reset before reuse; the value written by the name check is not used afterwards.
1179  badch = 0;
1180  if (primer.IsSetName() && primer.GetName().Get().length() > 10
1181  && CPCRPrimerSeq::IsValid(primer.GetName(), badch)) {
1183  "PCR " + primer_kind + " primer name appears to be a sequence",
1184  obj, ctx);
1185  }
1186 
1187 }
1188 
1189 
// NOTE(review): the line naming this function (listing line 1190) is missing from this listing.
// Walks every reaction in the set and checks each forward and each reverse primer with
// x_CheckPCRPrimer, labelling them "forward" and "reverse" respectively.
//   pcrset   - the PCR reaction set to check
//   obj, ctx - passed through to x_CheckPCRPrimer
// NOTE(review): the loops bind their elements with plain `auto`, which copies the element
// handle on each iteration; `const auto&` would avoid the copy -- consider changing.
1191 (const CPCRReactionSet& pcrset,
1192 const CSerialObject& obj,
1193 const CSeq_entry *ctx)
1194 {
1195  for (auto it : pcrset.Get())
1196  {
1197  if (it->IsSetForward()) {
1198  for (auto pit : it->GetForward().Get())
1199  {
1200  x_CheckPCRPrimer(*pit, "forward", obj, ctx);
1201  }
1202  }
1203  if (it->IsSetReverse()) {
1204  for (auto pit : it->GetReverse().Get())
1205  {
1206  x_CheckPCRPrimer(*pit, "reverse", obj, ctx);
1207  }
1208  }
1209  }
1210 
1211 }
1212 
1213 
// Validates one SubSource qualifier.
//  1. If the subtype is unset, a diagnostic ("Unknown subsource subtype 0") is posted and the
//     function returns.
//  2. The taxname is pulled from obj when obj is a CSeqdesc holding a source or a CSeq_feat
//     holding a biosrc; it is only passed on to the chromosome / linkage-group / plasmid name checks.
//  3. A switch on the subtype runs the subtype-specific checks.
//  4. Finally, for any subsource with a name: either the "should not have descriptive text"
//     check (when CSubSource::NeedsNoText(subtype)) or the parentheses / SGML checks.
// isViral is consulted only by the segment check.
// NOTE(review): in this listing the function-name line, most `case` labels and the heads of
// the PostObjErr calls are not visible; the per-case notes below are derived from the
// message texts and helper names that are visible.
1215 (const CSubSource& subsrc,
1216 const CSerialObject& obj,
1217 const CSeq_entry *ctx,
1218 const bool isViral)
1219 {
1220  if (!subsrc.IsSetSubtype()) {
1222  "Unknown subsource subtype 0", obj, ctx);
1223  return;
1224  }
1225 
1226  // get taxname from object
1227  string taxname;
1228  if (obj.GetThisTypeInfo() == CSeqdesc::GetTypeInfo()) {
1229  const CSeqdesc* desc = dynamic_cast <const CSeqdesc*> (&obj);
1230  if (desc && desc->IsSource() && desc->GetSource().IsSetTaxname()) {
1231  taxname = desc->GetSource().GetOrg().GetTaxname();
1232  }
1233  } else if (obj.GetThisTypeInfo() == CSeq_feat::GetTypeInfo()) {
1234  const CSeq_feat* feat = dynamic_cast < const CSeq_feat* > (&obj);
1235  if (feat && feat->IsSetData()) {
1236  const auto& fdata = feat->GetData();
1237  if (fdata.IsBiosrc() && fdata.GetBiosrc().IsSetTaxname()) {
1238  taxname = feat->GetData().GetBiosrc().GetOrg().GetTaxname();
1239  }
1240  }
1241  }
// sname stays empty when the subsource has no name
1242  string sname;
1243  if (subsrc.IsSetName()) {
1244  sname = subsrc.GetName();
1245  }
1246 
1247  CSubSource::TSubtype subtype = subsrc.GetSubtype();
1248  switch (subtype) {
1249 
// Country / geo_loc_name handling: the wording of each message depends on
// CSubSource::NCBI_UseGeoLocNameForCountry().
// NOTE(review): GetName() is read here without an IsSetName() guard - confirm this is safe
// when the name is unset.
1251  {
1252  string countryname = subsrc.GetName();
1253  bool is_miscapitalized = false;
1254  bool use_geo_loc_name = CSubSource::NCBI_UseGeoLocNameForCountry();
1255  if (CCountries::IsValid(countryname, is_miscapitalized)) {
1256  if (is_miscapitalized) {
1257  if (use_geo_loc_name) {
1259  "Bad geo_loc_name capitalization [" + countryname + "]",
1260  obj, ctx);
1261  } else {
1263  "Bad country capitalization [" + countryname + "]",
1264  obj, ctx);
1265  }
1266  }
1267  if (NStr::EndsWith(countryname, ":")) {
1268  if (use_geo_loc_name) {
1270  "Colon at end of geo_loc_name [" + countryname + "]", obj, ctx);
1271  } else {
1273  "Colon at end of country name [" + countryname + "]", obj, ctx);
1274  }
1275  }
1276  if (CCountries::WasValid(countryname)) {
1277  if (use_geo_loc_name) {
1279  "Replaced geo_loc_name [" + countryname + "]", obj, ctx);
1280  } else {
1282  "Replaced country name [" + countryname + "]", obj, ctx);
1283  }
1284  }
1285  } else {
// an empty name is shown as "?" in the message
1286  if (countryname.empty()) {
1287  countryname = "?";
1288  }
1289  if (use_geo_loc_name) {
1291  "Bad geo_loc_name [" + countryname + "]", obj, ctx);
1292  } else {
1294  "Bad country name [" + countryname + "]", obj, ctx);
1295  }
1296  }
1297  }
1298  break;
1299 
// lat_lon handling: the value is parsed once as-is; if the format is rejected and the value
// contains a comma, the part before the first comma is parsed again so that "correct format
// followed by extra text" can be reported separately.
1301  if (subsrc.IsSetName()) {
1302  bool format_correct = false, lat_in_range = false, lon_in_range = false, precision_correct = false;
1303  double lat_value = 0.0, lon_value = 0.0;
1304  string lat_lon = subsrc.GetName();
1305  CSubSource::IsCorrectLatLonFormat(lat_lon, format_correct, precision_correct,
1306  lat_in_range, lon_in_range,
1307  lat_value, lon_value);
1308  if (!format_correct) {
1309  size_t pos = NStr::Find(lat_lon, ",");
1310  if (pos != string::npos) {
1311  CSubSource::IsCorrectLatLonFormat(lat_lon.substr(0, pos), format_correct, precision_correct, lat_in_range, lon_in_range, lat_value, lon_value);
1312  if (format_correct) {
1314  "lat_lon format has extra text after correct dd.dd N|S ddd.dd E|W format",
1315  obj, ctx);
1316  }
1317  }
1318  }
1319 
1320  if (!format_correct) {
1322  "lat_lon format is incorrect - should be dd.dd N|S ddd.dd E|W",
1323  obj, ctx);
1324  } else {
1325  if (!lat_in_range) {
1327  "latitude value is out of range - should be between 90.00 N and 90.00 S",
1328  obj, ctx);
1329  }
1330  if (!lon_in_range) {
1332  "longitude value is out of range - should be between 180.00 E and 180.00 W",
1333  obj, ctx);
1334  }
// the precision report is deliberately disabled (commented out below), so this branch is a no-op
1335  if (!precision_correct) {
1336  /*
1337  PostObjErr (eDiag_Info, eErr_SEQ_DESCR_LatLonPrecision,
1338  "lat_lon precision is incorrect - should only have two digits to the right of the decimal point",
1339  obj, ctx);
1340  */
1341  }
1342  }
1343  }
1344  break;
1345 
// Primer-name handling (this case and the next one are textually identical): a name longer
// than 10 characters that passes CPCRPrimerSeq::IsValid is reported as looking like a sequence.
1347  if (subsrc.IsSetName()) {
1348  string name = subsrc.GetName();
1349  char bad_ch;
1350  if (name.length() > 10
1351  && CPCRPrimerSeq::IsValid(name, bad_ch)) {
1353  "PCR primer name appears to be a sequence",
1354  obj, ctx);
1355  }
1356  }
1357  break;
1358 
1360  if (subsrc.IsSetName()) {
1361  string name = subsrc.GetName();
1362  char bad_ch;
1363  if (name.length() > 10
1364  && CPCRPrimerSeq::IsValid(name, bad_ch)) {
1366  "PCR primer name appears to be a sequence",
1367  obj, ctx);
1368  }
1369  }
1370  break;
1371 
// Primer-sequence handling: a missing name or a value rejected by CPCRPrimerSeq::IsValid is
// reported as a "forward" sequence problem (bad_ch stays 0 when the name is missing).
1373  {
1374  char bad_ch = 0;
1375  if (!subsrc.IsSetName() || !CPCRPrimerSeq::IsValid(subsrc.GetName(), bad_ch)) {
1376  x_ReportPCRSeqProblem("forward", bad_ch, obj, ctx);
1377  }
1378  }
1379  break;
1380 
// Same check as above, reported as a "reverse" sequence problem.
1382  {
1383  char bad_ch = 0;
1384  if (!subsrc.IsSetName() || !CPCRPrimerSeq::IsValid(subsrc.GetName(), bad_ch)) {
1385  x_ReportPCRSeqProblem("reverse", bad_ch, obj, ctx);
1386  }
1387  }
1388  break;
1389 
1393  "Transposon name and insertion sequence name are no "
1394  "longer legal qualifiers", obj, ctx);
1395  break;
1396 
1397  case 0:
1399  "Unknown subsource subtype 0", obj, ctx);
1400  break;
1401 
// NOTE(review): GetName() is passed on here without an IsSetName() guard - confirm.
1403  ValidateSourceQualTags(subsrc.GetName(), obj, ctx);
1404  break;
1405 
// The following cases do nothing in the visible code (their labels are not visible here).
1407  break;
1408 
1410  break;
1411 
1413  break;
1414 
1416  break;
1417 
1419  break;
1420 
1422  break;
1423 
1425  break;
1426 
1428  break;
1429 
// Chromosome, linkage-group and plasmid names share one message text; each is checked by its
// own CSubSource helper using sname and the taxname collected above.
1431  if (!CSubSource::IsChromosomeNameValid(sname, taxname)) {
1433  "Problematic plasmid/chromosome/linkage group name '" + sname + "'",
1434  obj, ctx);
1435  }
1436  break;
1438  if (!CSubSource::IsLinkageGroupNameValid(sname, taxname)) {
1440  "Problematic plasmid/chromosome/linkage group name '" + sname + "'",
1441  obj, ctx);
1442  }
1443  break;
1445  if (!CSubSource::IsPlasmidNameValid(sname, taxname)) {
1447  "Problematic plasmid/chromosome/linkage group name '" + sname + "'",
1448  obj, ctx);
1449  }
1450  break;
// Segment handling: value check plus the only use of isViral in this function.
1452  if (!CSubSource::IsSegmentValid(sname)) {
1454  CSubSource::GetSubtypeName(subsrc.GetSubtype()) + " value should start with letter or number",
1455  obj, ctx);
1456  }
1457  if ( ! isViral ) {
1459  "Non-viral source feature should not have a segment qualifier",
1460  obj, ctx);
1461  }
1462  break;
1466  CSubSource::GetSubtypeName(subsrc.GetSubtype()) + " value should start with letter or number",
1467  obj, ctx);
1468  }
1469  break;
1470 
1472  break;
1473 
1475  break;
1476 
1478  break;
1479 
// Frequency handling: "0" is accepted silently, "1" is reported, and anything else must be an
// optional leading '0', then '.', then digits only up to the end of the string (a trailing
// '.' with nothing after it is rejected).
// NOTE(review): isdigit() receives a plain char here; a negative char value would be
// undefined behaviour - consider casting to unsigned char.
1481  if (subsrc.IsSetName() && !NStr::IsBlank(subsrc.GetName())) {
1482  const string& frequency = subsrc.GetName();
1483  if (NStr::Equal(frequency, "0")) {
1484  //ignore
1485  } else if (NStr::Equal(frequency, "1")) {
1487  "bad frequency qualifier value " + frequency,
1488  obj, ctx);
1489  } else {
1490  string::const_iterator sit = frequency.begin();
1491  bool bad_frequency = false;
1492  if (*sit == '0') {
1493  ++sit;
1494  }
1495  if (sit != frequency.end() && *sit == '.') {
1496  ++sit;
1497  if (sit == frequency.end()) {
1498  bad_frequency = true;
1499  }
1500  while (sit != frequency.end() && isdigit(*sit)) {
1501  ++sit;
1502  }
1503  if (sit != frequency.end()) {
1504  bad_frequency = true;
1505  }
1506  } else {
1507  bad_frequency = true;
1508  }
1509  if (bad_frequency) {
1511  "bad frequency qualifier value " + frequency,
1512  obj, ctx);
1513  }
1514  }
1515  }
1516  break;
// Collection-date handling: a missing name is reported with a fixed message; otherwise any
// non-blank text returned by CSubSource::GetCollectionDateProblem is posted as a warning.
1518  if (!subsrc.IsSetName()) {
1520  "Collection_date format is not in DD-Mmm-YYYY format",
1521  obj, ctx);
1522  } else {
1523  string problem = CSubSource::GetCollectionDateProblem(subsrc.GetName());
1524  if (!NStr::IsBlank(problem)) {
1525  PostObjErr(eDiag_Warning, eErr_SEQ_DESCR_BadCollectionDate, problem, obj, ctx);
1526  }
1527  }
1528  break;
1529 
1530  default:
1531  break;
1532  }
1533 
// Generic checks that apply to every subtype once a name is present.
1534  if (subsrc.IsSetName()) {
1535  if (CSubSource::NeedsNoText(subtype)) {
1536  if (subsrc.IsSetName() && !NStr::IsBlank(subsrc.GetName())) {
// build the display name: first letter upper-cased, '-' replaced by '_'
1537  string subname = CSubSource::GetSubtypeName(subtype);
1538  if (subname.length() > 0) {
1539  subname[0] = toupper(subname[0]);
1540  }
1541  NStr::ReplaceInPlace(subname, "-", "_");
1543  subname + " qualifier should not have descriptive text",
1544  obj, ctx);
1545  }
1546  } else {
1547  const string& subname = subsrc.GetName();
1550  "Unbalanced parentheses in subsource '" + subname + "'",
1551  obj, ctx);
1552  }
1553  if (ContainsSgml(subname)) {
1555  "subsource " + subname + " has SGML",
1556  obj, ctx);
1557  }
1558  }
1559  }
1560 }
1561 
1562 
1563 static bool s_FindWholeName(const string& taxname, const string& value)
1564 {
1565  if (NStr::IsBlank(taxname) || NStr::IsBlank(value)) {
1566  return false;
1567  }
1568  size_t pos = NStr::Find(taxname, value);
1569  size_t value_len = value.length();
1570  while (pos != string::npos
1571  && (((pos != 0 && isalpha(taxname.c_str()[pos - 1]))
1572  || isalpha(taxname.c_str()[pos + value_len])))) {
1573  pos = NStr::Find(taxname, value, pos + value_len);
1574  }
1575  if (pos == string::npos) {
1576  return false;
1577  } else {
1578  return true;
1579  }
1580 }
1581 
1582 
1583 static bool s_HasMetagenomeSource(const COrg_ref& org)
1584 {
1585  if (!org.IsSetOrgMod()) {
1586  return false;
1587  }
1588  for (auto it : org.GetOrgname().GetMod()) {
1589  if (it->IsSetSubtype() && it->GetSubtype() == COrgMod::eSubtype_metagenome_source) {
1590  return true;
1591  }
1592  }
1593  return false;
1594 }
1595 
1596 
1597 bool CValidError_imp::s_IsSalmonellaGenus(const string& taxname)
1598 {
1599  bool rval = false;
1600  auto pos = NStr::Find(taxname, " ");
1601  if (pos == string::npos) {
1602  if (NStr::EqualNocase(taxname, "Salmonella")) {
1603  rval = true;
1604  }
1605  } else if (pos > 0 && NStr::EqualNocase(taxname.substr(0, pos), "Salmonella")) {
1606  rval = true;
1607  }
1608  return rval;
1609 }
1610 
// Body that unconditionally yields eDiag_Warning.
// NOTE(review): the signature line is not visible in this listing; presumably this is the
// body of x_SalmonellaErrorLevel(), whose result is used as a PostObjErr severity elsewhere
// in this file - confirm.
1612 {
1613  // per RW-1097
1614  return eDiag_Warning;
1615 }
1616 
1617 
1618 static bool s_IsUndefinedSpecies(const string& taxname)
1619 {
1620  if (NStr::EndsWith(taxname, " sp.", NStr::eNocase) ||
1621  NStr::EndsWith(taxname, " sp", NStr::eNocase) ||
1622  NStr::EndsWith(taxname, " (in: Fungi)", NStr::eNocase) ||
1623  NStr::EndsWith(taxname, " (in: Bacteria)", NStr::eNocase) ||
1624  NStr::EndsWith(taxname, " bacterium", NStr::eNocase) ||
1625  NStr::EndsWith(taxname, " archaeon", NStr::eNocase) ||
1626  NStr::EqualNocase(taxname, "bacterium") ||
1627  NStr::EqualNocase(taxname, "archaeon")) {
1628  return true;
1629  }
1630  if (NStr::Find(taxname, "sp. (in: ") != string::npos && NStr::EndsWith(taxname, ")")) {
1631  return true;
1632  }
1633  return false;
1634 }
1635 
1636 
// Validates an Org-ref:
//  - requires a non-empty taxname or a non-empty common name,
//  - optionally (checkForUndefinedSpecies) flags undefined-species taxnames, unless the
//    organism has a metagenome_source modifier or is_single_cell_amplification is set,
//  - checks the taxname for unbalanced parentheses and SGML,
//  - validates the db xrefs and reports a missing taxon ID when IsRequireTaxonID() holds,
//  - delegates to ValidateOrgName, then cross-checks selected OrgMods against the taxname.
// NOTE(review): in this listing the function-name line and the heads of most PostObjErr
// calls are not visible.
1638 (const COrg_ref& orgref,
1639 const CSerialObject& obj,
1640 const CSeq_entry *ctx,
1641 const bool checkForUndefinedSpecies,
1642 const bool is_single_cell_amplification)
1643 {
1644  // Organism must have a name.
1645  if ((!orgref.IsSetTaxname() || orgref.GetTaxname().empty()) &&
1646  (!orgref.IsSetCommon() || orgref.GetCommon().empty())) {
1648  "No organism name included in the source. Other qualifiers may exist.", obj, ctx);
1649  }
1650 
1651  string taxname;
// NOTE(review): lineage is filled in here but not referenced again in the visible part of
// this function.
1652  string lineage;
1653  if (orgref.IsSetOrgname() && orgref.GetOrgname().IsSetLineage()) {
1654  lineage = orgref.GetOrgname().GetLineage();
1655  }
1656 
1657  if (orgref.IsSetTaxname()) {
1658  taxname = orgref.GetTaxname();
1659  if (checkForUndefinedSpecies && !s_HasMetagenomeSource(orgref) && !is_single_cell_amplification)
1660  {
// exemptions: "uncultured ..." names, "Haemoproteus sp.", and anything mentioning a
// symbiont / endosymbiont as a leading or embedded word
1661  if(s_IsUndefinedSpecies(taxname) &&
1662  !NStr::StartsWith(taxname, "uncultured ", NStr::eNocase) &&
1663  !NStr::Equal(taxname, "Haemoproteus sp.", NStr::eNocase) &&
1664  !NStr::StartsWith(taxname, "symbiont ", NStr::eNocase) &&
1665  !NStr::StartsWith(taxname, "endosymbiont ", NStr::eNocase) &&
1666  NStr::FindNoCase(taxname, " symbiont ") == NPOS &&
1667  NStr::FindNoCase(taxname, " endosymbiont ") == NPOS) {
1668 
1670  "Organism '" + taxname + "' is undefined species and does not have a specific identifier.",
1671  obj, ctx);
1672  }
1673  }
1674  if (s_UnbalancedParentheses(taxname)) {
1676  "Unbalanced parentheses in taxname '" + orgref.GetTaxname() + "'", obj, ctx);
1677  }
1678  if (ContainsSgml(taxname)) {
1680  "taxname " + taxname + " has SGML",
1681  obj, ctx);
1682  }
1683 
1684 #if 0
1685  // VR-723: taxname must match orgname.name if present
1686  // commented out for now
1687  if (orgref.IsSetOrgname() && orgref.GetOrgname().IsSetName()) {
1688  ValidateTaxNameOrgname(taxname, orgref.GetOrgname(), obj, ctx);
1689  }
1690 #endif
1691  }
1692 
1693  if (orgref.IsSetDb()) {
1694  ValidateDbxref(orgref.GetDb(), obj, true, ctx);
1695  }
1696 
// has_taxon becomes true when any db xref names the "taxon" database (case-insensitive)
1697  bool has_taxon = false;
1698  FOR_EACH_DBXREF_ON_ORGREF(dbt, orgref)
1699  {
1700  if (NStr::CompareNocase((*dbt)->GetDb(), "taxon") != 0) continue;
1701  has_taxon = true;
1702  }
1703 
// a missing taxon ID is a warning when IsLocalGeneralOnly() holds and an error otherwise
1704  EDiagSev sev = eDiag_Warning;
1705  if (! IsLocalGeneralOnly()) {
1706  sev = eDiag_Error;
1707  }
1708  if (IsRequireTaxonID() && /* IsIndexerVersion() && */ !has_taxon) {
1709  PostObjErr(sev, eErr_SEQ_DESCR_NoTaxonID,
1710  "BioSource is missing taxon ID", obj, ctx);
1711  }
1712 
// everything below needs an OrgName
1713  if (!orgref.IsSetOrgname()) {
1714  return;
1715  }
1716  const COrgName& orgname = orgref.GetOrgname();
1717  ValidateOrgName(orgname, has_taxon, obj, ctx);
1718 
1719  // Look for modifiers in taxname
1720  string taxname_search = taxname;
1721  // skip first two words
// taxname_search ends up empty when the taxname has fewer than three space-separated words
1722  size_t pos = NStr::Find(taxname_search, " ");
1723  if (pos == string::npos) {
1724  taxname_search.clear();
1725  } else {
1726  taxname_search = taxname_search.substr(pos + 1);
1727  NStr::TruncateSpacesInPlace(taxname_search);
1728  pos = NStr::Find(taxname_search, " ");
1729  if (pos == string::npos) {
1730  taxname_search.clear();
1731  } else {
1732  taxname_search = taxname_search.substr(pos + 1);
1733  NStr::TruncateSpacesInPlace(taxname_search);
1734  }
1735  }
1736 
1737  // determine if variety is present and in taxname - if so,
1738  // can ignore missing subspecies
1739  // also look for specimen-voucher (nat-host) if identical to taxname
1740  FOR_EACH_ORGMOD_ON_ORGNAME(it, orgname)
1741  {
1742  if (!(*it)->IsSetSubtype() || !(*it)->IsSetSubname()) {
1743  continue;
1744  }
1745  COrgMod::TSubtype subtype = (*it)->GetSubtype();
1746  const string& subname = (*it)->GetSubname();
// display name for messages: first letter upper-cased, '-' replaced by a space
1747  string orgmod_name = COrgMod::GetSubtypeName(subtype);
1748  if (orgmod_name.length() > 0) {
1749  orgmod_name[0] = toupper(orgmod_name[0]);
1750  }
1751  NStr::ReplaceInPlace(orgmod_name, "-", " ");
1752  if (subtype == COrgMod::eSubtype_sub_species) {
1753  if (!orgref.IsSubspeciesValid(subname)) {
1755  "Subspecies value specified is not found in taxname",
1756  obj, ctx);
1757  }
1758  } else if (subtype == COrgMod::eSubtype_variety) {
1759  if (!orgref.IsVarietyValid(subname)) {
1761  orgmod_name + " value specified is not found in taxname",
1762  obj, ctx);
1763  }
1764  } else if (subtype == COrgMod::eSubtype_forma
1765  || subtype == COrgMod::eSubtype_forma_specialis) {
// forma values are searched as whole words in the taxname minus its first two words
1766  if (!s_FindWholeName(taxname_search, subname)) {
1768  orgmod_name + " value specified is not found in taxname",
1769  obj, ctx);
1770  }
1771  } else if (subtype == COrgMod::eSubtype_nat_host) {
1772  if (NStr::EqualNocase(subname, taxname)) {
1774  "Specific host is identical to taxname",
1775  obj, ctx);
1776  }
1777 
1778  } else if (subtype == COrgMod::eSubtype_serotype) {
1779  // RW-1063
1780  if (s_IsSalmonellaGenus(taxname)) {
1782  "Salmonella organisms should use serovar instead of serotype.",
1783  obj, ctx);
1784  }
1785  } else if (subtype == COrgMod::eSubtype_serovar) {
1786  // RW-1064
1787  if (s_IsSalmonellaGenus(taxname) && NStr::Find(taxname, subname) == string::npos) {
1788  PostObjErr(x_SalmonellaErrorLevel(), eErr_SEQ_DESCR_BadOrgMod,
1789  "Salmonella organism name should contain the serovar value.",
1790  obj, ctx);
1791  }
1792  }
1793  }
1794 }
1795 
1796 
1797 //LCOV_EXCL_START
1798 //per VR-723, the call to this code is commented out
// Compares taxname with the name held in orgname.
// Returns true when taxname equals the flat name produced by orgname.GetFlatName(), or when
// one of the special cases in the switch finds an equal element. On return, mismatch holds
// the last name that was compared (or, when nothing matched in a multi-element special case,
// the name derived from the first element) so that the caller can quote it.
// Returns false immediately, with mismatch empty, when orgname has no name.
// NOTE(review): the two `case` labels of the switch are not visible in this listing; the
// accessors used (GetHybrid / GetPartial) indicate which variants they handle.
1799 static bool s_MatchOrgname(const string& taxname, const COrgName& orgname, string& mismatch)
1800 {
1801  mismatch = kEmptyStr;
1802  bool rval = false;
1803  if (!orgname.IsSetName()) {
1804  return false;
1805  }
1806  orgname.GetFlatName(mismatch);
1807  if (NStr::Equal(taxname, mismatch)) {
1808  return true;
1809  }
1810  // special cases
1811  switch (orgname.GetName().Which()) {
// hybrid variant: recurse into each element until one matches
1813  {
1814  const auto& hybrid = orgname.GetName().GetHybrid().Get();
1815  for (auto it : hybrid) {
1816  if (it->IsSetName() && s_MatchOrgname(taxname, *it, mismatch)) {
1817  rval = true;
1818  break;
1819  }
1820  }
1821  if (!rval && hybrid.size() > 1 &&
1822  hybrid.front()->IsSetName()) {
1823  // use first element for error
1824  s_MatchOrgname(taxname, *(hybrid.front()), mismatch);
1825  }
1826  }
1827  break;
// partial variant: compare taxname with each element's name directly
1829  {
1830  const auto& partial = orgname.GetName().GetPartial().Get();
1831  for (auto it : partial) {
1832  if (it->IsSetName()) {
1833  mismatch = it->GetName();
1834  rval = NStr::Equal(taxname, mismatch);
1835  if (rval) {
1836  break;
1837  }
1838  }
1839  }
1840  if (!rval && partial.size() > 1 &&
1841  partial.front()->IsSetName()) {
1842  // use first element for error
1843  mismatch = partial.front()->GetName();
1844  }
1845  }
1846  break;
1847  default:
1848  break;
1849  }
1850  return rval;
1851 }
1852 
1853 
// Posts a "Taxname does not match orgname" diagnostic, quoting both the taxname and the
// mismatching name reported by s_MatchOrgname, when s_MatchOrgname returns false.
// NOTE(review): the function-name line and the head of the report call are not visible in
// this listing.
1855 (const string& taxname,
1856  const COrgName& orgname,
1857  const CSerialObject& obj,
1858  const CSeq_entry *ctx)
1859 {
1860  string mismatch;
1861  if (!s_MatchOrgname(taxname, orgname, mismatch)) {
1863  "Taxname does not match orgname ('" + taxname + "', '" + mismatch + "')",
1864  obj, ctx);
1865  }
1866 }
1867 //LCOV_EXCL_STOP
1868 
1869 
// Validates an OrgName and its OrgMods.
//  - Collects lineage (to decide is_viral), and genus / species from a binomial name.
//  - For every OrgMod: rejects the placeholder values "N/A" and "Missing", runs the
//    subtype-specific checks in the switch, then checks the subname for unbalanced
//    parentheses and SGML.
//  - Reports problems found by COrgMod::CheckMultipleVouchers over the collected vouchers.
//  - For non-viral organisms with a strain, reports a strain that repeats the species,
//    sub-species or serovar, or that contains "<genus> <species>".
// has_taxon suppresses the "variety should only be in plants, fungi, or cyanobacteria" report.
// NOTE(review): in this listing the function-name line, most `case` labels and the heads of
// most PostObjErr calls are not visible.
1871 (const COrgName& orgname,
1872 const bool has_taxon,
1873 const CSerialObject& obj,
1874 const CSeq_entry *ctx)
1875 {
1876  bool is_viral = false;
1877  string lineage;
1878  string genus;
1879  string species;
1880  string strain;
1881  string sub_species;
1882  string serovar;
1883 
1884  if (orgname.IsSetLineage()) {
1885  lineage = orgname.GetLineage();
1886  if (NStr::StartsWith(lineage, "Viruses; ") || NStr::StartsWith(lineage, "Viroids; ")) {
1887  is_viral = true;
1888  }
1889  }
1890  if (orgname.IsSetName()) {
1891  const COrgName::TName& name = orgname.GetName();
1892  if (name.Which() == COrgName::C_Name::e_Binomial) {
1893  const CBinomialOrgName& bin = name.GetBinomial();
1894  if (bin.IsSetGenus()) {
1895  genus = bin.GetGenus();
1896  }
1897  if (bin.IsSetSpecies()) {
1898  species = bin.GetSpecies();
1899  }
1900  }
1901  }
1902  if (orgname.IsSetMod()) {
1903  bool has_strain = false;
1904  vector<string> vouchers;
1905  FOR_EACH_ORGMOD_ON_ORGNAME(omd_itr, orgname)
1906  {
1907  const COrgMod& omd = **omd_itr;
// NOTE(review): the subtype is read without an IsSetSubtype() guard - confirm this is safe.
1908  COrgMod::TSubtype subtype = omd.GetSubtype();
1909 
1910  if (omd.IsSetSubname()) {
1911  string str = omd.GetSubname();
1912  if (NStr::Equal(str, "N/A") || NStr::Equal(str, "Missing")) {
1914  "Orgmod name should not be " + str,
1915  obj, ctx);
1916  }
1917  }
1918 
1919  switch (subtype) {
1920  case 0:
1921  case 1:
1923  "Unknown orgmod subtype " + NStr::IntToString(subtype), obj, ctx);
1924  break;
// strain: remembers the value for the post-loop checks and reports a second strain qualifier
1926  if (omd.IsSetSubname()) {
1927  string str = omd.GetSubname();
1928  strain = str;
1929  if (NStr::StartsWith(str, "subsp.", NStr::eNocase)) {
1931  "Orgmod.strain should not start with subsp.",
1932  obj, ctx);
1933  } else if (NStr::StartsWith(str, "serovar", NStr::eNocase)) {
1935  "Orgmod.strain should not start with serovar",
1936  obj, ctx);
1937  } else if (!COrgMod::IsStrainValid(str)) {
1939  "Orgmod.strain should not be '" + str + "'",
1940  obj, ctx);
1941  }
1942  }
1943  if (has_strain) {
1945  "Multiple strain qualifiers on the same BioSource", obj, ctx);
1946  }
1947  has_strain = true;
1948  break;
// serovar: remembers the value for the post-loop checks
1950  if (omd.IsSetSubname()) {
1951  string str = omd.GetSubname();
1952  serovar = str;
1953  if (NStr::StartsWith(str, "subsp.", NStr::eNocase)) {
1955  "Orgmod.serovar should not start with subsp.",
1956  obj, ctx);
1957  } else if (NStr::StartsWith(str, "strain ", NStr::eNocase)) {
1959  "Orgmod.serovar should not start with strain",
1960  obj, ctx);
1961  }
1962  }
1963  break;
// sub-species: remembers the value for the post-loop checks
1965  if (omd.IsSetSubname()) {
1966  string str = omd.GetSubname();
1967  sub_species = str;
1968  if (NStr::Find(str, "subsp.") != string::npos) {
1970  "Orgmod.sub-species should not contain subsp.",
1971  obj, ctx);
1972  }
1973  }
1974  break;
// variety: reported only when the division is not "PLN", the lineage mentions none of the
// listed groups, and no taxon ID is present
1976  if ((!orgname.IsSetDiv() || !NStr::EqualNocase(orgname.GetDiv(), "PLN"))
1977  && (!orgname.IsSetLineage() ||
1978  (NStr::Find(orgname.GetLineage(), "Cyanobacteria") == string::npos
1979  && NStr::Find(orgname.GetLineage(), "Cyanobacteriota") == string::npos
1980  && NStr::Find(orgname.GetLineage(), "Myxogastria") == string::npos
1981  && NStr::Find(orgname.GetLineage(), "Oomycetes") == string::npos))) {
1982  if (!has_taxon) {
1984  "Orgmod variety should only be in plants, fungi, or cyanobacteria",
1985  obj, ctx);
1986  }
1987  }
1988  break;
// NOTE(review): GetSubname() is passed on here without an IsSetSubname() guard - confirm.
1990  ValidateSourceQualTags(omd.GetSubname(), obj, ctx);
1991  break;
1993  if ((*omd_itr)->IsSetSubname() && !NStr::IsBlank((*omd_itr)->GetSubname())) {
1994  const string& val = (*omd_itr)->GetSubname();
1995 
1996  // look for synonym/gb_synonym duplication
1997  FOR_EACH_ORGMOD_ON_ORGNAME(it2, orgname)
1998  {
1999  if ((*it2)->IsSetSubtype()
2000  && (*it2)->GetSubtype() == COrgMod::eSubtype_gb_synonym
2001  && (*it2)->IsSetSubname()
2002  && NStr::EqualNocase(val, (*it2)->GetSubname())) {
2004  "OrgMod synonym is identical to OrgMod gb_synonym",
2005  obj, ctx);
2006  }
2007  }
2008  }
2009  break;
// voucher-style modifiers: validated individually and collected for the duplicate check
// after the loop
// NOTE(review): GetSubname() is pushed without an IsSetSubname() guard - confirm.
2013  ValidateOrgModVoucher(omd, obj, ctx);
2014  vouchers.push_back(omd.GetSubname());
2015  break;
2016 
2018  if (!(*omd_itr)->IsSetSubname() ||
2019  !COrgMod::IsValidTypeMaterial((*omd_itr)->GetSubname())) {
2021  "Bad value for type_material", obj, ctx);
2022  }
2023  break;
2024 
2025  default:
2026  break;
2027  }
// generic subname checks, applied to every subtype
2028  if (omd.IsSetSubname()) {
2029  const string& subname = omd.GetSubname();
2030 
2033  "Unbalanced parentheses in orgmod '" + subname + "'",
2034  obj, ctx);
2035  }
2036  if (ContainsSgml(subname)) {
2038  "orgmod " + subname + " has SGML",
2039  obj, ctx);
2040  }
2041  }
2042  }
2043 
2044  string err = COrgMod::CheckMultipleVouchers(vouchers);
2045  if (!err.empty()) PostObjErr(eDiag_Warning, eErr_SEQ_DESCR_IdenticalInstitutionCode, err, obj, ctx);
2046  }
2047 
// the remaining strain cross-checks are skipped for viral lineages and when no strain was seen
2048  if (is_viral) {
2049  return;
2050  }
2051  if (strain.length() < 1) {
2052  return;
2053  }
2054  if (NStr::EqualNocase(strain, species) && species.length() > 0) {
2056  "Orgmod.strain should not be species '" + species + "'",
2057  obj, ctx);
2058  }
2059  if (NStr::EqualNocase(strain, sub_species) && sub_species.length() > 0) {
2061  "Orgmod.strain should not be subspecies '" + sub_species + "'",
2062  obj, ctx);
2063  }
2064  if (NStr::EqualNocase(strain, serovar) && serovar.length() > 0) {
2066  "Orgmod.strain should not be serovar '" + serovar + "'",
2067  obj, ctx);
2068  }
2069  if (NStr::FindNoCase(strain, genus + " " + species) != string::npos && genus.length() > 0 && species.length() > 0) {
2071  "Orgmod.strain should not contain '" + genus + " " + species + "'",
2072  obj, ctx);
2073  }
2074 }
2075 
2076 
// Returns true only when bsh is valid, the descriptor iterator `sd` is positioned on a
// descriptor, and that descriptor's MolInfo has biomol == CMolInfo::eBiomol_other.
// Returns false in every other case.
// NOTE(review): the signature line and the declaration of `sd` are not visible in this listing.
2078 {
2079  if (bsh) {
2081  if (sd) {
2082  const CSeqdesc::TMolinfo& molinfo = sd->GetMolinfo();
2083  if (molinfo.CanGetBiomol() &&
2084  molinfo.GetBiomol() == CMolInfo::eBiomol_other) {
2085  return true;
2086  }
2087  }
2088  }
2089  return false;
2090 }
2091 
2092 
// Returns true when `source` has no genome set (or has genome genomic / unknown) and the
// organism is not viral. "Viral" here means division "PHG", or a lineage starting with
// "Viruses; " or "Viroids; ". A source without an Org is treated as non-viral.
// Returns false for any other genome value.
// NOTE(review): the signature line is not visible in this listing.
2094 {
2095  bool rval = false;
2096 
2097  if (!source.IsSetGenome()
2098  || source.GetGenome() == CBioSource::eGenome_genomic
2099  || source.GetGenome() == CBioSource::eGenome_unknown) {
2100  bool is_viral = false;
2101  if (source.IsSetOrg()) {
2102  const COrg_ref& org = source.GetOrg();
// the lineage is only consulted when the division is not "PHG"
2103  if (org.IsSetDivision() && NStr::Equal(org.GetDivision(), "PHG")) {
2104  is_viral = true;
2105  } else if (org.IsSetLineage()) {
2106  const string& lineage = org.GetLineage();
2107  if (NStr::StartsWith(lineage, "Viruses; ")
2108  || NStr::StartsWith(lineage, "Viroids; ")) {
2109  is_viral = true;
2110  }
2111  }
2112  }
2113  rval = !is_viral;
2114  }
2115  return rval;
2116 }
2117 
2118 
// Returns true when `source` has a lineage that starts with "Bacteria; " (case-insensitive);
// false when the lineage is unset or starts with anything else.
// NOTE(review): the signature line is not visible in this listing.
2120 {
2121  bool rval = false;
2122 
2123  if (source.IsSetLineage()) {
2124  string lineage = source.GetLineage();
2125  if (NStr::StartsWith(lineage, "Bacteria; ", NStr::eNocase)) {
2126  rval = true;
2127  }
2128  }
2129  return rval;
2130 }
2131 
2132 
// Returns true when `source` has a lineage that starts with "Archaea; " (case-insensitive);
// false when the lineage is unset or starts with anything else.
// NOTE(review): the signature line is not visible in this listing.
2134 {
2135  bool rval = false;
2136 
2137  if (source.IsSetLineage()) {
2138  string lineage = source.GetLineage();
2139  if (NStr::StartsWith(lineage, "Archaea; ", NStr::eNocase)) {
2140  rval = true;
2141  }
2142  }
2143  return rval;
2144 }
2145 
2146 
// Returns true when any User descriptor reachable from bsh is a "DBLink" object containing a
// field labelled "BioSample" whose data is a string or a list of strings.
// The descriptor scan stops at the first such object.
// NOTE(review): the signature line is not visible in this listing.
2148 {
2149  bool rval = false;
2150  CSeqdesc_CI d(bsh, CSeqdesc::e_User);
2151  while (d && !rval) {
2152  const auto & user = d->GetUser();
2153  if (user.IsSetType() && user.GetType().IsStr() && NStr::Equal(user.GetType().GetStr(), "DBLink")) {
2154  for (auto f : user.GetData()) {
2155  if (f->IsSetLabel() && f->GetLabel().IsStr() && NStr::Equal(f->GetLabel().GetStr(), "BioSample")
2156  && f->IsSetData() && (f->GetData().IsStr() || f->GetData().IsStrs())) {
2157  rval = true;
2158  break;
2159  }
2160  }
2161  }
2162  ++d;
2163  }
2164  return rval;
2165 }
2166 
2167 
// Validates a BioSource in the context of the Bioseq (bsh) it applies to. The checks, in order:
//  - a focus BioSource descriptor needs a BioSource feature (nucleotide, non-segmented only),
//  - synthetic origin vs. MolInfo biomol,
//  - HIV taxnames vs. molecule type / proviral genome,
//  - non-viral "complete genome" titles,
//  - synthetic construct / artificial origin consistency with MolInfo,
//  - cRNA notes on subsources and orgmods vs. MolInfo biomol,
//  - genomic DNA with a viral lineage that says "no DNA stage",
//  - Bacteria / Archaea with a BioSample link need strain, isolate or environmental sample
//    (only when IsGpipe() or IsIndexerVersion()).
// The title is taken from the Title descriptor when there is one, otherwise it is generated
// with CDeflineGenerator.
// NOTE(review): in this listing the function-name line, the declarations of `mi`, some
// condition lines and the heads of most PostObjErr calls are not visible.
2169 (const CBioSource& source,
2170 const CSerialObject& obj,
2171 const CSeq_entry *ctx,
2172 const CBioseq_Handle& bsh)
2173 {
2174  m_biosource_kind = source;
2175 
2176  const auto & inst = bsh.GetInst();
2177 
2178  if (source.IsSetIs_focus()) {
2179  // skip proteins, segmented bioseqs, or segmented parts
2180  if (!bsh.IsAa() &&
2181  !(inst.GetRepr() == CSeq_inst::eRepr_seg) &&
2182  !(GetAncestor(*(bsh.GetCompleteBioseq()), CBioseq_set::eClass_parts) != 0)) {
2183  if (!CFeat_CI(bsh, CSeqFeatData::e_Biosrc)) {
2184  PostObjErr(eDiag_Error,
2186  "BioSource descriptor has focus, "
2187  "but no BioSource feature", obj, ctx);
2188  }
2189  }
2190  }
2191  if (source.CanGetOrigin() &&
2192  source.GetOrigin() == CBioSource::eOrigin_synthetic) {
2193  if (!IsOtherDNA(bsh) && !bsh.IsAa()) {
2195  "Molinfo-biomol other should be used if "
2196  "Biosource-location is synthetic", obj, ctx);
2197  }
2198  }
2199 
2200  // check locations for HIV biosource
2201  if (inst.IsSetMol()
2202  && source.IsSetOrg() && source.GetOrg().IsSetTaxname()
2203  && (NStr::EqualNocase(source.GetOrg().GetTaxname(), "Human immunodeficiency virus")
2204  || NStr::EqualNocase(source.GetOrg().GetTaxname(), "Human immunodeficiency virus 1")
2205  || NStr::EqualNocase(source.GetOrg().GetTaxname(), "Human immunodeficiency virus 2"))) {
2206 
2207  if (inst.GetMol() == CSeq_inst::eMol_dna) {
2208  if (!source.IsSetGenome() || source.GetGenome() != CBioSource::eGenome_proviral) {
2210  "HIV with moltype DNA should be proviral",
2211  obj, ctx);
2212  }
2213  } else if (inst.GetMol() == CSeq_inst::eMol_rna) {
2215  if (mi && mi->GetMolinfo().IsSetBiomol()
2218  "HIV with mRNA molecule type is rare",
2219  obj, ctx);
2220  }
2221  }
2222  }
2223 
2225  CSeqdesc_CI ti(bsh, CSeqdesc::e_Title);
2226 
2227  string title;
2228  if ( ti ) {
2229  title = ti->GetTitle();
2230  } else {
2231  sequence::CDeflineGenerator defline_generator;
2232  title = defline_generator.GenerateDefline(bsh, sequence::CDeflineGenerator::fIgnoreExisting);
2233  }
2234 
// NOTE(review): "viral" here means a lineage starting with "Viruses; " or equal to "Viruses"
// (case-insensitive); other checks in this file also accept "Viroids; " - confirm the
// difference is intended.
2235  bool isViral = false;
2236  if (source.IsSetLineage()) {
2237  string lineage = source.GetLineage();
2238  if (NStr::StartsWith(lineage, "Viruses; ", NStr::eNocase) || NStr::EqualNocase(lineage, "Viruses")) {
2239  isViral = true;
2240  }
2241  }
2242 
2243  // look for viral completeness
2244  if (!isViral && mi && mi->IsMolinfo() && !NStr::IsBlank(title)) {
2245  const CMolInfo& molinfo = mi->GetMolinfo();
2246  if (molinfo.IsSetBiomol() && molinfo.GetBiomol() == CMolInfo::eBiomol_genomic
2248  && NStr::Find(title, "complete genome") != string::npos
2251  "Non-viral complete genome not labeled as chromosome",
2252  obj, ctx);
2253  }
2254  }
2255 
2256  if (mi) {
2257  // look for synthetic/artificial
2258  bool is_synthetic_construct = IsSyntheticConstruct(source);
2259  bool is_artificial = IsArtificial(source);
2260 
2261  if (is_synthetic_construct) {
2262  if ((!mi->GetMolinfo().IsSetBiomol()
2263  || mi->GetMolinfo().GetBiomol() != CMolInfo::eBiomol_other_genetic) && !bsh.IsAa()) {
2265  "synthetic construct should have other-genetic",
2266  obj, ctx);
2267  }
2268  if (!is_artificial) {
2270  "synthetic construct should have artificial origin",
2271  obj, ctx);
2272  }
2273  } else if (is_artificial) {
2275  "artificial origin should have other-genetic and synthetic construct",
2276  obj, ctx);
2277  }
2278  if (is_artificial) {
2279  if ((!mi->GetMolinfo().IsSetBiomol()
2281  && !bsh.IsAa()) {
2283  "artificial origin should have other-genetic",
2284  obj, ctx);
2285  }
2286  }
2287  }
2288 
2289 
2290  // validate subsources in context
2291 
2293  {
2294  if (!(*it)->IsSetSubtype()) {
2295  continue;
2296  }
2297  CSubSource::TSubtype subtype = (*it)->GetSubtype();
2298 
2299  switch (subtype) {
2301  // look for conflicting cRNA notes on subsources
2302  if (mi && (*it)->IsSetName() && NStr::EqualNocase((*it)->GetName(), "cRNA")) {
2303  const CMolInfo& molinfo = mi->GetMolinfo();
2304  if (!molinfo.IsSetBiomol()
2305  || molinfo.GetBiomol() != CMolInfo::eBiomol_cRNA) {
2307  "cRNA note conflicts with molecule type",
2308  obj, ctx);
2309  } else {
2311  "cRNA note redundant with molecule type",
2312  obj, ctx);
2313  }
2314  }
2315  break;
2316  default:
2317  break;
2318  }
2319  }
2320 
2321 
2322  // look at orgref in context
2323  if (source.IsSetOrg()) {
2324  const COrg_ref& orgref = source.GetOrg();
2325 
2326  if (mi) {
2327  const CMolInfo& molinfo = mi->GetMolinfo();
2328  // look for conflicting cRNA notes on orgmod
2329  if (orgref.IsSetOrgname() && orgref.GetOrgname().IsSetMod()) {
2330  for (auto it : orgref.GetOrgname().GetMod())
2331  {
2332  if (it->IsSetSubtype()
2333  && it->GetSubtype() == COrgMod::eSubtype_other
2334  && it->IsSetSubname()
2335  && NStr::EqualNocase(it->GetSubname(), "cRNA")) {
2336  if (!molinfo.IsSetBiomol()
2337  || molinfo.GetBiomol() != CMolInfo::eBiomol_cRNA) {
2339  "cRNA note conflicts with molecule type",
2340  obj, ctx);
2341  }
2342  else {
2344  "cRNA note redundant with molecule type",
2345  obj, ctx);
2346  }
2347  }
2348  }
2349  }
2350 
2351  if (orgref.IsSetLineage()) {
2352  const string& lineage = orgref.GetOrgname().GetLineage();
2353 
2354  // look for incorrect DNA stage
2355  if (molinfo.IsSetBiomol() && molinfo.GetBiomol() == CMolInfo::eBiomol_genomic
2356  && bsh.IsSetInst() && bsh.GetInst().IsSetMol() && bsh.GetInst().GetMol() == CSeq_inst::eMol_dna
2357  && NStr::StartsWith(lineage, "Viruses; ")
2358  && NStr::FindNoCase(lineage, "no DNA stage") != string::npos) {
2360  "Genomic DNA viral lineage indicates no DNA stage",
2361  obj, ctx);
2362  }
2363  }
2364  }
2365 
2366  }
2367 
// Bacteria / Archaea linked to a BioSample must carry a strain, an isolate, or an
// environmental_sample subsource.
2368  if ( (IsGpipe() || IsIndexerVersion() ) && s_IsBioSample(bsh) ) {
2369  bool is_bact = s_IsBacteria(source);
2370  bool is_arch = s_IsArchaea(source);
2371  if ( is_bact || is_arch ) {
2372  bool has_strain = false;
2373  bool has_isolate = false;
2374  bool env_sample = false;
2375  if (source.IsSetSubtype()) {
2376  ITERATE(CBioSource::TSubtype, s, source.GetSubtype())
2377  {
2378  if ((*s)->IsSetSubtype() && (*s)->GetSubtype() == CSubSource::eSubtype_environmental_sample) {
2379  env_sample = true;
2380  break;
2381  }
2382  }
2383  }
// the orgmod scan is skipped once an environmental sample was found, and stops at the first
// isolate or strain
2384  if (!env_sample && source.IsSetOrg()
2385  && source.GetOrg().IsSetOrgname()) {
2386  const auto& orgname = source.GetOrg().GetOrgname();
2387  if (orgname.IsSetMod()) {
2388  for (auto om : orgname.GetMod()) {
2389  if (om->IsSetSubtype()) {
2390  if (om->GetSubtype() == COrgMod::eSubtype_isolate) {
2391  has_isolate = true;
2392  break;
2393  }
2394  else if (om->GetSubtype() == COrgMod::eSubtype_strain) {
2395  has_strain = true;
2396  break;
2397  }
2398  }
2399  }
2400  }
2401  }
2402 
2403 
2404  if (!has_strain && !has_isolate && !env_sample) {
2405  if (is_bact) {
2407  "Bacteria should have strain or isolate or environmental sample",
2408  obj, ctx);
2409  } else if (is_arch) {
2411  "Archaea should have strain or isolate or environmental sample",
2412  obj, ctx);
2413  }
2414  }
2415  }
2416  }
2417 
2418 
2419 }
2420 
2421 
2422 
2423 
// Scans the subsources of bsrc and returns true as soon as one of them satisfies a condition;
// returns false when bsrc has no subtype list or no subsource qualifies.
// NOTE(review): the signature line and the line holding the condition tested on `sbs` are
// not visible in this listing, so the exact criterion cannot be stated here.
2425 {
2426  if (bsrc.CanGetSubtype()) {
2427  FOR_EACH_SUBSOURCE_ON_BIOSOURCE(sbs_itr, bsrc)
2428  {
2429  const CSubSource& sbs = **sbs_itr;
2431  return true;
2432  }
2433  }
2434  }
2435  return false;
2436 }
2437 
// Table of source-qualifier prefix strings. Every entry is added as a word to the
// m_SourceQualTags text matcher by the initialisation code that follows this table.
// NOTE(review): the eight *_primer_name / *_primer_seq entries have no trailing ':' while
// every other entry does - confirm this is intentional.
2438 std::string_view sm_SourceQualPrefixes[] =
2439 {
2440  "acronym:",
2441  "altitude:",
2442  "anamorph:",
2443  "authority:",
2444  "biotype:",
2445  "biovar:",
2446  "bio_material:",
2447  "breed:",
2448  "cell_line:",
2449  "cell_type:",
2450  "chemovar:",
2451  "chromosome:",
2452  "clone:",
2453  "clone_lib:",
2454  "collected_by:",
2455  "collection_date:",
2456  "common:",
2457  "country:",
2458  "cultivar:",
2459  "culture_collection:",
2460  "dev_stage:",
2461  "dosage:",
2462  "ecotype:",
2463  "endogenous_virus_name:",
2464  "environmental_sample:",
2465  "forma:",
2466  "forma_specialis:",
2467  "frequency:",
2468  "fwd_pcr_primer_name",
2469  "fwd_pcr_primer_seq",
2470  "fwd_primer_name",
2471  "fwd_primer_seq",
2472  "genotype:",
2473  "germline:",
2474  "group:",
2475  "haplogroup:",
2476  "haplotype:",
2477  "identified_by:",
2478  "insertion_seq_name:",
2479  "isolate:",
2480  "isolation_source:",
2481  "lab_host:",
2482  "lat_lon:",
2483  "left_primer:",
2484  "linkage_group:",
2485  "map:",
2486  "mating_type:",
2487  "metagenome_source:",
2488  "metagenomic:",
2489  "nat_host:",
2490  "pathovar:",
2491  "phenotype:",
2492  "placement:",
2493  "plasmid_name:",
2494  "plastid_name:",
2495  "pop_variant:",
2496  "rearranged:",
2497  "rev_pcr_primer_name",
2498  "rev_pcr_primer_seq",
2499  "rev_primer_name",
2500  "rev_primer_seq",
2501  "right_primer:",
2502  "segment:",
2503  "serogroup:",
2504  "serotype:",
2505  "serovar:",
2506  "sex:",
2507  "specimen_voucher:",
2508  "strain:",
2509  "subclone:",
2510  "subgroup:",
2511  "substrain:",
2512  "subtype:",
2513  "sub_species:",
2514  "synonym:",
2515  "taxon:",
2516  "teleomorph:",
2517  "tissue_lib:",
2518  "tissue_type:",
2519  "transgenic:",
2520  "transposon_name:",
2521  "type:",
2522  "variety:",
2523  "whole_replicon:",
2524 };
2525 
2526 
2528 {
2529  static std::mutex m;
2530 
2531  std::lock_guard g(m);
2532 
2533  if (m_SourceQualTags)
2534  return;
2535 
2536  m_SourceQualTags.reset(new CTextFsa(true));
2537 
2538  for (auto rec: sm_SourceQualPrefixes) {
2539  m_SourceQualTags->AddWord(string(rec));
2540  }
2541 
2542  m_SourceQualTags->Prime();
2543 }
2544 
2545 
2547 (const string& str,
2548 const CSerialObject& obj,
2549 const CSeq_entry *ctx)
2550 {
2551  if (NStr::IsBlank(str)) return;
2552 
2553  size_t str_len = str.length();
2554 
2555  int state = m_SourceQualTags->GetInitialState();
2556 
2557  for (size_t i = 0; i < str_len; ++i) {
2558  state = m_SourceQualTags->GetNextState(state, str[i]);
2559  if (m_SourceQualTags->IsMatchFound(state)) {
2560  auto match = m_SourceQualTags->GetMatches(state)[0];
2561  if (match.empty()) {
2562  match = "?";
2563  }
2564  size_t match_len = match.length();
2565 
2566  bool okay = true;
2567  if ((int)(i - match_len) >= 0) {
2568  char ch = str[i - match_len];
2569  if (!isspace((unsigned char)ch) && ch != ';') {
2570  okay = false;
2571 #if 0
2572  // look to see if there's a longer match in the list
2573  for (size_t k = 0;
2574  k < sizeof (CValidError_imp::sm_SourceQualPrefixes) / sizeof (string);
2575  k++) {
2577  if (pos != string::npos) {
2578  if (pos == 0 || isspace ((unsigned char) str[pos]) || str[pos] == ';') {
2579  okay = true;
2581  break;
2582  }
2583  }
2584  }
2585 #endif
2586  }
2587  }
2588  if (okay) {
2590  "Source note has structured tag '" + match + "'", obj, ctx);
2591  }
2592  }
2593  }
2594 }
2595 
2596 
2597 
2598 static bool x_HasTentativeName(const CUser_object &user_object)
2599 {
2600  if (!FIELD_IS_SET_AND_IS(user_object, Type, Str) ||
2601  user_object.GetType().GetStr() != "StructuredComment") {
2602  return false;
2603  }
2604 
2605  FOR_EACH_USERFIELD_ON_USEROBJECT(user_field_iter, user_object)
2606  {
2607  const CUser_field &field = **user_field_iter;
2608  if (FIELD_IS_SET_AND_IS(field, Label, Str) && FIELD_IS_SET_AND_IS(field, Data, Str)) {
2609  if (GET_FIELD(field.GetLabel(), Str) == "Tentative Name") {
2610  if (GET_FIELD(field.GetData(), Str) != "not provided") {
2611  return true;
2612  }
2613  }
2614  }
2615  }
2616 
2617  return false;
2618 }
2619 
2620 
2621 static string x_GetTentativeName(const CUser_object &user_object)
2622 {
2623  if (!FIELD_IS_SET_AND_IS(user_object, Type, Str) ||
2624  user_object.GetType().GetStr() != "StructuredComment") {
2625  return "";
2626  }
2627 
2628  FOR_EACH_USERFIELD_ON_USEROBJECT(user_field_iter, user_object)
2629  {
2630  const CUser_field &field = **user_field_iter;
2631  if (FIELD_IS_SET_AND_IS(field, Label, Str) && FIELD_IS_SET_AND_IS(field, Data, Str)) {
2632  if (GET_FIELD(field.GetLabel(), Str) == "Tentative Name") {
2633  if (GET_FIELD(field.GetData(), Str) != "not provided") {
2634  return GET_FIELD(field.GetData(), Str);
2635  }
2636  }
2637  }
2638  }
2639 
2640  return "";
2641 }
2642 
2643 
2645 (const CSeq_entry& se,
2646 vector<CConstRef<CSeqdesc> >& usr_descs,
2647 vector<CConstRef<CSeq_entry> >& desc_ctxs,
2648 vector<CConstRef<CSeq_feat> >& usr_feats)
2649 {
2650  // get source descriptors
2651  if (se.IsSetDescr()) {
2652  for (auto it : se.GetDescr().Get())
2653  {
2654  if (it->IsUser() && x_HasTentativeName(it->GetUser())) {
2655  CConstRef<CSeqdesc> desc;
2656  desc.Reset(it);
2657  usr_descs.push_back(desc);
2658  CConstRef<CSeq_entry> r_se;
2659  r_se.Reset(&se);
2660  desc_ctxs.push_back(r_se);
2661  }
2662  }
2663  }
2664  // also get features
2665  if (se.IsSetAnnot()) {
2666  for (auto annot_it : se.GetAnnot()) {
2667  if (annot_it->IsFtable()) {
2668  for (auto feat_it : annot_it->GetData().GetFtable()) {
2669  if (feat_it->IsSetData() && feat_it->GetData().IsUser()
2670  && x_HasTentativeName(feat_it->GetData().GetUser())) {
2671  CConstRef<CSeq_feat> feat;
2672  feat.Reset(feat_it);
2673  usr_feats.push_back(feat);
2674  }
2675  }
2676  }
2677  }
2678  }
2679 
2680  // if set, recurse
2681  if (se.IsSet()) {
2683  {
2684  GatherTentativeName(**it, usr_descs, desc_ctxs, usr_feats);
2685  }
2686  }
2687 }
2688 
2689 
2690 const size_t kDefaultChunkSize = 1000;
2692 {
2693  vector< CRef<COrg_ref> > org_rq_list = tval.GetTaxonomyLookupRequest();
2694 
2695  if (org_rq_list.size() > 0) {
2696 
2697  size_t chunk_size = kDefaultChunkSize;
2698  size_t i = 0;
2699  while (i < org_rq_list.size()) {
2700  size_t len = min(chunk_size, org_rq_list.size() - i);
2701  vector< CRef<COrg_ref> > tmp_rq(org_rq_list.begin() + i, org_rq_list.begin() + i + len);
2702  CRef<CTaxon3_reply> reply = m_pContext->m_taxon_update(tmp_rq);
2703  if (!reply || !reply->IsSetReply()) {
2704  if (chunk_size > 20) {
2705  chunk_size = chunk_size / 2;
2706  } else {
2708  "Taxonomy service connection failure", *(tval.GetTopReportObject()));
2709  break;
2710  }
2711  } else {
2712  tval.ReportIncrementalTaxLookupErrors(*reply, *this, (IsPatent() || IsINSDInSep()), i);
2713  i += chunk_size;
2714  }
2715  }
2716  }
2717 }
2718 
2721 {
2722  vector<CRef<COrg_ref> > org_rq_list = tval.GetSpecificHostLookupRequest(false);
2723 
2724  if (org_rq_list.size() == 0) {
2725  return;
2726  }
2727 
2728  size_t chunk_size = kDefaultChunkSize;
2729  size_t i = 0;
2730  while (i < org_rq_list.size()) {
2731  size_t len = min(chunk_size, org_rq_list.size() - i);
2732  vector< CRef<COrg_ref> > tmp_rq(org_rq_list.begin() + i, org_rq_list.begin() + i + len);
2733  CRef<CTaxon3_reply> tmp_spec_host_reply = m_pContext->m_taxon_update(tmp_rq);
2734  string err_msg;
2735  if (tmp_spec_host_reply) {
2736  err_msg = tval.IncrementalSpecificHostMapUpdate(tmp_rq, *tmp_spec_host_reply);
2737  } else {
2738  err_msg = "Connection to taxonomy failed";
2739  }
2740  if (!NStr::IsBlank(err_msg)) {
2742  return;
2743  }
2744  i += chunk_size;
2745  }
2746 
2747  tval.ReportSpecificHostErrors(*this);
2748 }
2749 
2750 
2752 (CTaxValidationAndCleanup& tval, TTaxId descTaxID)
2753 {
2754  vector<CRef<COrg_ref> > org_rq_list = tval.GetStrainLookupRequest();
2755 
2756  if (org_rq_list.size() == 0) {
2757  return;
2758  }
2759 
2760  size_t chunk_size = kDefaultChunkSize;
2761  size_t i = 0;
2762  while (i < org_rq_list.size()) {
2763  size_t len = min(chunk_size, org_rq_list.size() - i);
2764  vector< CRef<COrg_ref> > tmp_rq(org_rq_list.begin() + i, org_rq_list.begin() + i + len);
2765  CRef<CTaxon3_reply> tmp_spec_host_reply = m_pContext->m_taxon_update(tmp_rq);
2766  string err_msg = tval.IncrementalStrainMapUpdate(tmp_rq, *tmp_spec_host_reply, descTaxID);
2767  if (!NStr::IsBlank(err_msg)) {
2769  return;
2770  }
2771  i += chunk_size;
2772  }
2773 
2774  tval.ReportStrainErrors(*this);
2775 }
2776 
2777 
2779 {
2780  auto pTval = x_CreateTaxValidator();
2781  pTval->Init(se);
2782  ValidateSpecificHost(*pTval);
2783 }
2784 
2785 
2787 {
2788  const string err_str = error.IsSetMessage() ? error.GetMessage() : "?";
2789 
2790  if (NStr::Equal(err_str, "Organism not found")) {
2791  return true;
2792  } else {
2793  return false;
2794  }
2795 }
2796 
2797 
2799 {
2800  vector<CConstRef<CSeqdesc> > src_descs;
2801  vector<CConstRef<CSeq_entry> > desc_ctxs;
2802  vector<CConstRef<CSeq_feat> > src_feats;
2803 
2804  GatherTentativeName(se, src_descs, desc_ctxs, src_feats);
2805 
2806  // request list for taxon3
2807  vector< CRef<COrg_ref> > org_rq_list;
2808 
2809  // first do descriptors
2810  vector<CConstRef<CSeqdesc> >::iterator desc_it = src_descs.begin();
2811  vector<CConstRef<CSeq_entry> >::iterator ctx_it = desc_ctxs.begin();
2812  while (desc_it != src_descs.end() && ctx_it != desc_ctxs.end()) {
2813  const string& taxname = x_GetTentativeName((*desc_it)->GetUser());
2814  CRef<COrg_ref> rq(new COrg_ref);
2815  rq->SetTaxname(taxname);
2816  org_rq_list.push_back(rq);
2817 
2818  ++desc_it;
2819  ++ctx_it;
2820  }
2821 
2822  // now do features
2823  vector<CConstRef<CSeq_feat> >::iterator feat_it = src_feats.begin();
2824  while (feat_it != src_feats.end()) {
2825  const string& taxname = x_GetTentativeName((*feat_it)->GetData().GetUser());
2826  CRef<COrg_ref> rq(new COrg_ref);
2827  rq->SetTaxname(taxname);
2828  org_rq_list.push_back(rq);
2829 
2830  ++feat_it;
2831  }
2832 
2833  if (org_rq_list.empty()) {
2834  return;
2835  }
2836 
2837  CRef<CTaxon3_reply> reply = m_pContext->m_taxon_update(org_rq_list);
2838  if (!reply || !reply->IsSetReply()) {
2840  "Taxonomy service connection failure", se);
2841  }
2842 
2843  const auto& rlist = reply->GetReply();
2844  CTaxon3_reply::TReply::const_iterator reply_it = rlist.begin();
2845 
2846  // process descriptor responses
2847  desc_it = src_descs.begin();
2848  ctx_it = desc_ctxs.begin();
2849  size_t pos = 0;
2850 
2851  while (reply_it != rlist.end()
2852  && desc_it != src_descs.end()
2853  && ctx_it != desc_ctxs.end()) {
2854  if ((*reply_it)->IsError()) {
2855  if (IsOrgNotFound((*reply_it)->GetError())) {
2857  "Taxonomy lookup failed for Tentative Name '" + org_rq_list[pos]->GetTaxname() + "'",
2858  **desc_it, *ctx_it);
2859  } else {
2860  HandleTaxonomyError((*reply_it)->GetError(),
2861  eErr_SEQ_DESCR_BadTentativeName, **desc_it, *ctx_it);
2862  }
2863  }
2864  ++reply_it;
2865  ++desc_it;
2866  ++ctx_it;
2867  ++pos;
2868  }
2869 
2870  // process feat responses
2871  feat_it = src_feats.begin();
2872  while (reply_it != rlist.end()
2873  && feat_it != src_feats.end()) {
2874  if ((*reply_it)->IsError()) {
2875  if (IsOrgNotFound((*reply_it)->GetError())) {
2877  "Taxonomy lookup failed for Tentative Name '" + org_rq_list[pos]->GetTaxname() + "'",
2878  **feat_it);
2879  } else {
2880  HandleTaxonomyError((*reply_it)->GetError(),
2881  eErr_SEQ_DESCR_BadTentativeName, **feat_it);
2882  }
2883  }
2884  ++reply_it;
2885  ++feat_it;
2886  ++pos;
2887  }
2888 }
2889 
2891  const EErrType type, const CSeqdesc& desc, const CSeq_entry* entry)
2892 {
2893  const string err_str = error.IsSetMessage() ? error.GetMessage() : "?";
2894 
2895  if (NStr::Equal(err_str, "Organism not found")) {
2896  string msg = "Organism not found in taxonomy database";
2897  if (error.IsSetOrg()) {
2898  const auto& e_org = error.GetOrg();
2899  const auto& d_org = desc.GetSource().GetOrg();
2900  if (e_org.IsSetTaxname() &&
2901  !NStr::Equal(e_org.GetTaxname(), "Not valid") &&
2902  (!d_org.IsSetTaxname() ||
2903  !NStr::Equal(d_org.GetTaxname(), e_org.GetTaxname()))) {
2904  msg += " (suggested:" + e_org.GetTaxname() + ")";
2905  }
2906  }
2908  msg,
2909  desc, entry);
2910  } else if (NStr::Equal(err_str, kInvalidReplyMsg)) {
2912  err_str,
2913  desc, entry);
2914  } else if (NStr::Find(err_str, "ambiguous name") != NPOS) {
2916  "Taxonomy lookup failed with message '" + err_str + "'",
2917  desc, entry);
2918  } else {
2919  PostObjErr(eDiag_Warning, type,
2920  "Taxonomy lookup failed with message '" + err_str + "'",
2921  desc, entry);
2922  }
2923 }
2924 
2926  const EErrType type, const CSeq_feat& feat)
2927 {
2928  const string err_str = error.IsSetMessage() ? error.GetMessage() : "?";
2929 
2930  if (NStr::Equal(err_str, kInvalidReplyMsg)) {
2932  err_str,
2933  feat);
2934  } else if (NStr::Find(err_str, "ambiguous name") != NPOS) {
2936  "Taxonomy lookup failed with message '" + err_str + "'",
2937  feat);
2938  } else {
2939  PostErr(eDiag_Warning, type,
2940  "Taxonomy lookup failed with message '" + err_str + "'",
2941  feat);
2942  }
2943 }
2944 
2946  const string& host, const COrg_ref& org)
2947 {
2948  const string err_str = error.IsSetMessage() ? error.GetMessage() : "?";
2949 
2950  if (NStr::Equal(err_str, "Organism not found")) {
2952  "Organism not found in taxonomy database",
2953  org);
2954  } else if (NStr::FindNoCase(err_str, "ambiguous") != string::npos) {
2956  "Specific host value is ambiguous: " + host, org);
2957  } else if (NStr::Equal(err_str, kInvalidReplyMsg)) {
2959  err_str,
2960  org);
2961  } else {
2963  "Invalid value for specific host: " + host, org);
2964  }
2965 }
2966 
2967 
2968 /*
2969 static bool StrainCheckCallback(const string& organism, const string& strain)
2970 
2971 {
2972  CTaxon3 taxon3(CTaxon3::initialize::yes);
2973  auto responder = [&taxon3](const vector<CRef<COrg_ref>>& request)->CRef<CTaxon3_reply>
2974  {
2975  CRef<CTaxon3_reply> reply = taxon3.SendOrgRefList(request);
2976  return reply;
2977  };
2978  return CStrainRequest::StrainContainsTaxonInfo(organism, strain, responder);
2979 }
2980 */
2981 
2982 
2983 static bool s_init_NewTaxVal(void)
2984 {
2985  if (! CNcbiApplication::Instance()) {
2986  return false;
2987  }
2988 
2990  string fromEnv = env.Get("NCBI_NEW_STRAIN_VALIDATION");
2991  NStr::ToLower(fromEnv);
2992  if (fromEnv == "true") {
2993  return true;
2994  } else if (fromEnv == "false") {
2995  return false;
2996  }
2997 
2998  return false;
2999 }
3000 
3001 
3002 static bool NCBI_NewTaxVal(void)
3003 {
3004  static bool value = s_init_NewTaxVal();
3005  return value;
3006 }
3007 
3008 
3009 /*
3010 static CRef<CTaxon3_reply> ExploreStrainsCallback (const vector<CRef<COrg_ref>>& request)
3011 
3012 {
3013  CTaxon3 taxon3(CTaxon3::initialize::yes);
3014  CRef<CTaxon3_reply> reply = taxon3.SendOrgRefList(request);
3015 
3016  // CRef<CTaxon3_reply> reply = m_pContext->m_taxon_update(request);
3017  // cerr << "TaxonReply: " << MSerial_AsnText << reply << endl;
3018 
3019  return reply;
3020 }
3021 */
3022 
3023 
3025 {
3026  auto pTval = x_CreateTaxValidator();
3027 
3028  pTval->Init(se);
3029 
3030  ValidateOrgRefs(*pTval);
3031 
3032  // Now look at specific-host values
3033  ValidateSpecificHost(*pTval);
3034 
3035  // Commented out until TM-725 is resolved
3036  if (NCBI_NewTaxVal()) {
3038  [this] (const vector<CRef<COrg_ref>>& request) -> CRef<CTaxon3_reply>
3039  { return m_pContext->m_taxon_update(request);});
3040  } else {
3041  ValidateStrain(*pTval, pTval->m_descTaxID);
3042  }
3043 
3044  ValidateTentativeName(se);
3045 }
3046 
3047 
3049 {
3050  auto pTval = x_CreateTaxValidator();
3051  pTval->CheckOneOrg(org, genome, *this);
3052 }
3053 
3054 
3055 CPCRSet::CPCRSet(size_t pos) : m_OrigPos(pos)
3056 {
3057 }
3058 
3059 
3061 {
3062 }
3063 
3064 
3066 {
3067  m_SetList.clear();
3068 }
3069 
3070 
3072 {
3073  for (size_t i = 0; i < m_SetList.size(); i++) {
3074  delete m_SetList[i];
3075  }
3076  m_SetList.clear();
3077 }
3078 
3079 
3080 void CPCRSetList::AddFwdName(string name)
3081 {
3082  unsigned int pcr_num = 0;
3083  if (NStr::StartsWith(name, "(") && NStr::EndsWith(name, ")") && NStr::Find(name, ",") != string::npos) {
3084  name = name.substr(1, name.length() - 2);
3085  vector<string> mult_names;
3086  NStr::Split(name, ",", mult_names, 0);
3087  unsigned int name_num = 0;
3088  while (name_num < mult_names.size()) {
3089  while (pcr_num < m_SetList.size() && !NStr::IsBlank(m_SetList[pcr_num]->GetFwdName())) {
3090  pcr_num++;
3091  }
3092  if (pcr_num == m_SetList.size()) {
3093  m_SetList.push_back(new CPCRSet(pcr_num));
3094  }
3095  m_SetList[pcr_num]->SetFwdName(mult_names[name_num]);
3096  name_num++;
3097  pcr_num++;
3098  }
3099  } else {
3100  while (pcr_num < m_SetList.size() && !NStr::IsBlank(m_SetList[pcr_num]->GetFwdName())) {
3101  pcr_num++;
3102  }
3103  if (pcr_num == m_SetList.size()) {
3104  m_SetList.push_back(new CPCRSet(pcr_num));
3105  }
3106  m_SetList[pcr_num]->SetFwdName(name);
3107  }
3108 }
3109 
3110 
3111 void CPCRSetList::AddRevName(string name)
3112 {
3113  unsigned int pcr_num = 0;
3114  if (NStr::StartsWith(name, "(") && NStr::EndsWith(name, ")") && NStr::Find(name, ",") != string::npos) {
3115  name = name.substr(1, name.length() - 2);
3116  vector<string> mult_names;
3117  NStr::Split(name, ",", mult_names, 0);
3118  unsigned int name_num = 0;
3119  while (name_num < mult_names.size()) {
3120  while (pcr_num < m_SetList.size() && !NStr::IsBlank(m_SetList[pcr_num]->GetRevName())) {
3121  pcr_num++;
3122  }
3123  if (pcr_num == m_SetList.size()) {
3124  m_SetList.push_back(new CPCRSet(pcr_num));
3125  }
3126  m_SetList[pcr_num]->SetRevName(mult_names[name_num]);
3127  name_num++;
3128  pcr_num++;
3129  }
3130  } else {
3131  while (pcr_num < m_SetList.size() && !NStr::IsBlank(m_SetList[pcr_num]->GetRevName())) {
3132  pcr_num++;
3133  }
3134  if (pcr_num == m_SetList.size()) {
3135  m_SetList.push_back(new CPCRSet(pcr_num));
3136  }
3137  m_SetList[pcr_num]->SetRevName(name);
3138  }
3139 }
3140 
3141 
3142 void CPCRSetList::AddFwdSeq(string name)
3143 {
3144  unsigned int pcr_num = 0;
3145  if (NStr::StartsWith(name, "(") && NStr::EndsWith(name, ")") && NStr::Find(name, ",") != string::npos) {
3146  name = name.substr(1, name.length() - 2);
3147  vector<string> mult_names;
3148  NStr::Split(name, ",", mult_names, 0);
3149  unsigned int name_num = 0;
3150  while (name_num < mult_names.size()) {
3151  while (pcr_num < m_SetList.size() && !NStr::IsBlank(m_SetList[pcr_num]->GetFwdSeq())) {
3152  pcr_num++;
3153  }
3154  if (pcr_num == m_SetList.size()) {
3155  m_SetList.push_back(new CPCRSet(pcr_num));
3156  }
3157  m_SetList[pcr_num]->SetFwdSeq(mult_names[name_num]);
3158  name_num++;
3159  pcr_num++;
3160  }
3161  } else {
3162  while (pcr_num < m_SetList.size() && !NStr::IsBlank(m_SetList[pcr_num]->GetFwdSeq())) {
3163  pcr_num++;
3164  }
3165  if (pcr_num == m_SetList.size()) {
3166  m_SetList.push_back(new CPCRSet(pcr_num));
3167  }
3168  m_SetList[pcr_num]->SetFwdSeq(name);
3169  }
3170 }
3171 
3172 
3173 void CPCRSetList::AddRevSeq(string name)
3174 {
3175  unsigned int pcr_num = 0;
3176  if (NStr::StartsWith(name, "(") && NStr::EndsWith(name, ")") && NStr::Find(name, ",") != string::npos) {
3177  name = name.substr(1, name.length() - 2);
3178  vector<string> mult_names;
3179  NStr::Split(name, ",", mult_names, 0);
3180  unsigned int name_num = 0;
3181  while (name_num < mult_names.size()) {
3182  while (pcr_num < m_SetList.size() && !NStr::IsBlank(m_SetList[pcr_num]->GetRevSeq())) {
3183  pcr_num++;
3184  }
3185  if (pcr_num == m_SetList.size()) {
3186  m_SetList.push_back(new CPCRSet(pcr_num));
3187  }
3188  m_SetList[pcr_num]->SetRevSeq(mult_names[name_num]);
3189  name_num++;
3190  pcr_num++;
3191  }
3192  } else {
3193  while (pcr_num < m_SetList.size() && !NStr::IsBlank(m_SetList[pcr_num]->GetRevSeq())) {
3194  pcr_num++;
3195  }
3196  if (pcr_num == m_SetList.size()) {
3197  m_SetList.push_back(new CPCRSet(pcr_num));
3198  }
3199  m_SetList[pcr_num]->SetRevSeq(name);
3200  }
3201 }
3202 
3203 
3204 static bool s_PCRSetCompare(
3205  const CPCRSet* p1,
3206  const CPCRSet* p2
3207  )
3208 
3209 {
3210  int compare = NStr::CompareNocase(p1->GetFwdSeq(), p2->GetFwdSeq());
3211  if (compare < 0) {
3212  return true;
3213  } else if (compare > 0) {
3214  return false;
3215  } else if ((compare = NStr::CompareNocase(p1->GetRevSeq(), p2->GetRevSeq())) < 0) {
3216  return true;
3217  } else if (compare > 0) {
3218  return false;
3219  } else if ((compare = NStr::CompareNocase(p1->GetFwdName(), p2->GetFwdName())) < 0) {
3220  return true;
3221  } else if (compare > 0) {
3222  return false;
3223  } else if ((compare = NStr::CompareNocase(p1->GetRevName(), p2->GetRevName())) < 0) {
3224  return true;
3225  } else if (p1->GetOrigPos() < p2->GetOrigPos()) {
3226  return true;
3227  } else {
3228  return false;
3229  }
3230 }
3231 
3232 
3233 static bool s_PCRSetEqual(
3234  const CPCRSet* p1,
3235  const CPCRSet* p2
3236  )
3237 
3238 {
3239  if (NStr::EqualNocase(p1->GetFwdSeq(), p2->GetFwdSeq())
3240  && NStr::EqualNocase(p1->GetRevSeq(), p2->GetRevSeq())
3241  && NStr::EqualNocase(p1->GetFwdName(), p2->GetFwdName())
3242  && NStr::EqualNocase(p1->GetRevName(), p2->GetRevName())) {
3243  return true;
3244  } else {
3245  return false;
3246  }
3247 }
3248 
3249 
3251 {
3252  stable_sort(m_SetList.begin(),
3253  m_SetList.end(),
3254  s_PCRSetCompare);
3255 
3256  return seq_mac_is_unique(m_SetList.begin(),
3257  m_SetList.end(),
3258  s_PCRSetEqual);
3259 
3260 }
3261 
3262 
3263 static bool s_PCRPrimerLess(const CPCRPrimer& p1, const CPCRPrimer& p2)
3264 {
3265  if (!p1.IsSetSeq() && p2.IsSetSeq()) {
3266  return true;
3267  } else if (p1.IsSetSeq() && !p2.IsSetSeq()) {
3268  return false;
3269  } else if (p1.IsSetSeq() && p2.IsSetSeq()) {
3270  int compare = NStr::CompareNocase(p1.GetSeq().Get(), p2.GetSeq().Get());
3271  if (compare < 0) {
3272  return true;
3273  } else if (compare > 0) {
3274  return false;
3275  }
3276  }
3277  if (!p1.IsSetName() && p2.IsSetName()) {
3278  return true;
3279  } else if (p1.IsSetName() && !p2.IsSetName()) {
3280  return false;
3281  } else if (p1.IsSetName() && p2.IsSetName()) {
3282  return (NStr::CompareNocase(p1.GetName().Get(), p2.GetName().Get()) < 0);
3283  } else {
3284  return false;
3285  }
3286 }
3287 
3288 
3289 static bool s_PCRPrimerSetLess(const CPCRPrimerSet& s1, const CPCRPrimerSet& s2)
3290 {
3291  if (!s1.IsSet() && s2.IsSet()) {
3292  return true;
3293  } else if (s1.IsSet() && !s2.IsSet()) {
3294  return false;
3295  } else if (!s1.IsSet() && !s2.IsSet()) {
3296  return false;
3297  } else if (s1.Get().size() < s2.Get().size()) {
3298  return true;
3299  } else if (s1.Get().size() > s2.Get().size()) {
3300  return false;
3301  } else {
3302  auto it1 = s1.Get().begin();
3303  auto it2 = s2.Get().begin();
3304  while (it1 != s1.Get().end()) {
3305  if (s_PCRPrimerLess(**it1, **it2)) {
3306  return true;
3307  } else if (s_PCRPrimerLess(**it2, **it1)) {
3308  return false;
3309  } else {
3310  // the two are equal, continue comparisons
3311  }
3312  ++it1;
3313  ++it2;
3314  }
3315  return false;
3316  }
3317 }
3318 
3319 
3320 static bool s_PCRReactionLess(
3323 )
3324 
3325 {
3326  const CPCRReaction& p1 = *pp1;
3327  const CPCRReaction& p2 = *pp2;
3328  if (!p1.IsSetForward() && p2.IsSetForward()) {
3329  return true;
3330  } else if (p1.IsSetForward() && !p2.IsSetForward()) {
3331  return false;
3332  } else if (p2.IsSetForward() && p1.IsSetForward()) {
3333  if (s_PCRPrimerSetLess(p1.GetForward(), p2.GetForward())) {
3334  return true;
3335  } else if (s_PCRPrimerSetLess(p2.GetForward(), p1.GetForward())) {
3336  return false;
3337  }
3338  }
3339  if (!p1.IsSetReverse() && p2.IsSetReverse()) {
3340  return true;
3341  } else if (p1.IsSetReverse() && !p2.IsSetReverse()) {
3342  return false;
3343  } else if (p1.IsSetReverse() && p2.IsSetReverse()) {
3344  if (s_PCRPrimerSetLess(p1.GetReverse(), p2.GetReverse())) {
3345  return true;
3346  } else if (s_PCRPrimerSetLess(p2.GetReverse(), p1.GetReverse())) {
3347  return false;
3348  }
3349  }
3350  return false;
3351 }
3352 
3354 {
3355  template <typename T>
3356  bool operator()(T l, T r) const
3357  {
3358  _ASSERT(l.NotEmpty());
3359  _ASSERT(r.NotEmpty());
3360 
3361  return s_PCRReactionLess(l, r);
3362  }
3363 };
3364 
3366 
3368 {
3369  if (!primers.IsSet() || primers.Get().size() < 2) {
3370  return true;
3371  }
3372  TPCRReactionSet already_seen;
3373  for (auto it : primers.Get()) {
3374  if (already_seen.find(it) != already_seen.end()) {
3375  return false;
3376  }
3377  already_seen.insert(it);
3378  }
3379  return true;
3380 }
3381 
3382 
3383 
3384 // ===== for validating institution and collection codes in specimen-voucher, ================
3386 {
3387 
3388  if (!orgmod.IsSetSubtype() || !orgmod.IsSetSubname() || NStr::IsBlank(orgmod.GetSubname())) {
3389  return;
3390  }
3391 
3392  bool use_geo_loc_name = CSubSource::NCBI_UseGeoLocNameForCountry();
3393 
3394  int subtype = orgmod.GetSubtype();
3395  string val = orgmod.GetSubname();
3396 
3397  string error;
3398  switch (subtype) {
3401  break;
3404  break;
3407  break;
3408  default:
3409  break;
3410  }
3411 
3412  vector<string> error_list;
3413  NStr::Split(error, "\n", error_list, 0);
3414  ITERATE(vector<string>, err, error_list) {
3415  if (NStr::IsBlank(*err)) {
3416  // do nothing
3417  } else if (NStr::FindNoCase(*err, "should be structured") != string::npos) {
3419  } else if (NStr::FindNoCase(*err, "missing institution code") != string::npos) {
3421  } else if (NStr::FindNoCase(*err, "missing specific identifier") != string::npos) {
3423  } else if (NStr::FindNoCase(*err, "should be") != string::npos) {
3424  EDiagSev level = eDiag_Info;
3425  if (NStr::StartsWith(*err, "DNA")) {
3426  level = eDiag_Warning;
3427  }
3428  PostObjErr(level, eErr_SEQ_DESCR_WrongVoucherType, *err, obj, ctx);
3429  } else if (NStr::StartsWith(*err, "Personal")) {
3431  *err, obj, ctx);
3432  } else if (NStr::FindNoCase(*err, "should not be qualified with a <COUNTRY> designation") != string::npos) {
3433  if (use_geo_loc_name) {
3435  } else {
3437  }
3438  } else if (NStr::FindNoCase(*err, "needs to be qualified with a <COUNTRY> designation") != string::npos) {
3440  } else if (NStr::FindNoCase(*err, " exists, but collection ") != string::npos) {
3442  } else {
3444  }
3445  }
3446 }
3447 
3448 
3449 unique_ptr<CTaxValidationAndCleanup> CValidError_imp::x_CreateTaxValidator() const
3450 {
3451  #if 0
3452  if (m_taxon) {
3453  auto taxFunc = [this](const vector<CRef<COrg_ref>>& orgRefs)->CRef<CTaxon3_reply> {
3454  return m_taxon->SendOrgRefList(orgRefs);
3455  };
3456  return make_unique<CTaxValidationAndCleanup>(taxFunc);
3457  }
3458  #endif
3459 
3460  return make_unique<CTaxValidationAndCleanup>(m_pContext->m_taxon_update);
3461 }
3462 
3463 
3464 END_SCOPE(validator)
const char * sm_ValidModifiedPrimerBases[]
EErrType
@ eErr_SEQ_DESCR_BadPlastidName
@ eErr_SEQ_DESCR_ObsoleteSourceQual
@ eErr_SEQ_DESCR_MissingEnvironmentalSample
@ eErr_SEQ_DESCR_ObsoleteSourceLocation
@ eErr_SEQ_DESCR_MissingPlasmidLocation
@ eErr_SEQ_DESCR_InvalidTissueType
@ eErr_SEQ_DESCR_TaxonomyServiceProblem
@ eErr_SEQ_DESCR_TaxonomyBlankSample
@ eErr_SEQ_DESCR_MissingPersonalCollectionName
@ eErr_SEQ_DESCR_LatLonRange
@ eErr_SEQ_DESCR_DuplicatePCRPrimerSequence
@ eErr_GENERIC_SgmlPresentInText
@ eErr_SEQ_DESCR_UnstructuredVoucher
@ eErr_SEQ_DESCR_BadVariety
@ eErr_SEQ_DESCR_BadInstitutionGeoLocName
@ eErr_SEQ_DESCR_BadTypeMaterial
@ eErr_SEQ_DESCR_OrgModMissingValue
@ eErr_SEQ_DESCR_NoOrgFound
@ eErr_SEQ_DESCR_BadPCRPrimerSequence
@ eErr_SEQ_DESCR_UnnecessaryBioSourceFocus
@ eErr_SEQ_DESCR_InvalidForType
@ eErr_SEQ_DESCR_LatLonValue
@ eErr_SEQ_DESCR_OrganismIsUndefinedSpecies
@ eErr_SEQ_DESCR_IdenticalInstitutionCode
@ eErr_SEQ_DESCR_BacteriaMissingSourceQualifier
@ eErr_SEQ_DESCR_BadCountryCapitalization
@ eErr_SEQ_DESCR_BadCollectionDate
@ eErr_SEQ_DESCR_BadInstitutionCode
@ eErr_SEQ_DESCR_BadAltitude
@ eErr_SEQ_DESCR_IncorrectlyFormattedVoucherID
@ eErr_SEQ_DESCR_StrainWithEnvironSample
@ eErr_SEQ_DESCR_OrganismNotFound
@ eErr_SEQ_DESCR_InconsistentVirusMoltype
@ eErr_SEQ_DESCR_BadInstitutionCountry
@ eErr_SEQ_DESCR_MissingPlasmidName
@ eErr_SEQ_DESCR_UnculturedNeedsEnvSample
@ eErr_SEQ_DESCR_BadTentativeName
@ eErr_SEQ_DESCR_BadPlasmidChromosomeLinkageName
@ eErr_SEQ_DESCR_BadTextInSourceQualifier
@ eErr_SEQ_DESCR_SuspectedContaminatedCellLine
@ eErr_SEQ_DESCR_AmbiguousSpecificHost
@ eErr_SEQ_DESCR_BadGeoLocNameCapitalization
@ eErr_SEQ_DESCR_ChromosomeWithoutLocation
@ eErr_SEQ_DESCR_StructuredSourceNote
@ eErr_SEQ_DESCR_InvalidMatingType
@ eErr_SEQ_DESCR_BadSubSource
@ eErr_SEQ_DESCR_MultipleStrains
@ eErr_SEQ_DESCR_BadGeoLocNameCode
@ eErr_SEQ_DESCR_InvalidSexQualifier
@ eErr_SEQ_DESCR_TaxonomyAmbiguousName
@ eErr_SEQ_DESCR_MultipleSourceQualifiers
@ eErr_SEQ_DESCR_WrongVoucherType
@ eErr_SEQ_DESCR_BadCollectionCode
@ eErr_SEQ_DESCR_SyntheticConstructWrongMolType
@ eErr_SEQ_DESCR_TaxonomyLookupProblem
@ eErr_SEQ_DESCR_NoTaxonID
@ eErr_SEQ_DESCR_LatLonFormat
@ eErr_SEQ_DESCR_MissingLineage
@ eErr_SEQ_DESCR_BadOrgMod
@ eErr_SEQ_DESCR_BadSpecificHost
@ eErr_SEQ_DESCR_BadPCRPrimerName
@ eErr_SEQ_DESCR_OrgModValueInvalid
@ eErr_SEQ_DESCR_BadOrganelleLocation
@ eErr_SEQ_DESCR_EnvironSampleMissingQualifier
@ eErr_SEQ_DESCR_BadCountryCode
@ eErr_SEQ_DESCR_ChromosomeLocation
@ eErr_SEQ_DESCR_BioSourceNeedsChromosome
@ eErr_UNKNOWN
@ eErr_SEQ_DESCR_BioSourceInconsistency
@ eErr_SEQ_DESCR_HostIdenticalToOrganism
@ eErr_SEQ_DESCR_BadBioSourceFrequencyValue
@ eErr_SEQ_DESCR_ReplacedCountryCode
@ eErr_SEQ_DESCR_ReplacedGeoLocNameCode
@ eErr_SEQ_DESCR_UnbalancedParentheses
@ eErr_SEQ_DESCR_MissingMetagenomicQualifier
@ eErr_SEQ_DESCR_SyntheticConstructNeedsArtificial
@ eErr_SEQ_DESCR_NonViralSegment
CBinomialOrgName –.
bool IsOrganismArchaea() const
Definition: cache_impl.hpp:108
bool IsOrganismEukaryote() const
Definition: cache_impl.hpp:103
bool IsOrganismBacteria() const
Definition: cache_impl.hpp:98
CBioSourceKind & operator=(const CBioSource &bsrc)
const string & GetLineage(void) const
Definition: BioSource.cpp:360
static string GetOrganelleByGenome(unsigned int genome)
Definition: BioSource.cpp:216
bool IsSetLineage(void) const
Definition: BioSource.cpp:355
static CBioSource::EGenome GetGenomeByOrganelle(const string &organelle, NStr::ECase use_case=NStr::eCase, bool starts_with=false)
Definition: BioSource.cpp:168
bool IsSetTaxname(void) const
Definition: BioSource.cpp:335
CBioseq_Handle –.
static bool WasValid(const string &country)
Definition: SubSource.cpp:3377
static bool IsValid(const string &country)
Definition: SubSource.cpp:3304
CFeat_CI –.
Definition: feat_ci.hpp:64
static CNcbiApplication * Instance(void)
Singleton method.
Definition: ncbiapp.cpp:264
CNcbiEnvironment –.
Definition: ncbienv.hpp:110
@OrgMod.hpp User-defined methods of the data storage class.
Definition: OrgMod.hpp:54
static string IsCultureCollectionValid(const string &culture_collection)
Definition: OrgMod.cpp:424
static bool IsStrainValid(const string &strain)
Definition: OrgMod.cpp:835
static bool IsValidTypeMaterial(const string &type_material)
Definition: OrgMod.cpp:1201
static string IsBiomaterialValid(const string &biomaterial)
Definition: OrgMod.cpp:446
static string GetSubtypeName(TSubtype stype, EVocabulary vocabulary=eVocabulary_raw)
Definition: OrgMod.cpp:108
static string CheckMultipleVouchers(const vector< string > &)
Definition: OrgMod.cpp:736
static string IsSpecimenVoucherValid(const string &specimen_voucher)
Definition: OrgMod.cpp:435
C_Name –.
Definition: OrgName_.hpp:98
bool GetFlatName(string &name_out, string *lineage=0) const
Definition: OrgName.cpp:58
const string & GetLineage(void) const
Definition: Org_ref.cpp:124
const string & GetDivision(void) const
Definition: Org_ref.cpp:164
bool IsSetDivision(void) const
Definition: Org_ref.cpp:159
bool IsVarietyValid(const string &variety) const
Definition: Org_ref.cpp:220
bool IsSetOrgMod(void) const
Definition: Org_ref.cpp:169
bool IsSubspeciesValid(const string &subspecies) const
Definition: Org_ref.cpp:246
bool IsSetLineage(void) const
Definition: Org_ref.cpp:119
static bool IsValid(const string &seq, char &bad_ch)
CPCRPrimerSet –.
CPCRPrimer –.
Definition: PCRPrimer.hpp:66
CPCRReactionSet –.
CPCRReaction –.
Definition: PCRReaction.hpp:66
void AddFwdName(string name)
vector< CPCRSet * > m_SetList
Definition: validatorp.hpp:94
void AddRevName(string name)
void AddFwdSeq(string name)
void AddRevSeq(string name)
CPCRSet(size_t pos)
size_t GetOrigPos() const
Definition: validatorp.hpp:64
string GetRevName() const
Definition: validatorp.hpp:62
string GetFwdName() const
Definition: validatorp.hpp:60
string GetFwdSeq() const
Definition: validatorp.hpp:61
string GetRevSeq() const
Definition: validatorp.hpp:63
virtual ~CPCRSet()
CRef –.
Definition: ncbiobj.hpp:618
Definition: Seq_entry.hpp:56
const TAnnot & GetAnnot(void) const
Definition: Seq_entry.cpp:179
const CSeq_descr & GetDescr(void) const
Definition: Seq_entry.cpp:120
bool IsSetAnnot(void) const
Definition: Seq_entry.cpp:165
bool IsSetDescr(void) const
Definition: Seq_entry.cpp:106
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
Base class for all serializable objects.
Definition: serialbase.hpp:150
static void ExploreStrainsForTaxonInfo(CTaxValidationAndCleanup &tval, CValidError_imp &imp, const CSeq_entry &se, std::function< CRef< CTaxon3_reply >(const vector< CRef< COrg_ref >> &)> taxoncallback)
static string GetCollectionDateProblem(const string &date_string)
static bool NCBI_UseGeoLocNameForCountry(void)
Definition: SubSource.cpp:94
static bool IsPlasmidNameValid(const string &value, const string &taxname)
Definition: SubSource.cpp:2871
static bool IsValidSexQualifierValue(const string &value)
Definition: SubSource.cpp:2488
static bool IsMultipleValuesAllowed(TSubtype)
Definition: SubSource.cpp:208
static bool IsAltitudeValid(const string &value)
Definition: SubSource.cpp:2653
static string ValidateLatLonCountry(const string &countryname, string &lat_lon, bool check_state, ELatLonCountryErr &errcode)
Definition: SubSource.cpp:2101
static string CheckCellLine(const string &cell_line, const string &organism)
Definition: SubSource.cpp:2955
static string GetSubtypeName(CSubSource::TSubtype stype, EVocabulary vocabulary=eVocabulary_raw)
Definition: SubSource.cpp:185
static bool NeedsNoText(const TSubtype &subtype)
Definition: SubSource.cpp:233
static bool IsEndogenousVirusNameValid(const string &value)
Definition: SubSource.cpp:2753
static bool IsChromosomeNameValid(const string &value, const string &taxname)
Definition: SubSource.cpp:2846
static void IsCorrectLatLonFormat(string lat_lon, bool &format_correct, bool &precision_correct, bool &lat_in_range, bool &lon_in_range, double &lat_value, double &lon_value)
Definition: SubSource.cpp:1237
static bool IsSegmentValid(const string &value)
Definition: SubSource.cpp:2747
static bool IsLinkageGroupNameValid(const string &value, const string &taxname)
Definition: SubSource.cpp:2859
vector< CRef< COrg_ref > > GetTaxonomyLookupRequest() const
void ReportSpecificHostErrors(const CTaxon3_reply &reply, CValidError_imp &imp)
void ReportIncrementalTaxLookupErrors(const CTaxon3_reply &reply, CValidError_imp &imp, bool is_insd_patent, size_t offset) const
vector< CRef< COrg_ref > > GetStrainLookupRequest()
string IncrementalSpecificHostMapUpdate(const vector< CRef< COrg_ref > > &input, const CTaxon3_reply &reply)
CConstRef< CSeq_entry > GetTopReportObject() const
string IncrementalStrainMapUpdate(const vector< CRef< COrg_ref > > &input, const CTaxon3_reply &reply, TTaxId descTaxID=ZERO_TAX_ID)
void ReportStrainErrors(CValidError_imp &imp)
vector< CRef< COrg_ref > > GetSpecificHostLookupRequest(bool for_fix)
static bool IsWGS(const CBioseq &seq)
void ValidateTaxNameOrgname(const string &taxname, const COrgName &orgname, const CSerialObject &obj, const CSeq_entry *ctx)
bool IsSyntheticConstruct(const CBioSource &src)
void ValidateSubSource(const CSubSource &subsrc, const CSerialObject &obj, const CSeq_entry *ctx=nullptr, const bool isViral=false)
void HandleTaxonomyError(const CT3Error &error, const string &host, const COrg_ref &orf)
void ValidateLatLonCountry(string countryname, string lat_lon, const CSerialObject &obj, const CSeq_entry *ctx)
void GatherTentativeName(const CSeq_entry &se, vector< CConstRef< CSeqdesc > > &usr_descs, vector< CConstRef< CSeq_entry > > &desc_ctxs, vector< CConstRef< CSeq_feat > > &usr_feats)
void ValidateOrgName(const COrgName &orgname, const bool has_taxon, const CSerialObject &obj, const CSeq_entry *ctx)
void PostObjErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
void ValidateTaxonomy(const CSeq_entry &se)
void ValidateOrgRef(const COrg_ref &orgref, const CSerialObject &obj, const CSeq_entry *ctx, const bool checkForUndefinedSpecies=false, const bool is_single_cell_amplification=false)
static bool s_IsSalmonellaGenus(const string &taxname)
unique_ptr< CTaxValidationAndCleanup > x_CreateTaxValidator() const
void x_ReportPCRSeqProblem(const string &primer_kind, char badch, const CSerialObject &obj, const CSeq_entry *ctx)
void x_CheckPCRPrimer(const CPCRPrimer &primer, const string &primer_kind, const CSerialObject &obj, const CSeq_entry *ctx)
void ValidateSourceQualTags(const string &str, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
void ValidateBioSource(const CBioSource &bsrc, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
bool IsArtificial(const CBioSource &src)
void ValidatePCRReactionSet(const CPCRReactionSet &pcrset, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
void ValidateBioSourceForSeq(const CBioSource &bsrc, const CSerialObject &obj, const CSeq_entry *ctx, const CBioseq_Handle &bsh)
void ValidateOrgRefs(CTaxValidationAndCleanup &tval)
bool IsOtherDNA(const CBioseq_Handle &bsh) const
void ValidateSpecificHost(CTaxValidationAndCleanup &tval)
void ValidateStrain(CTaxValidationAndCleanup &tval, TTaxId descTaxID=ZERO_TAX_ID)
shared_ptr< SValidatorContext > m_pContext
void ValidateOrgModVoucher(const COrgMod &orgmod, const CSerialObject &obj, const CSeq_entry *ctx)
void ValidateTentativeName(const CSeq_entry &se)
bool IsTransgenic(const CBioSource &bsrc)
EDiagSev x_SalmonellaErrorLevel()
static EErrType ConvertCode(CSubSource::ELatLonCountryErr errcode)
Definition: validator.cpp:708
Definition: map.hpp:338
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
static const int chunk_size
Include a standard set of the NCBI C++ Toolkit most basic headers.
The NCBI C++ standard methods for dealing with std::string.
#define T(s)
Definition: common.h:230
CS_CONTEXT * ctx
Definition: t0006.c:12
static const char * str(char *buf, int n)
Definition: stats.c:84
static HENV env
Definition: transaction2.c:38
#define FOR_EACH_USERFIELD_ON_USEROBJECT(Itr, Var)
FOR_EACH_USERFIELD_ON_USEROBJECT EDIT_EACH_USERFIELD_ON_USEROBJECT.
const CNcbiEnvironment & GetEnvironment(void) const
Get the application's cached environment.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define NULL
Definition: ncbistd.hpp:225
EDiagSev
Severity level for the posted diagnostics.
Definition: ncbidiag.hpp:650
@ eDiag_Info
Informational message.
Definition: ncbidiag.hpp:651
@ eDiag_Error
Error message.
Definition: ncbidiag.hpp:653
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
@ eDiag_Fatal
Fatal error – guarantees exit(or abort)
Definition: ncbidiag.hpp:655
@ eDiag_Critical
Critical error message.
Definition: ncbidiag.hpp:654
const TPrim & Get(void) const
Definition: serialbase.hpp:347
virtual const CTypeInfo * GetThisTypeInfo(void) const =0
const CTextseq_id * GetTextseq_Id(void) const
Return embedded CTextseq_id, if any.
Definition: Seq_id.cpp:169
CConstRef< CBioseq > GetCompleteBioseq(void) const
Get the complete bioseq.
bool IsAa(void) const
bool IsSetInst(void) const
const TInst & GetInst(void) const
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define kEmptyStr
Definition: ncbistr.hpp:123
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2993
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5430
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3201
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
Definition: ncbistr.hpp:5325
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5384
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3405
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
@ eCase
Case sensitive compare.
Definition: ncbistr.hpp:1205
const Tdata & Get(void) const
Get the member data.
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
Definition: BioSource_.hpp:539
const TPcr_primers & GetPcr_primers(void) const
Get the Pcr_primers member data.
Definition: BioSource_.hpp:588
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
TOrigin GetOrigin(void) const
Get the Origin member data.
Definition: BioSource_.hpp:472
bool IsSetSeq(void) const
Check if a value has been assigned to Seq data member.
Definition: PCRPrimer_.hpp:199
const Tdata & Get(void) const
Get the member data.
bool CanGetSubtype(void) const
Check if it is safe to call GetSubtype method.
Definition: BioSource_.hpp:533
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
Definition: PCRPrimer_.hpp:234
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
bool CanGetOrg(void) const
Check if it is safe to call GetOrg method.
Definition: BioSource_.hpp:503
list< CRef< CSubSource > > TSubtype
Definition: BioSource_.hpp:145
bool IsSetPcr_primers(void) const
Check if a value has been assigned to Pcr_primers data member.
Definition: BioSource_.hpp:576
const TForward & GetForward(void) const
Get the Forward member data.
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
bool IsSetOrigin(void) const
Check if a value has been assigned to Origin data member.
Definition: BioSource_.hpp:447
TSubtype GetSubtype(void) const
Get the Subtype member data.
Definition: SubSource_.hpp:310
bool IsSetGenome(void) const
Check if a value has been assigned to Genome data member.
Definition: BioSource_.hpp:397
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: SubSource_.hpp:291
const TSeq & GetSeq(void) const
Get the Seq member data.
Definition: PCRPrimer_.hpp:211
bool IsSetReverse(void) const
Check if a value has been assigned to Reverse data member.
const TName & GetName(void) const
Get the Name member data.
Definition: SubSource_.hpp:350
const TReverse & GetReverse(void) const
Get the Reverse member data.
const TName & GetName(void) const
Get the Name member data.
Definition: PCRPrimer_.hpp:246
bool IsSetForward(void) const
Check if a value has been assigned to Forward data member.
EGenome
biological context
Definition: BioSource_.hpp:97
bool IsSet(void) const
Check if a value has been assigned to data member.
bool IsSet(void) const
Check if a value has been assigned to data member.
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
Definition: SubSource_.hpp:338
@ eSubtype_collection_date
DD-MMM-YYYY format.
Definition: SubSource_.hpp:114
@ eSubtype_fwd_primer_seq
sequence (possibly more than one; semicolon-separated)
Definition: SubSource_.hpp:117
@ eSubtype_lat_lon
+/- decimal degrees
Definition: SubSource_.hpp:113
@ eSubtype_collected_by
name of person who collected the sample
Definition: SubSource_.hpp:115
@ eSubtype_rev_primer_seq
sequence (possibly more than one; semicolon-separated)
Definition: SubSource_.hpp:118
@ eSubtype_environmental_sample
Definition: SubSource_.hpp:111
@ eSubtype_endogenous_virus_name
Definition: SubSource_.hpp:109
@ eSubtype_identified_by
name of person who identified the sample
Definition: SubSource_.hpp:116
@ eOrigin_synthetic
purely synthetic
Definition: BioSource_.hpp:134
@ eOrigin_artificial
artificially engineered
Definition: BioSource_.hpp:133
const TData & GetData(void) const
Get the Data member data.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TLabel & GetLabel(void) const
Get the Label member data.
const TType & GetType(void) const
Get the Type member data.
const TMod & GetMod(void) const
Get the Mod member data.
Definition: OrgName_.hpp:839
bool IsSetDb(void) const
ids in taxonomic or culture dbases Check if a value has been assigned to Db data member.
Definition: Org_ref_.hpp:479
const TLineage & GetLineage(void) const
Get the Lineage member data.
Definition: OrgName_.hpp:864
TSubtype GetSubtype(void) const
Get the Subtype member data.
Definition: OrgMod_.hpp:307
const TDiv & GetDiv(void) const
Get the Div member data.
Definition: OrgName_.hpp:1005
E_Choice Which(void) const
Which variant is currently selected.
Definition: OrgName_.hpp:686
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: OrgMod_.hpp:288
const TSubname & GetSubname(void) const
Get the Subname member data.
Definition: OrgMod_.hpp:347
const THybrid & GetHybrid(void) const
Get the variant data.
Definition: OrgName_.cpp:149
bool IsSetCommon(void) const
common name Check if a value has been assigned to Common data member.
Definition: Org_ref_.hpp:407
bool IsSetLineage(void) const
lineage with semicolon separators Check if a value has been assigned to Lineage data member.
Definition: OrgName_.hpp:852
const TName & GetName(void) const
Get the Name member data.
Definition: OrgName_.hpp:771
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
const TCommon & GetCommon(void) const
Get the Common member data.
Definition: Org_ref_.hpp:419
const TBinomial & GetBinomial(void) const
Get the variant data.
Definition: OrgName_.cpp:121
const TDb & GetDb(void) const
Get the Db member data.
Definition: Org_ref_.hpp:491
bool IsSetDiv(void) const
GenBank division code Check if a value has been assigned to Div data member.
Definition: OrgName_.hpp:993
void SetTaxname(const TTaxname &value)
Assign a value to Taxname data member.
Definition: Org_ref_.hpp:381
bool IsSetMod(void) const
Check if a value has been assigned to Mod data member.
Definition: OrgName_.hpp:827
const Tdata & Get(void) const
Get the member data.
bool IsSetGenus(void) const
required Check if a value has been assigned to Genus data member.
const TSpecies & GetSpecies(void) const
Get the Species member data.
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
bool IsSetSubname(void) const
Check if a value has been assigned to Subname data member.
Definition: OrgMod_.hpp:335
bool IsSetTaxname(void) const
preferred formal name Check if a value has been assigned to Taxname data member.
Definition: Org_ref_.hpp:360
const TGenus & GetGenus(void) const
Get the Genus member data.
const TPartial & GetPartial(void) const
Get the variant data.
Definition: OrgName_.cpp:193
bool IsSetSpecies(void) const
species required if subspecies used Check if a value has been assigned to Species data member.
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
Definition: OrgName_.hpp:759
const Tdata & Get(void) const
Get the member data.
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
@ eSubtype_gb_synonym
used by taxonomy database
Definition: OrgMod_.hpp:117
@ eSubtype_other
ASN5: old-name (254) will be added to next spec.
Definition: OrgMod_.hpp:125
@ eSubtype_sub_species
Definition: OrgMod_.hpp:105
@ eSubtype_nat_host
natural host of this specimen
Definition: OrgMod_.hpp:104
@ eSubtype_cultivar
Definition: OrgMod_.hpp:93
@ eSubtype_variety
Definition: OrgMod_.hpp:89
@ eSubtype_strain
Definition: OrgMod_.hpp:85
@ eSubtype_metagenome_source
Definition: OrgMod_.hpp:120
@ eSubtype_old_name
Definition: OrgMod_.hpp:124
@ eSubtype_synonym
Definition: OrgMod_.hpp:111
@ eSubtype_type_material
Definition: OrgMod_.hpp:121
@ eSubtype_specimen_voucher
Definition: OrgMod_.hpp:106
@ eSubtype_serotype
Definition: OrgMod_.hpp:90
@ eSubtype_serovar
Definition: OrgMod_.hpp:92
@ eSubtype_bio_material
Definition: OrgMod_.hpp:119
@ eSubtype_culture_collection
Definition: OrgMod_.hpp:118
@ eSubtype_forma_specialis
Definition: OrgMod_.hpp:109
@ eSubtype_isolate
Definition: OrgMod_.hpp:100
@ e_Hybrid
hybrid between organisms
Definition: OrgName_.hpp:115
@ e_Binomial
genus/species type name
Definition: OrgName_.hpp:113
@ e_Partial
when genus not known
Definition: OrgName_.hpp:117
bool IsSetData(void) const
the specific data Check if a value has been assigned to Data data member.
Definition: Seq_feat_.hpp:913
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
const TBiosrc & GetBiosrc(void) const
Get the variant data.
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
const TAccession & GetAccession(void) const
Get the Accession member data.
@ e_Tpe
Third Party Annot/Seq EMBL.
Definition: Seq_id_.hpp:111
@ e_Tpd
Third Party Annot/Seq DDBJ.
Definition: Seq_id_.hpp:112
@ e_General
for other databases
Definition: Seq_id_.hpp:105
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
@ e_Local
local use
Definition: Seq_id_.hpp:95
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
@ eClass_parts
parts for 2 or 3
@ eClass_nuc_prot
nuc acid and coded proteins
Definition: Bioseq_set_.hpp:99
bool IsSetCompleteness(void) const
Check if a value has been assigned to Completeness data member.
Definition: MolInfo_.hpp:569
bool CanGetBiomol(void) const
Check if it is safe to call GetBiomol method.
Definition: MolInfo_.hpp:428
const TUser & GetUser(void) const
Get the variant data.
Definition: Seqdesc_.cpp:384
bool IsMolinfo(void) const
Check if variant Molinfo is selected.
Definition: Seqdesc_.hpp:1196
bool IsSetMol(void) const
Check if a value has been assigned to Mol data member.
Definition: Seq_inst_.hpp:593
const TTitle & GetTitle(void) const
Get the variant data.
Definition: Seqdesc_.hpp:1032
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
bool IsSource(void) const
Check if variant Source is selected.
Definition: Seqdesc_.hpp:1190
bool IsSetBiomol(void) const
Check if a value has been assigned to Biomol data member.
Definition: MolInfo_.hpp:422
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
TMol GetMol(void) const
Get the Mol member data.
Definition: Seq_inst_.hpp:612
bool IsSetDescr(void) const
descriptors Check if a value has been assigned to Descr data member.
Definition: Bioseq_.hpp:303
TBiomol GetBiomol(void) const
Get the Biomol member data.
Definition: MolInfo_.hpp:447
TCompleteness GetCompleteness(void) const
Get the Completeness member data.
Definition: MolInfo_.hpp:594
bool IsSetId(void) const
equivalent identifiers Check if a value has been assigned to Id data member.
Definition: Bioseq_.hpp:278
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
const TMolinfo & GetMolinfo(void) const
Get the variant data.
Definition: Seqdesc_.cpp:588
@ eRepr_seg
segmented sequence
Definition: Seq_inst_.hpp:95
@ eCompleteness_complete
complete biological entity
Definition: MolInfo_.hpp:156
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eBiomol_cRNA
viral RNA genome copy intermediate
Definition: MolInfo_.hpp:111
@ eBiomol_other_genetic
other genetic material
Definition: MolInfo_.hpp:109
@ e_User
user defined object
Definition: Seqdesc_.hpp:124
@ e_Molinfo
info on the molecule and techniques
Definition: Seqdesc_.hpp:134
@ e_Title
a title for this sequence
Definition: Seqdesc_.hpp:115
int i
int len
static char * subname
Definition: mdb_load.c:26
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
const CharType(& source)[N]
Definition: pointer.h:1149
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int tolower(Uchar c)
Definition: ncbictype.hpp:72
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
int toupper(Uchar c)
Definition: ncbictype.hpp:73
T min(T x_, T y_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
The Object manager core.
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
Definition: pcre_exec.c:513
#define FOR_EACH_SEQID_ON_BIOSEQ(Itr, Var)
FOR_EACH_SEQID_ON_BIOSEQ EDIT_EACH_SEQID_ON_BIOSEQ.
Definition: seq_macros.hpp:308
#define FOR_EACH_DBXREF_ON_ORGREF(Itr, Var)
FOR_EACH_DBXREF_ON_ORGREF EDIT_EACH_DBXREF_ON_ORGREF.
#define FOR_EACH_SUBSOURCE_ON_BIOSOURCE(Itr, Var)
FOR_EACH_SUBSOURCE_ON_BIOSOURCE EDIT_EACH_SUBSOURCE_ON_BIOSOURCE.
#define FOR_EACH_ORGMOD_ON_ORGNAME(Itr, Var)
FOR_EACH_ORGMOD_ON_ORGNAME EDIT_EACH_ORGMOD_ON_ORGNAME.
#define FOR_EACH_SEQENTRY_ON_SEQSET(Itr, Var)
FOR_EACH_SEQENTRY_ON_SEQSET EDIT_EACH_SEQENTRY_ON_SEQSET.
#define FIELD_IS_SET_AND_IS(Var, Fld, Chs)
FIELD_IS_SET_AND_IS base macro.
#define GET_FIELD(Var, Fld)
GET_FIELD base macro.
bool seq_mac_is_unique(Iterator iter1, Iterator iter2, Predicate pred)
bool ContainsSgml(const string &str)
CRef< objects::CObjectManager > om
String search utilities.
bool operator()(T l, T r) const
Definition: type.c:6
#define _ASSERT
#define Type
int g(Seg_Gsm *spe, Seq_Mtf *psm, Thd_Gsm *tdg)
Definition: thrddgri.c:44
static string x_RepairCountryName(string countryname)
static bool s_PCRReactionLess(CConstRef< CPCRReaction > pp1, CConstRef< CPCRReaction > pp2)
static const int sNumUnexpectedViralOrgModQualifiers
static const int sNumUnexpectedViralSubSourceQualifiers
static bool s_MatchOrgname(const string &taxname, const COrgName &orgname, string &mismatch)
std::string_view sm_SourceQualPrefixes[]
bool s_IsAllDigitsOrSpaces(string str)
static bool s_PCRPrimerSetLess(const CPCRPrimerSet &s1, const CPCRPrimerSet &s2)
static unique_ptr< CTextFsa > m_SourceQualTags
static bool NCBI_NewTaxVal(void)
static bool IsUnexpectedViralOrgModQualifier(COrgMod::TSubtype subtype)
bool s_IsBioSample(const CBioseq_Handle &bsh)
static const CSubSource::ESubtype sUnexpectedViralSubSourceQualifiers[]
static bool s_HasMetagenomeSource(const COrg_ref &org)
static const COrgMod::TSubtype sUnexpectedViralOrgModQualifiers[]
static bool s_PCRSetEqual(const CPCRSet *p1, const CPCRSet *p2)
static bool s_UnbalancedParentheses(string str)
static bool s_IsChromosome(const CBioSource &biosource)
static bool x_HasTentativeName(const CUser_object &user_object)
const size_t kDefaultChunkSize
static const string kInvalidReplyMsg
static string x_GetTentativeName(const CUser_object &user_object)
bool s_IsArchaea(const CBioSource &source)
static bool s_HasWGSTech(const CBioseq &bioseq)
static bool s_PCRSetCompare(const CPCRSet *p1, const CPCRSet *p2)
static bool s_FindWholeName(const string &taxname, const string &value)
static bool s_IsEukaryoteOrProkaryote(const CBioSourceKind &biosourceKind)
static bool s_ReportUndefinedSpeciesId(const CBioseq &bioseq)
static bool s_CompleteGenomeNeedsChromosome(const CBioSource &source)
static bool s_PCRPrimerLess(const CPCRPrimer &p1, const CPCRPrimer &p2)
bool IsOrgNotFound(const CT3Error &error)
static const CBioseq * s_GetNucSeqFromContext(const CSeq_entry *ctx)
bool s_IsBacteria(const CBioSource &source)
static bool s_IsUndefinedSpecies(const string &taxname)
static bool IsUnexpectedViralSubSourceQualifier(CSubSource::TSubtype subtype)
static bool s_init_NewTaxVal(void)
Modified on Thu Apr 25 08:18:07 2024 by modify_doxy.py rev. 669887