NCBI C++ ToolKit
biosource_tests.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: biosource_tests.cpp 99033 2023-02-06 18:11:31Z foleyjp $
2  * =========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * =========================================================================
25  *
26  * Authors: Colleen Bollin, based on similar discrepancy tests
27  *
28  */
29 
30 #include <ncbi_pch.hpp>
31 #include <sstream>
37 #include <objmgr/seqdesc_ci.hpp>
38 #include <objmgr/seq_vector.hpp>
41 #include <util/xregexp/regexp.hpp>
42 
43 #include "discrepancy_core.hpp"
44 
48 
49 #if 0
// Applies `call` to the biosource behind every autofixable report object in
// `list` and returns how many objects `call` reported as changed.
//
// Each object is probed both as a CSeq_feat and as a CSeqdesc:
//  - for a feature whose data is a biosource, a copy of the feature is
//    modified and handed to `feh.Replace()`;
//  - for a source descriptor, the descriptor is modified in place.
// Objects for which `call` returns true are marked fixed.
//
// NOTE(review): this helper sits inside an `#if 0` region, and the
// declaration of `feh` (listing line 62) is not visible here — confirm
// before re-enabling.
50 static unsigned int AutofixBiosrc(TReportObjectList& list, CScope& scope, bool (*call)(CBioSource& src))
51 {
52  unsigned int n = 0;
53  for (auto& it : list) {
54  if (it->CanAutofix()) {
55  const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(dynamic_cast<CDiscrepancyObject*>(it.GetNCPointer())->GetObject().GetPointer());
56  const CSeqdesc* csd = dynamic_cast<const CSeqdesc*>(dynamic_cast<CDiscrepancyObject*>(it.GetNCPointer())->GetObject().GetPointer());
57  if (sf) {
58  if (sf->IsSetData() && sf->GetData().IsBiosrc()) {
// Work on a copy so the original feature is only replaced when `call` changed something.
59  CRef<CSeq_feat> new_feat(new CSeq_feat());
60  new_feat->Assign(*sf);
61  if (call(new_feat->SetData().SetBiosrc())) {
63  feh.Replace(*new_feat);
64  n++;
65  dynamic_cast<CDiscrepancyObject*>(it.GetNCPointer())->SetFixed();
66  }
67  }
68  }
69  else if (csd) {
70  if (csd->IsSource()) {
// Descriptors are edited in place, hence the const_cast.
71  CSeqdesc* sd = const_cast<CSeqdesc*>(csd);
72  if (call(sd->SetSource())) {
73  n++;
74  dynamic_cast<CDiscrepancyObject*>(it.GetNCPointer())->SetFixed();
75  }
76  }
77  }
78  }
79  }
80  return n;
81 }
82 #endif
83 
84 // MAP_CHROMOSOME_CONFLICT
85 
86 DISCREPANCY_CASE(MAP_CHROMOSOME_CONFLICT, BIOSRC, eDisc | eOncaller | eSmart | eFatal, "Eukaryotic sequences with a map source qualifier should also have a chromosome source qualifier")
87 {
88  const CSeqdesc* biosrc = context.GetBiosource();
89  const CBioSource& src = biosrc->GetSource();
90  if (src.IsSetSubtype() && context.IsEukaryotic(&src)) {
91  bool has_map = false;
92  bool has_chromosome = false;
93  for (const auto& it : src.GetSubtype()) {
94  if (it->IsSetSubtype()) {
95  if (it->GetSubtype() == CSubSource::eSubtype_map) {
96  has_map = true;
97  }
98  else if (it->GetSubtype() == CSubSource::eSubtype_chromosome) {
99  has_chromosome = true;
100  break;
101  }
102  }
103  }
104  if (has_map && !has_chromosome) {
105  m_Objs["[n] source[s] on eukaryotic sequence[s] [has] map but not chromosome"].Add(*context.SeqdescObjRef(*biosrc)).Fatal();
106  }
107  }
108 }
109 
110 
111 // INFLUENZA_DATE_MISMATCH
112 
113 DISCREPANCY_CASE(INFLUENZA_DATE_MISMATCH, BIOSRC, eOncaller, "Influenza Strain/Collection Date Mismatch")
114 {
115  for (const CBioSource* biosrc : context.GetBiosources()) {
116  if (biosrc->IsSetOrg() && biosrc->IsSetSubtype() && biosrc->GetOrg().IsSetOrgname() && biosrc->GetOrg().GetOrgname().IsSetMod() && biosrc->GetOrg().IsSetTaxname() && NStr::StartsWith(biosrc->GetOrg().GetTaxname(), "Influenza ")) {
117  int strain_year = 0;
118  int collection_year = 0;
119  for (const auto& it : biosrc->GetOrg().GetOrgname().GetMod()) {
120  if (it->IsSetSubtype() && it->GetSubtype() == COrgMod::eSubtype_strain) {
121  string s = it->GetSubname();
122  size_t pos = s.rfind('/');
123  if (pos == string::npos) {
124  return;
125  }
126  ++pos;
127  while (isspace(s.c_str()[pos])) {
128  ++pos;
129  }
130  size_t len = 0;
131  while (isdigit(s.c_str()[pos + len])) {
132  len++;
133  }
134  if (!len) {
135  return;
136  }
137  strain_year = NStr::StringToInt(s.substr(pos, len));
138  break;
139  }
140  }
141  for (const auto& it : biosrc->GetSubtype()) {
142  if (it->IsSetSubtype() && it->GetSubtype() == CSubSource::eSubtype_collection_date) {
143  try {
144  CRef<CDate> date = CSubSource::DateFromCollectionDate(it->GetName());
145  if (date && date->IsStd() && date->GetStd().IsSetYear()) {
146  collection_year = date->GetStd().GetYear();
147  }
148  }
149  catch (const CException& /* ignore */) {
150  // CSubSource can throw all sorts of exceptions, but we're not interested in the specifics here.
151  }
152  break;
153  }
154  }
155  if (strain_year != collection_year) {
156  m_Objs["[n] influenza strain[s] conflict with collection date"].Add(*context.BiosourceObjRef(*biosrc));
157  }
158  }
159  }
160 }
161 
162 
163 // INFLUENZA_QUALS
164 
// For every biosource whose taxname starts with "Influenza ", records which
// of four qualifiers were seen (country and collection date among the
// subsource qualifiers; strain and host among the organism modifiers) and
// adds one report entry per qualifier that was not found.
//
// NOTE(review): the `case` labels of both switch statements are not visible
// in this listing, so which subtype sets which flag cannot be confirmed
// from here. The description string also mentions isolation_source, but no
// flag for it appears in the visible code — confirm whether that is intended.
165 DISCREPANCY_CASE(INFLUENZA_QUALS, BIOSRC, eOncaller, "Influenza must have strain, host, isolation_source, country, collection_date")
166 {
167  for (const CBioSource* biosrc : context.GetBiosources()) {
168  if (biosrc->IsSetOrg() && biosrc->GetOrg().IsSetTaxname() && NStr::StartsWith(biosrc->GetOrg().GetTaxname(), "Influenza ")) {
169  bool found_strain = false;
170  bool found_host = false;
171  bool found_country = false;
172  bool found_collection_date = false;
// Pass 1: subsource qualifiers (country, collection date).
173  if (biosrc->IsSetSubtype()) {
174  for (const auto& it : biosrc->GetSubtype()) {
175  if (it->IsSetSubtype()) {
176  switch (it->GetSubtype()) {
178  found_country = true;
179  break;
181  found_collection_date = true;
182  break;
183  }
184  }
185  }
186  }
// Pass 2: organism modifiers (strain, host).
187  if (biosrc->GetOrg().IsSetOrgname() && biosrc->GetOrg().GetOrgname().IsSetMod()) {
188  for (const auto& it : biosrc->GetOrg().GetOrgname().GetMod()) {
189  if (it->IsSetSubtype()) {
190  switch (it->GetSubtype()) {
192  found_strain = true;
193  break;
195  found_host = true;
196  break;
197  }
198  }
199  }
200  }
// One report entry per missing qualifier; a biosource can appear under several.
201  if (!found_strain) {
202  m_Objs["[n] Influenza biosource[s] [does] not have strain"].Add(*context.BiosourceObjRef(*biosrc));
203  }
204  if (!found_host) {
205  m_Objs["[n] Influenza biosource[s] [does] not have host"].Add(*context.BiosourceObjRef(*biosrc));
206  }
207  if (!found_country) {
208  m_Objs["[n] Influenza biosource[s] [does] not have country"].Add(*context.BiosourceObjRef(*biosrc));
209  }
210  if (!found_collection_date) {
211  m_Objs["[n] Influenza biosource[s] [does] not have collection-date"].Add(*context.BiosourceObjRef(*biosrc));
212  }
213  }
214  }
215 }
216 
217 
218 // INFLUENZA_SEROTYPE
219 
220 DISCREPANCY_CASE(INFLUENZA_SEROTYPE, BIOSRC, eOncaller, "Influenza A virus must have serotype")
221 {
222  for (const CBioSource* biosrc : context.GetBiosources()) {
223  if (biosrc->IsSetOrg() && biosrc->GetOrg().IsSetTaxname() && NStr::StartsWith(biosrc->GetOrg().GetTaxname(), "Influenza A virus ") && biosrc->GetOrg().IsSetOrgname() && biosrc->GetOrg().GetOrgname().IsSetMod()) {
224  bool found = false;
225  for (const auto& it : biosrc->GetOrg().GetOrgname().GetMod()) {
226  if (it->IsSetSubtype() && it->GetSubtype() == COrgMod::eSubtype_serotype) {
227  found = true;
228  break;
229  }
230  }
231  if (!found) {
232  m_Objs["[n] Influenza A virus biosource[s] [does] not have serotype"].Add(*context.BiosourceObjRef(*biosrc));
233  }
234  }
235  }
236 }
237 
238 
239 // INFLUENZA_SEROTYPE_FORMAT
240 
241 DISCREPANCY_CASE(INFLUENZA_SEROTYPE_FORMAT, BIOSRC, eOncaller, "Influenza A virus serotype must match /^H[1-9]\\d*$|^N[1-9]\\d*$|^H[1-9]\\d*N[1-9]\\d*$|^mixed$/")
242 {
243  for (const CBioSource* biosrc : context.GetBiosources()) {
244  if (biosrc->IsSetOrg() && biosrc->GetOrg().IsSetTaxname() && NStr::StartsWith(biosrc->GetOrg().GetTaxname(), "Influenza A virus ")) {
245  static CRegexp rx("^H[1-9]\\d*$|^N[1-9]\\d*$|^H[1-9]\\d*N[1-9]\\d*$|^mixed$");
246  if (biosrc->GetOrg().IsSetOrgname() && biosrc->GetOrg().GetOrgname().IsSetMod()) {
247  for (const auto& it : biosrc->GetOrg().GetOrgname().GetMod()) {
248  if (it->IsSetSubtype() && it->GetSubtype() == COrgMod::eSubtype_serotype && !rx.IsMatch(it->GetSubname())) {
249  m_Objs["[n] Influenza A virus serotype[s] [has] incorrect format"].Add(*context.BiosourceObjRef(*biosrc));
250  }
251  }
252  }
253  }
254  }
255 }
256 
257 
258 // UNCULTURED_NOTES
259 
260 DISCREPANCY_CASE(UNCULTURED_NOTES, BIOSRC, eOncaller | eFatal, "Uncultured Notes")
261 {
262  for (const CBioSource* biosrc : context.GetBiosources()) {
263  if (biosrc->IsSetSubtype()) {
264  for (const auto& it : biosrc->GetSubtype()) {
265  if (it->IsSetSubtype() && it->GetSubtype() == CSubSource::eSubtype_other && it->IsSetName() && CSubSource::HasCultureNotes(it->GetName())) {
266  m_Objs["[n] bio-source[s] [has] uncultured note[s]"].Add(*context.BiosourceObjRef(*biosrc)).Fatal();
267  break;
268  }
269  }
270  }
271  }
272 }
273 
274 
275 // MISSING_VIRAL_QUALS
276 
277 const string kMissingViralQualsTop = "[n] virus organism[s] [is] missing required qualifiers";
278 
279 DISCREPANCY_CASE(MISSING_VIRAL_QUALS, BIOSRC, eOncaller, "Viruses should specify collection-date, country, and specific-host")
280 {
281  const CSeqdesc* src = context.GetBiosource();
282  if (context.HasLineage(src ? &src->GetSource() : nullptr, "Viruses")) {
283  for (const CBioSource* biosrc : context.GetBiosources()) {
284  bool has_collection_date = false;
285  bool has_country = false;
286  bool has_specific_host = false;
287  if (biosrc->IsSetSubtype()) {
288  for (const auto& it : biosrc->GetSubtype()) {
289  if (it->IsSetSubtype()) {
290  if (it->GetSubtype() == CSubSource::eSubtype_collection_date) {
291  has_collection_date = true;
292  }
293  else if (it->GetSubtype() == CSubSource::eSubtype_country) {
294  has_country = true;
295  }
296  if (has_collection_date && has_country) {
297  break;
298  }
299  }
300  }
301  }
302  if (biosrc->IsSetOrg() && biosrc->GetOrg().IsSetOrgname() && biosrc->GetOrg().GetOrgname().IsSetMod()) {
303  for (const auto& it : biosrc->GetOrg().GetOrgname().GetMod()) {
304  if (it->IsSetSubtype() && it->GetSubtype() == COrgMod::eSubtype_nat_host) {
305  has_specific_host = true;
306  }
307  }
308  }
309  if (!has_collection_date || !has_country || !has_specific_host) {
310  if (!has_collection_date) {
311  m_Objs[kMissingViralQualsTop]["[n] virus organism[s] [is] missing suggested qualifier collection date"].Ext().Add(*context.BiosourceObjRef(*biosrc));
312  }
313  if (!has_country) {
314  m_Objs[kMissingViralQualsTop]["[n] virus organism[s] [is] missing suggested qualifier country"].Ext().Add(*context.BiosourceObjRef(*biosrc));
315  }
316  if (!has_specific_host) {
317  m_Objs[kMissingViralQualsTop]["[n] virus organism[s] [is] missing suggested qualifier specific-host"].Ext().Add(*context.BiosourceObjRef(*biosrc));
318  }
319  }
320  }
321  }
322 }
323 
324 
325 // ATCC_CULTURE_CONFLICT
326 
// Tells whether `mods` contains a culture_collection modifier of the form
// "ATCC:<value>" whose <value> (cut at the first ';', if any) equals `strain`.
//
// @param mods    organism modifiers to search
// @param strain  strain value to look for; callers in this file pass the
//                text that follows the "ATCC " prefix
// @return true when a matching culture collection exists, and also when
//         `strain` is blank (nothing to match against)
//
// NOTE(review): listing line 336 is not visible here; `cmp` may be
// normalised further between the substr() and the ';' cut — confirm.
327 bool HasCultureCollectionForATCCStrain(const COrgName::TMod& mods, const string& strain)
328 {
329  if (NStr::IsBlank(strain)) {
330  return true;
331  }
332  bool found = false;
333  for (const auto& m : mods) {
334  if (m->IsSetSubtype() && m->GetSubtype() == COrgMod::eSubtype_culture_collection && m->IsSetSubname() && NStr::StartsWith(m->GetSubname(), "ATCC:")) {
// Drop the 5-character "ATCC:" prefix.
335  string cmp = m->GetSubname().substr(5);
337  size_t pos = NStr::Find(cmp, ";");
338  if (pos != string::npos) {
339  cmp = cmp.substr(0, pos);
340  }
341  if (NStr::Equal(cmp, strain)) {
342  found = true;
343  break;
344  }
345  }
346  }
347  return found;
348 }
349 
350 
// Tells whether `mods` contains a strain modifier of the form "ATCC <value>"
// whose <value> (cut at the first ';', if any) equals `culture_collection`.
//
// @param mods                organism modifiers to search
// @param culture_collection  value to look for; callers in this file pass
//                            the text that follows the "ATCC:" prefix
// @return true when a matching strain exists, and also when
//         `culture_collection` is blank (nothing to match against)
//
// NOTE(review): listing line 360 is not visible here; `cmp` may be
// normalised further between the substr() and the ';' cut — confirm.
351 bool HasStrainForATCCCultureCollection(const COrgName::TMod& mods, const string& culture_collection)
352 {
353  if (NStr::IsBlank(culture_collection)) {
354  return true;
355  }
356  bool found = false;
357  for (const auto& m : mods) {
358  if (m->IsSetSubtype() && m->GetSubtype() == COrgMod::eSubtype_strain && m->IsSetSubname() && NStr::StartsWith(m->GetSubname(), "ATCC ")) {
// Drop the 5-character "ATCC " prefix.
359  string cmp = m->GetSubname().substr(5);
361  size_t pos = NStr::Find(cmp, ";");
362  if (pos != string::npos) {
363  cmp = cmp.substr(0, pos);
364  }
365  if (NStr::Equal(cmp, culture_collection)) {
366  found = true;
367  break;
368  }
369  }
370  }
371  return found;
372 }
373 
374 
375 DISCREPANCY_CASE(ATCC_CULTURE_CONFLICT, BIOSRC, eDisc | eOncaller, "ATCC strain should also appear in culture collection")
376 {
377  for (const CBioSource* biosrc : context.GetBiosources()) {
378  if (biosrc->IsSetOrg() && biosrc->GetOrg().IsSetOrgname() && biosrc->GetOrg().GetOrgname().IsSetMod()) {
379  bool report = false;
380  for (const auto& m : biosrc->GetOrg().GetOrgname().GetMod()) {
381  if (m->IsSetSubtype() && m->IsSetSubname()) {
382  if (m->GetSubtype() == COrgMod::eSubtype_strain && NStr::StartsWith(m->GetSubname(), "ATCC ") && !HasCultureCollectionForATCCStrain(biosrc->GetOrg().GetOrgname().GetMod(), m->GetSubname().substr(5))) {
383  report = true;
384  break;
385  }
386  else if (m->GetSubtype() == COrgMod::eSubtype_culture_collection && NStr::StartsWith(m->GetSubname(), "ATCC:") && !HasStrainForATCCCultureCollection(biosrc->GetOrg().GetOrgname().GetMod(), m->GetSubname().substr(5))) {
387  report = true;
388  break;
389  }
390  }
391  }
392  if (report) {
393  m_Objs["[n] biosource[s] [has] conflicting ATCC strain and culture collection values"].Add(*context.BiosourceObjRef(*biosrc, true));
394  }
395  }
396  }
397 }
398 
399 
// Body of the helper that the ATCC_CULTURE_CONFLICT autofix calls as
// SetCultureCollectionFromStrain(). For every "ATCC " strain modifier of
// `src` that has no matching culture collection it queues a new value, then
// appends one modifier per queued value. Returns true when at least one
// modifier was appended, false otherwise.
//
// NOTE(review): the signature line (listing line 400) and the line that
// declares `m` inside the final loop (listing line 416) are not visible
// here — confirm against the full file.
401 {
402  if (!src.IsSetOrg() || !src.GetOrg().IsSetOrgMod() || !src.GetOrg().GetOrgname().IsSetMod()) {
403  return false;
404  }
// New values are collected first and appended afterwards, so the modifier
// list is not grown while it is being iterated.
405  vector<string> add;
406  for (const auto& m : src.GetOrg().GetOrgname().GetMod()) {
407  if (m->IsSetSubtype() && m->IsSetSubname()) {
408  if (m->GetSubtype() == COrgMod::eSubtype_strain && NStr::StartsWith(m->GetSubname(), "ATCC ") &&
409  !HasCultureCollectionForATCCStrain(src.GetOrg().GetOrgname().GetMod(), m->GetSubname().substr(5))) {
// NOTE(review): the full strain text (including its own "ATCC " prefix) is
// appended after "ATCC:", giving e.g. "ATCC:ATCC 1234", whereas the lookup
// above compares only the part after the prefix — verify this is intended.
410  add.push_back("ATCC:" + m->GetSubname());
411  }
412  }
413  }
414  if (!add.empty()) {
415  for (const string& s : add) {
417  src.SetOrg().SetOrgname().SetMod().push_back(m);
418  }
419  return true;
420  }
421  return false;
422 }
423 
424 
425 DISCREPANCY_AUTOFIX(ATCC_CULTURE_CONFLICT)
426 {
427  const CSeq_feat* feat = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
428  const CSeqdesc* desc = dynamic_cast<const CSeqdesc*>(context.FindObject(*obj));
429  if (feat) {
430  if (SetCultureCollectionFromStrain(const_cast<CSeq_feat*>(feat)->SetData().SetBiosrc())) {
431  obj->SetFixed();
432  return CRef<CAutofixReport>(new CAutofixReport("ATCC_CULTURE_CONFLICT: Set culture collection for [n] source[s]", 1));
433  }
434  }
435  if (desc) {
436  if (SetCultureCollectionFromStrain(const_cast<CSeqdesc*>(desc)->SetSource())) {
437  obj->SetFixed();
438  return CRef<CAutofixReport>(new CAutofixReport("ATCC_CULTURE_CONFLICT: Set culture collection for [n] source[s]", 1));
439  }
440  }
441  return CRef<CAutofixReport>();
442 }
443 
444 
445 // BACTERIA_SHOULD_NOT_HAVE_ISOLATE
446 
447 const string kAmplifiedWithSpeciesSpecificPrimers = "amplified with species-specific primers";
448 
449 DISCREPANCY_CASE(BACTERIA_SHOULD_NOT_HAVE_ISOLATE, BIOSRC, eDisc | eOncaller | eSmart, "Bacterial sources should not have isolate")
450 {
451  const CSeqdesc* src = context.GetBiosource();
452  if (context.HasLineage(src ? &src->GetSource() : nullptr, "Bacteria") || context.HasLineage(src ? &src->GetSource() : nullptr, "Archaea")) {
453  for (const CBioSource* biosrc : context.GetBiosources()) {
454  bool has_bad_isolate = false;
455  bool is_metagenomic = false;
456  bool is_env_sample = false;
457  if (biosrc->IsSetSubtype()) {
458  for (const auto& s : biosrc->GetSubtype()) {
459  if (s->IsSetSubtype()) {
460  if (s->GetSubtype() == CSubSource::eSubtype_environmental_sample) {
461  is_env_sample = true;
462  if (is_metagenomic && is_env_sample) {
463  return;
464  }
465  }
466  if (s->GetSubtype() == CSubSource::eSubtype_metagenomic) {
467  is_metagenomic = true;
468  if (is_metagenomic && is_env_sample) {
469  return;
470  }
471  }
472  if (s->GetSubtype() == CSubSource::eSubtype_other && s->IsSetName() && NStr::Equal(s->GetName(), kAmplifiedWithSpeciesSpecificPrimers)) {
473  return;
474  }
475  }
476  }
477  }
478  if (biosrc->IsSetOrg() && biosrc->GetOrg().IsSetOrgname() && biosrc->GetOrg().GetOrgname().IsSetMod()) {
479  for (const auto& m : biosrc->GetOrg().GetOrgname().GetMod()) {
480  if (m->IsSetSubtype()) {
481  if (m->GetSubtype() == CSubSource::eSubtype_other && m->IsSetSubname() && NStr::Equal(m->GetSubname(), kAmplifiedWithSpeciesSpecificPrimers)) {
482  return;
483  }
484  if (m->GetSubtype() == COrgMod::eSubtype_isolate && m->IsSetSubname() &&
485  !NStr::StartsWith(m->GetSubname(), "DGGE gel band") &&
486  !NStr::StartsWith(m->GetSubname(), "TGGE gel band") &&
487  !NStr::StartsWith(m->GetSubname(), "SSCP gel band")) {
488  has_bad_isolate = true;
489  }
490  }
491  }
492  }
493  if (has_bad_isolate) {
494  m_Objs["[n] bacterial biosource[s] [has] isolate"].Add(*context.BiosourceObjRef(*biosrc));
495  }
496  }
497  }
498 }
499 
500 
501 // MAG_SHOULD_NOT_HAVE_STRAIN
502 
503 DISCREPANCY_CASE(MAG_SHOULD_NOT_HAVE_STRAIN, BIOSRC, eDisc | eSmart, "Organism assembled from metagenome reads should not have strain")
504 {
505  const CSeqdesc* src = context.GetBiosource();
506  if (context.HasLineage(src ? &src->GetSource() : nullptr, "Bacteria") || context.HasLineage(src ? &src->GetSource() : nullptr, "Archaea")) {
507  for (const CBioSource* biosrc : context.GetBiosources()) {
508  bool is_metagenomic = false;
509  bool is_env_sample = false;
510  if (biosrc->IsSetSubtype()) {
511  for (const auto& s : biosrc->GetSubtype()) {
512  if (s->IsSetSubtype()) {
513  if (s->GetSubtype() == CSubSource::eSubtype_environmental_sample) {
514  is_env_sample = true;
515  }
516  if (s->GetSubtype() == CSubSource::eSubtype_metagenomic) {
517  is_metagenomic = true;
518  }
519  }
520  }
521  }
522  if (is_metagenomic && is_env_sample && biosrc->IsSetOrg() && biosrc->GetOrg().IsSetOrgname() && biosrc->GetOrg().GetOrgname().IsSetMod()) {
523  for (const auto& m : biosrc->GetOrg().GetOrgname().GetMod()) {
524  if (m->IsSetSubtype() && m->GetSubtype() == COrgMod::eSubtype_strain) {
525  m_Objs["[n] organism[s] assembled from metagenome [has] strain"].Add(*context.BiosourceObjRef(*biosrc));
526  break;
527  }
528  }
529  }
530  }
531  }
532 }
533 
534 
535 // MAG_MISSING_ISOLATE
536 
537 DISCREPANCY_CASE(MAG_MISSING_ISOLATE, BIOSRC, eDisc | eSmart, "Organism assembled from metagenome reads should have isolate")
538 {
539  const CSeqdesc* src = context.GetBiosource();
540  if (context.HasLineage(src ? &src->GetSource() : nullptr, "Bacteria") || context.HasLineage(src ? &src->GetSource() : nullptr, "Archaea")) {
541  for (const CBioSource* biosrc : context.GetBiosources()) {
542  bool is_metagenomic = false;
543  bool is_env_sample = false;
544  bool has_isolate = false;
545  if (biosrc->IsSetSubtype()) {
546  for (const auto& s : biosrc->GetSubtype()) {
547  if (s->IsSetSubtype()) {
548  if (s->GetSubtype() == CSubSource::eSubtype_environmental_sample) {
549  is_env_sample = true;
550  }
551  if (s->GetSubtype() == CSubSource::eSubtype_metagenomic) {
552  is_metagenomic = true;
553  }
554  }
555  }
556  }
557  if (!is_metagenomic || !is_env_sample) {
558  continue;
559  }
560  if (biosrc->IsSetOrg() && biosrc->GetOrg().IsSetOrgname() && biosrc->GetOrg().GetOrgname().IsSetMod()) {
561  for (const auto& m : biosrc->GetOrg().GetOrgname().GetMod()) {
562  if (m->IsSetSubtype() && m->GetSubtype() == COrgMod::eSubtype_isolate) {
563  has_isolate = true;
564  break;
565  }
566  }
567  }
568  if (!has_isolate) {
569  m_Objs["[n] organism[s] assembled from metagenome [is] missing isolate"].Add(*context.BiosourceObjRef(*biosrc));
570  }
571  }
572  }
573 }
574 
575 
576 // MULTISRC
577 
578 DISCREPANCY_CASE(MULTISRC, BIOSRC, eDisc | eOncaller, "Comma or semicolon appears in strain or isolate")
579 {
580  for (const CBioSource* biosrc : context.GetBiosources()) {
581  if (biosrc->IsSetOrg() && biosrc->GetOrg().IsSetOrgname() && biosrc->GetOrg().GetOrgname().IsSetMod()) {
582  bool report = false;
583  for (const auto& m : biosrc->GetOrg().GetOrgname().GetMod()) {
584  if (m->IsSetSubtype() && (m->GetSubtype() == COrgMod::eSubtype_isolate || m->GetSubtype() == COrgMod::eSubtype_strain) && m->IsSetSubname() && (NStr::Find(m->GetSubname(), ",") != string::npos || NStr::Find(m->GetSubname(), ";") != string::npos)) {
585  report = true;
586  break;
587  }
588  }
589  if (report) {
590  m_Objs["[n] organism[s] [has] comma or semicolon in strain or isolate"].Add(*context.BiosourceObjRef(*biosrc));
591  }
592  }
593  }
594 }
595 
596 
597 // MULTIPLE_CULTURE_COLLECTION
598 
599 DISCREPANCY_CASE(MULTIPLE_CULTURE_COLLECTION, BIOSRC, eOncaller, "Multiple culture-collection quals")
600 {
601  for (const CBioSource* biosrc : context.GetBiosources()) {
602  if (biosrc->IsSetOrg() && biosrc->GetOrg().IsSetOrgname() && biosrc->GetOrg().GetOrgname().IsSetMod()) {
603  bool found = false;
604  for (const auto& m : biosrc->GetOrg().GetOrgname().GetMod()) {
605  if (m->IsSetSubtype() && m->GetSubtype() == COrgMod::eSubtype_culture_collection) {
606  if (found) {
607  m_Objs["[n] organism[s] [has] multiple culture-collection qualifiers"].Add(*context.BiosourceObjRef(*biosrc));
608  break;
609  }
610  found = true;
611  }
612  }
613  }
614  }
615 }
616 
617 
618 // REQUIRED_STRAIN
619 
620 DISCREPANCY_CASE(REQUIRED_STRAIN, BIOSRC, eDisc | eSubmitter | eSmart, "Bacteria should have strain")
621 {
622  const CSeqdesc* src = context.GetBiosource();
623  if (context.HasLineage(src ? &src->GetSource() : nullptr, "Bacteria") || context.HasLineage(src ? &src->GetSource() : nullptr, "Archaea")) {
624  for (const CBioSource* biosrc : context.GetBiosources()) {
625  if (biosrc->IsSetSubtype()) {
626  bool is_metagenomic = false;
627  bool is_env_sample = false;
628  for (const auto& s : biosrc->GetSubtype()) {
629  if (s->IsSetSubtype()) {
630  if (s->GetSubtype() == CSubSource::eSubtype_environmental_sample) {
631  is_env_sample = true;
632  if (is_metagenomic) {
633  break;
634  }
635  }
636  if (s->GetSubtype() == CSubSource::eSubtype_metagenomic) {
637  is_metagenomic = true;
638  if (is_env_sample) {
639  break;
640  }
641  }
642  }
643  }
644  if (is_metagenomic && is_env_sample) {
645  continue;
646  }
647  }
648  if (biosrc->IsSetOrg() && biosrc->GetOrg().IsSetOrgname() && biosrc->GetOrg().GetOrgname().IsSetMod()) {
649  bool skip = false;
650  for (const auto& m : biosrc->GetOrg().GetOrgname().GetMod()) {
651  if (m->IsSetSubtype() && m->GetSubtype() == COrgMod::eSubtype_strain) {
652  skip = true;
653  break;
654  }
655  }
656  if (skip) {
657  continue;
658  }
659  }
660  m_Objs["[n] biosource[s] [is] missing required strain value"].Add(*context.BiosourceObjRef(*biosrc));
661  }
662  }
663 }
664 
665 
666 // STRAIN_CULTURE_COLLECTION_MISMATCH
667 
// Compares two strings while ignoring every ' ' and ':' character in both.
//
// @param a  first string
// @param b  second string
// @return true when the remaining characters of `a` and `b` are identical
//         (same characters, same order, same count); false otherwise.
//
// Both strings must be fully consumed for a match, so a string is never
// considered equal to a longer string that merely starts with it
// ("abc" vs "abcd"), and an empty string only matches a string made up
// entirely of separators.
static bool MatchExceptSpaceColon(const std::string& a, const std::string& b)
{
    const auto is_separator = [](char c) { return c == ':' || c == ' '; };

    size_t i = 0;
    size_t j = 0;
    for (;;) {
        while (i < a.length() && is_separator(a[i])) {
            ++i;
        }
        while (j < b.length() && is_separator(b[j])) {
            ++j;
        }
        if (i == a.length() || j == b.length()) {
            // At least one side is exhausted: equal only if both are.
            return i == a.length() && j == b.length();
        }
        if (a[i] != b[j]) {
            return false;
        }
        ++i;
        ++j;
    }
}
686 
687 
688 DISCREPANCY_CASE(STRAIN_CULTURE_COLLECTION_MISMATCH, BIOSRC, eOncaller | eSmart, "Strain and culture-collection values conflict")
689 {
690  for (const CBioSource* biosrc : context.GetBiosources()) {
691  if (biosrc->IsSetOrg() && biosrc->GetOrg().IsSetOrgname() && biosrc->GetOrg().GetOrgname().IsSetMod()) {
692  vector<const COrgMod*> OrgMods;
693  for (const auto& m : biosrc->GetOrg().GetOrgname().GetMod()) {
694  OrgMods.push_back(&*m);
695  }
696  bool match = false;
697  bool conflict = false;
698  for (size_t i = 0; i < OrgMods.size(); i++) {
699  if (OrgMods[i]->IsSetSubtype() && OrgMods[i]->GetSubtype() == COrgMod::eSubtype_strain) {
700  for (size_t j = i + 1; j < OrgMods.size(); j++) { // not from 0 ?
701  if (OrgMods[j]->IsSetSubtype() && OrgMods[j]->GetSubtype() == COrgMod::eSubtype_culture_collection) {
702  if (MatchExceptSpaceColon(OrgMods[i]->GetSubname(), OrgMods[j]->GetSubname())) {
703  match = true;
704  break;
705  }
706  else {
707  conflict = true;
708  }
709  }
710  }
711  if (match) {
712  break;
713  }
714  }
715  }
716  if (conflict && !match) {
717  m_Objs["[n] organism[s] [has] conflicting strain and culture-collection values"].Add(*context.BiosourceObjRef(*biosrc));
718  }
719  }
720  }
721 }
722 
723 
724 // SP_NOT_UNCULTURED
725 
726 DISCREPANCY_CASE(SP_NOT_UNCULTURED, BIOSRC, eOncaller, "Organism ending in sp. needs tax consult")
727 {
728  for (const CBioSource* biosrc : context.GetBiosources()) {
729  if (biosrc->IsSetOrg() && biosrc->GetOrg().CanGetTaxname()) {
730  const string& s = biosrc->GetOrg().GetTaxname();
731  if (s.length() > 4 && s.substr(s.length() - 4) == " sp." && s.substr(0, 11) != "uncultured ") {
732  m_Objs["[n] biosource[s] [has] taxname[s] that end[S] with \' sp.\' but [does] not start with \'uncultured\'"].Add(*context.BiosourceObjRef(*biosrc));
733  }
734  }
735  }
736 }
737 
738 
739 // FIND_STRAND_TRNAS
740 
// Looks at the tRNA features of the current sequence and, when all of them
// lie on the same strand, reports every one of them under a plus-strand or
// minus-strand label. As soon as both strands are seen the check returns
// without reporting anything.
//
// NOTE(review): listing line 744 — the condition that opens the block
// closed at listing line 761, presumably involving `biosrc` — is not
// visible here; confirm which sequences this check applies to.
741 DISCREPANCY_CASE(FIND_STRAND_TRNAS, SEQUENCE, eDisc, "Find tRNAs on the same strand")
742 {
743  const CSeqdesc* biosrc = context.GetBiosource();
745  bool strand_plus = false;
746  bool strand_minus = false;
// First pass: determine which strands are in use; anything that is not
// explicitly minus counts as plus.
747  for (const auto& feat : context.FeatTRNAs()) {
748  if (feat->GetLocation().GetStrand() == eNa_strand_minus) {
749  strand_minus = true;
750  }
751  else {
752  strand_plus = true;
753  }
754  if (strand_plus && strand_minus) {
755  return;
756  }
757  }
// Second pass: all tRNAs share one strand, so list them under that strand.
758  for (const auto& feat : context.FeatTRNAs()) {
759  m_Objs[strand_plus ? "[n] tRNA[s] on plus strand" : "[n] tRNA[s] on minus strand"].Add(*context.SeqFeatObjRef(*feat));
760  }
761  }
762 }
763 
764 
765 // REQUIRED_CLONE
766 
// Body of a predicate over a source object `src`: returns true when `src`
// carries the kAmplifiedWithSpeciesSpecificPrimers note, either as an
// "other" subsource or as an organism modifier, and false otherwise.
//
// NOTE(review): the signature line (listing line 767) is not visible here,
// so the function's name and the exact type of `src` cannot be confirmed.
768 {
769  if (src.IsSetSubtype()) {
770  for (const auto& s : src.GetSubtype()) {
771  if (s->IsSetSubtype() && s->GetSubtype() == CSubSource::eSubtype_other && s->IsSetName() && NStr::Equal(s->GetName(), kAmplifiedWithSpeciesSpecificPrimers)) {
772  return true;
773  }
774  }
775  }
776  if (src.IsSetOrg() && src.GetOrg().IsSetOrgname() && src.GetOrg().GetOrgname().IsSetMod()) {
777  for (const auto& m : src.GetOrg().GetOrgname().GetMod()) {
// NOTE(review): an organism modifier's subtype is compared with the
// CSubSource enumerator here — confirm it has the same value as the
// corresponding COrgMod enumerator.
778  if (m->IsSetSubtype() && m->GetSubtype() == CSubSource::eSubtype_other && m->IsSetSubname() && NStr::Equal(m->GetSubname(), kAmplifiedWithSpeciesSpecificPrimers)) {
779  return true;
780  }
781  }
782  }
783 
784  return false;
785 }
786 
787 
// Decides whether `biosource` should have a clone qualifier but lacks one.
//
// A clone is needed when the taxname starts with "uncultured"
// (case-insensitive) or when a subsource qualifier tested on listing
// line 799 is present. The need is waived when an isolate modifier contains
// "gel band" (case-insensitive).
//
// @param biosource  source to examine
// @return true when a clone is needed and no clone subsource is present
//
// NOTE(review): listing lines 790 (the condition of the early `return
// false`) and 799 (the subtype test that sets `needs_clone`) are not
// visible here — confirm both against the full file.
788 static bool IsMissingRequiredClone(const CBioSource& biosource)
789 {
791  return false;
792  }
793  bool needs_clone = biosource.IsSetOrg() && biosource.GetOrg().IsSetTaxname() && NStr::StartsWith(biosource.GetOrg().GetTaxname(), "uncultured", NStr::eNocase);
794  bool has_clone = false;
795  if (biosource.IsSetSubtype()) {
796  for (const auto& subtype_it : biosource.GetSubtype()) {
797  if (subtype_it->IsSetSubtype()) {
798  CSubSource::TSubtype subtype = subtype_it->GetSubtype();
800  needs_clone = true;
801  }
802  else if (subtype == CSubSource::eSubtype_clone) {
803  has_clone = true;
804  }
805  }
806  }
807  }
808  if (needs_clone && !has_clone) {
809  // look for gel band isolate
810  bool has_gel_band_isolate = false;
811  if (biosource.IsSetOrg() && biosource.GetOrg().IsSetOrgname() && biosource.GetOrg().GetOrgname().IsSetMod()) {
812  for (const auto& mod_it : biosource.GetOrg().GetOrgname().GetMod()) {
813  if (mod_it->IsSetSubtype() && mod_it->GetSubtype() == COrgMod::eSubtype_isolate) {
814  if (mod_it->IsSetSubname() && NStr::FindNoCase(mod_it->GetSubname(), "gel band") != NPOS) {
815  has_gel_band_isolate = true;
816  break;
817  }
818  }
819  }
820  }
// A gel-band isolate stands in for the clone requirement.
821  if (has_gel_band_isolate) {
822  needs_clone = false;
823  }
824  }
825  return (needs_clone && !has_clone);
826 }
827 
828 
829 DISCREPANCY_CASE(REQUIRED_CLONE, BIOSRC, eOncaller, "Uncultured or environmental sources should have clone")
830 {
831  for (const CBioSource* biosrc : context.GetBiosources()) {
832  if (IsMissingRequiredClone(*biosrc)) {
833  m_Objs["[n] biosource[s] [is] missing required clone value"].Add(*context.BiosourceObjRef(*biosrc));
834  }
835  }
836 }
837 
838 
839 // STRAIN_TAXNAME_MISMATCH
840 
841 DISCREPANCY_CASE(STRAIN_TAXNAME_MISMATCH, BIOSRC, eDisc | eOncaller, "BioSources with the same strain should have the same taxname")
842 {
843  for (const CBioSource* biosrc : context.GetBiosources()) {
844  if (biosrc->IsSetOrg() && biosrc->GetOrg().IsSetOrgname() && biosrc->GetOrg().GetOrgname().IsSetMod()) {
845  for (const auto& om : biosrc->GetOrg().GetOrgname().GetMod()) {
846  if (om->IsSetSubtype() && om->GetSubtype() == COrgMod::eSubtype_strain && om->IsSetSubname()) {
847  const string strain = om->GetSubname();
848  if (!strain.empty()) {
849  m_Objs[strain][biosrc->GetOrg().IsSetTaxname() ? biosrc->GetOrg().GetTaxname() : ""].Add(*context.BiosourceObjRef(*biosrc));
850  }
851  }
852  }
853  }
854  }
855 }
856 
857 
858 DISCREPANCY_SUMMARIZE(STRAIN_TAXNAME_MISMATCH)
859 {
860  CReportNode rep, rep1;
861  static const string root = "[n] biosources have strain/taxname conflicts";
862  for (auto& it: m_Objs.GetMap()) {
863  if (it.second->GetMap().size() > 1) {
864  for (auto& mm: it.second->GetMap()) {
865  for (auto& obj : mm.second->GetObjects()) {
866  string label = "[n] biosources have strain [(]" + it.first + "[)] but do not have the same taxnames";
867  rep["[n] biosources have strain/taxname conflicts"][label].Ext().Add(*obj);
868  rep1[label].Add(*obj);
869  }
870  }
871  }
872  }
873  m_ReportItems = rep1.GetMap().size() > 1 ? rep.Export(*this)->GetSubitems() : rep1.Export(*this)->GetSubitems();
874 }
875 
876 
877 // SPECVOUCHER_TAXNAME_MISMATCH
878 
879 DISCREPANCY_CASE(SPECVOUCHER_TAXNAME_MISMATCH, BIOSRC, eOncaller | eSmart, "BioSources with the same specimen voucher should have the same taxname")
880 {
881  for (const CBioSource* biosrc : context.GetBiosources()) {
882  if (biosrc->IsSetOrg() && biosrc->GetOrg().IsSetOrgname() && biosrc->GetOrg().GetOrgname().IsSetMod()) {
883  for (const auto& om : biosrc->GetOrg().GetOrgname().GetMod()) {
884  if (om->IsSetSubtype() && om->GetSubtype() == COrgMod::eSubtype_specimen_voucher && om->IsSetSubname()) {
885  const string strain = om->GetSubname();
886  if (!strain.empty()) {
887  m_Objs[strain][biosrc->GetOrg().IsSetTaxname() ? biosrc->GetOrg().GetTaxname() : ""].Add(*context.BiosourceObjRef(*biosrc));
888  }
889  }
890  }
891  }
892  }
893 }
894 
895 
896 DISCREPANCY_SUMMARIZE(SPECVOUCHER_TAXNAME_MISMATCH)
897 {
898  CReportNode rep, rep1;
899  for (auto& it: m_Objs.GetMap()) {
900  if (it.second->GetMap().size() > 1) {
901  for (auto& mm: it.second->GetMap()) {
902  for (auto& obj: mm.second->GetObjects()) {
903  string label = "[n] biosources have specimen voucher [(]" + it.first + "[)] but do not have the same taxnames";
904  rep["[n] biosources have specimen voucher/taxname conflicts"][label].Ext().Add(*obj);
905  rep1[label].Add(*obj);
906  }
907  }
908  }
909  }
910  m_ReportItems = rep1.GetMap().size() > 1 ? rep.Export(*this)->GetSubitems() : rep1.Export(*this)->GetSubitems();
911 }
912 
913 
914 // CULTURE_TAXNAME_MISMATCH
915 
916 DISCREPANCY_CASE(CULTURE_TAXNAME_MISMATCH, BIOSRC, eOncaller, "Test BioSources with the same culture collection but different taxname")
917 {
918  for (const CBioSource* biosrc : context.GetBiosources()) {
919  if (biosrc->IsSetOrg() && biosrc->GetOrg().IsSetOrgname() && biosrc->GetOrg().GetOrgname().IsSetMod()) {
920  for (const auto& om : biosrc->GetOrg().GetOrgname().GetMod()) {
921  if (om->IsSetSubtype() && om->GetSubtype() == COrgMod::eSubtype_culture_collection && om->IsSetSubname()) {
922  const string strain = om->GetSubname();
923  if (!strain.empty()) {
924  m_Objs[strain][biosrc->GetOrg().IsSetTaxname() ? biosrc->GetOrg().GetTaxname() : ""].Add(*context.BiosourceObjRef(*biosrc));
925  }
926  }
927  }
928  }
929  }
930 }
931 
932 
933 DISCREPANCY_SUMMARIZE(CULTURE_TAXNAME_MISMATCH)
934 {
935  CReportNode rep, rep1;
936  for (auto& it : m_Objs.GetMap()) {
937  if (it.second->GetMap().size() > 1) {
938  for (auto& mm : it.second->GetMap()) {
939  for (auto& obj : mm.second->GetObjects()) {
940  string label = "[n] biosources have culture collection " + it.first + " but do not have the same taxnames";
941  rep["[n] biosources have culture collection/taxname conflicts"][label].Ext().Add(*obj);
942  rep1[label].Add(*obj);
943  }
944  }
945  }
946  }
947  m_ReportItems = rep1.GetMap().size() > 1 ? rep.Export(*this)->GetSubitems() : rep1.Export(*this)->GetSubitems();
948 }
949 
950 
951 // BIOMATERIAL_TAXNAME_MISMATCH
952 
953 DISCREPANCY_CASE(BIOMATERIAL_TAXNAME_MISMATCH, BIOSRC, eOncaller | eSmart, "Test BioSources with the same biomaterial but different taxname")
954 {
955  for (const CBioSource* biosrc : context.GetBiosources()) {
956  if (biosrc->IsSetOrg() && biosrc->GetOrg().IsSetOrgname() && biosrc->GetOrg().GetOrgname().IsSetMod()) {
957  for (const auto& om : biosrc->GetOrg().GetOrgname().GetMod()) {
958  if (om->IsSetSubtype() && om->GetSubtype() == COrgMod::eSubtype_bio_material && om->IsSetSubname()) {
959  const string strain = om->GetSubname();
960  if (!strain.empty()) {
961  m_Objs[strain][biosrc->GetOrg().IsSetTaxname() ? biosrc->GetOrg().GetTaxname() : ""].Add(*context.BiosourceObjRef(*biosrc));
962  }
963  }
964  }
965  }
966  }
967 }
968 
969 
970 DISCREPANCY_SUMMARIZE(BIOMATERIAL_TAXNAME_MISMATCH)
971 {
972  {
973  CReportNode rep, rep1;
974  for (auto& it : m_Objs.GetMap()) {
975  if (it.second->GetMap().size() > 1) {
976  for (auto& mm : it.second->GetMap()) {
977  for (auto& obj : mm.second->GetObjects()) {
978  string label = "[n] biosources have biomaterial " + it.first + " but do not have the same taxnames";
979  rep["[n] biosources have biomaterial/taxname conflicts"][label].Ext().Add(*obj);
980  rep1[label].Add(*obj);
981  }
982  }
983  }
984  }
985  m_ReportItems = rep1.GetMap().size() > 1 ? rep.Export(*this)->GetSubitems() : rep1.Export(*this)->GetSubitems();
986  }
987 }
988 
989 
990 // ORGANELLE_ITS
991 
// Flags the current Bioseq when its BioSource genome matches the organelle
// condition below and it carries an rRNA or misc_RNA feature whose product
// name or comment contains (case-insensitively) one of the names in
// suspectable_products. The Bioseq is added at most once: the function
// returns on the first hit.
// NOTE(review): listing lines 1006-1008 (the opening of the genome condition)
// are missing from this extract, so only the plastid/proplastid part of the
// organelle test is visible here.
992 DISCREPANCY_CASE(ORGANELLE_ITS, SEQUENCE, eOncaller, "Test Bioseqs for suspect rRNA / ITS on organelle")
993 {
// Substrings searched for in product names and feature comments.
994  static const vector<string> suspectable_products = {
995  "18S ribosomal RNA",
996  "5.8S ribosomal RNA",
997  "25S ribosomal RNA",
998  "28S ribosomal RNA",
999  "internal transcribed spacer 1",
1000  "internal transcribed spacer 2"
1001  };
1002  static const string msg = "[n] Bioseq[s] [has] suspect rRNA / ITS on organelle";
1003  const CSeqdesc* src = context.GetBiosource();
1004  if (src && src->GetSource().IsSetGenome()) {
1005  int genome = src->GetSource().GetGenome();
1009  || genome == CBioSource::eGenome_plastid || genome == CBioSource::eGenome_proplastid) {
1010  for (const CSeq_feat& feat : context.GetFeat()) {
1011  if (feat.IsSetData() && feat.GetData().IsRna()) {
1012  const CRNA_ref& rna = feat.GetData().GetRna();
1013  if (rna.IsSetType() && (rna.GetType() == CRNA_ref::eType_rRNA || rna.GetType() == CRNA_ref::eType_miscRNA)) {
// First check: the RNA product name.
1014  const string& product = rna.GetRnaProductName();
1015  // The Owls Are Not What They Seem!
1016  // if (NStr::FindNoCase(suspectable_products, product) != nullptr) {
1017  if (!product.empty()) {
1018  for (const string& pattern : suspectable_products) {
1019  if (NStr::FindNoCase(product, pattern) != NPOS) {
1020  m_Objs[msg].Add(*context.BioseqObjRef());
1021  return;
1022  }
1023  }
1024  }
// Second check: the feature comment, searched the same way.
1025  if (feat.IsSetComment()) {
1026  const string& comment = feat.GetComment();
1027  // The Owls Are Not What They Seem!
1028  // if (!comment.empty() && NStr::FindNoCase(suspectable_products, comment) != nullptr) {
1029  if (!comment.empty()) {
1030  for (const string& pattern : suspectable_products) {
1031  if (NStr::FindNoCase(comment, pattern) != NPOS) {
1032  m_Objs[msg].Add(*context.BioseqObjRef());
1033  return;
1034  }
1035  }
1036  }
1037  }
1038  }
1039  }
1040  }
1041  }
1042  }
1043 }
1044 
1045 
1046 // INCONSISTENT_BIOSOURCE
1047 
// Ordered list of human-readable difference descriptions (e.g. "taxnames differ").
using TInconsistecyDescriptionList = list<string>;
1049 
// Helpers for comparing one optional field of two objects of type T.
// An "is set" member function says whether the getter may be called; when the
// field is not set, a caller-supplied placeholder takes its place in the
// comparison.
//   T - object type owning the field
//   R - type returned (by const reference) by the value getter
template<class T, typename R> class CCompareValues
{
    using TIsSetFn = bool (T::*)() const;
    using TGetIntFn = int (T::*)() const;
    using TGetRFn = const R& (T::*)() const;

public:
    // Compares an int-valued field; `not_set` stands in for an unset field.
    static bool IsEqualInt(const T& first, const T& second, TIsSetFn is_set_fn, TGetIntFn get_fn, int not_set)
    {
        const int lhs = (first.*is_set_fn)() ? (first.*get_fn)() : not_set;
        const int rhs = (second.*is_set_fn)() ? (second.*get_fn)() : not_set;
        return lhs == rhs;
    }

    // Compares an R-valued field with operator==; `empty_val` stands in for an
    // unset field. No copy of R is made.
    static bool IsEqualVal(const T& first, const T& second, TIsSetFn is_set_fn, TGetRFn get_fn, const R& empty_val)
    {
        const R* lhs = (first.*is_set_fn)() ? &(first.*get_fn)() : &empty_val;
        const R* rhs = (second.*is_set_fn)() ? &(second.*get_fn)() : &empty_val;
        return *lhs == *rhs;
    }
};
1073 
// Body of the subsource-list comparison that GetBiosourceDifferences calls as
// IsSameSubtype(first_subtype, second_subtype). Returns false when the two
// lists differ in length or when any of three per-element checks fails while
// walking both lists in lockstep; returns true otherwise.
// NOTE(review): the signature (listing line 1074) and the three per-element
// conditions (listing lines 1084, 1088, 1092) are missing from this extract,
// so what exactly is compared for each element cannot be stated from here.
1075 {
1076  if (first.size() != second.size()) {
1077  return false;
1078  }
1079 
// Equal sizes were established above, so bounding the loop by `first` alone
// keeps it_second in range as well.
1080  for (CBioSource::TSubtype::const_iterator it_first = first.cbegin(), it_second = second.cbegin();
1081  it_first != first.cend();
1082  ++it_first, ++it_second) {
1083 
1085  return false;
1086  }
1087 
1089  return false;
1090  }
1091 
1093  return false;
1094  }
1095  }
1096 
1097  return true;
1098 }
1099 
1100 static bool IsSameDb(const COrg_ref::TDb& first, const COrg_ref::TDb& second)
1101 {
1102  if (first.size() != second.size()) {
1103  return false;
1104  }
1105 
1106  for (COrg_ref::TDb::const_iterator it_first = first.cbegin(), it_second = second.cbegin();
1107  it_first != first.cend();
1108  ++it_first, ++it_second) {
1109 
1110  if (!(*it_first)->Equals(**it_second)) {
1111  return false;
1112  }
1113  }
1114 
1115  return true;
1116 }
1117 
// Body of the OrgName comparison that GetOrgrefDifferences calls as
// GetOrgnameDifferences(first, second, diffs). Appends one description to
// `diffs` for every aspect in which the two OrgNames differ: name choice,
// genetic code, mitochondrial genetic code, attributes, lineage, division,
// and then the modifier lists compared position by position.
// NOTE(review): the signature (listing line 1118) and most of the comparison
// conditions (listing lines 1127, 1131, 1137, 1141, 1145, 1167, 1171, 1175)
// are missing from this extract; only the messages they guard are visible.
1119 {
1120  bool first_name_set = first.IsSetName(),
1121  second_name_set = second.IsSetName();
1122 
1123  if (first_name_set != second_name_set || (first_name_set && first.GetName().Which() != second.GetName().Which())) {
1124  diffs.push_back("orgname choices differ");
1125  }
1126 
1128  diffs.push_back("genetic codes differ");
1129  }
1130 
// Within the (not visible) outer condition, a mitochondrial code difference is
// reported only if at least one side has a non-zero mgcode.
1132  if ((first.IsSetMgcode() && first.GetMgcode()) || (second.IsSetMgcode() && second.GetMgcode())) {
1133  diffs.push_back("mitochondrial genetic codes differ");
1134  }
1135  }
1136 
1138  diffs.push_back("attributes differ");
1139  }
1140 
1142  diffs.push_back("lineages differ");
1143  }
1144 
1146  diffs.push_back("divisions differ");
1147  }
1148 
1149  bool first_mod_set = first.IsSetMod(),
1150  second_mod_set = second.IsSetMod();
1151 
1152  COrgName::TMod::const_iterator it_first, it_second;
1153  if (first_mod_set) {
1154  it_first = first.GetMod().cbegin();
1155  }
1156  if (second_mod_set) {
1157  it_second = second.GetMod().cbegin();
1158  }
1159  if (first_mod_set && second_mod_set) {
1160  COrgName::TMod::const_iterator end_first = first.GetMod().cend(),
1161  end_second = second.GetMod().cend();
1162 
// Walk both modifier lists in lockstep until the shorter one is exhausted.
1163  for (; it_first != end_first && it_second != end_second; ++it_first, ++it_second) {
1164 
1165  const string& qual = (*it_first)->IsSetSubtype() ? COrgMod::ENUM_METHOD_NAME(ESubtype)()->FindName((*it_first)->GetSubtype(), true) : "Unknown source qualifier";
1166 
1168  diffs.push_back("missing " + qual + " modifier");
1169  }
1170 
1172  diffs.push_back(qual + " modifier attrib values differ");
1173  }
1174 
1176  diffs.push_back("different " + qual + " values");
1177  }
1178  }
1179 
// A list that ran out is treated as "not set" from here on, so the code below
// reports the first surplus modifier of the longer list.
1180  if (it_first == end_first) {
1181  first_mod_set = false;
1182  }
1183  if (it_second == end_second) {
1184  second_mod_set = false;
1185  }
1186  }
1187 
// NOTE(review): these two branches look the subtype up through
// ENUM_METHOD_NAME(ESource_qual) while the loop above uses
// COrgMod::ENUM_METHOD_NAME(ESubtype) for the same GetSubtype() value -
// confirm the two enums are meant to be interchangeable here.
// NOTE(review): when a modifier list is set but empty and the other side has
// no modifiers, the iterator dereferenced below equals end() - confirm that
// case cannot occur.
1188  if (first_mod_set && !second_mod_set) {
1189  const string& qual = (*it_first)->IsSetSubtype() ? ENUM_METHOD_NAME(ESource_qual)()->FindName((*it_first)->GetSubtype(), true) : "Unknown source qualifier";
1190  diffs.push_back("missing " + qual + " modifier");
1191  }
1192  else if (!first_mod_set && second_mod_set) {
1193  const string& qual = (*it_second)->IsSetSubtype() ? ENUM_METHOD_NAME(ESource_qual)()->FindName((*it_second)->GetSubtype(), true) : "Unknown source qualifier";
1194  diffs.push_back("missing " + qual + " modifier");
1195  }
1196 }
1197 
// Appends to `diffs` one description per aspect in which two Org-refs differ:
// taxname, common name, synonyms, dbxrefs, and - when both have an OrgName -
// everything reported by GetOrgnameDifferences. If only one side has an
// OrgName, a single "one Orgname is missing" entry is added instead.
// NOTE(review): the conditions guarding the first three messages (listing
// lines 1200, 1204, 1208) are missing from this extract.
1198 static void GetOrgrefDifferences(const COrg_ref& first_org, const COrg_ref& second_org, TInconsistecyDescriptionList& diffs)
1199 {
1201  diffs.push_back("taxnames differ");
1202  }
1203 
1205  diffs.push_back("common names differ");
1206  }
1207 
1209  diffs.push_back("synonyms differ");
1210  }
1211 
1212  bool first_db_set = first_org.IsSetDb(),
1213  second_db_set = second_org.IsSetDb();
1214 
// Dbxrefs differ when only one side has them, or when both have them and the
// lists are not position-wise equal (see IsSameDb).
1215  if (first_db_set != second_db_set || (first_db_set && !IsSameDb(first_org.GetDb(), second_org.GetDb()))) {
1216  diffs.push_back("dbxrefs differ");
1217  }
1218 
1219  bool first_orgname_set = first_org.IsSetOrgname(),
1220  second_orgname_set = second_org.IsSetOrgname();
1221 
1222  if (first_orgname_set != second_orgname_set) {
1223  diffs.push_back("one Orgname is missing");
1224  }
1225  else if (first_orgname_set && second_orgname_set) {
1226  GetOrgnameDifferences(first_org.GetOrgname(), second_org.GetOrgname(), diffs);
1227  }
1228 }
1229 
1230 
// Appends to `diffs` one description per aspect in which two BioSources
// differ: origin, focus flag, location, subsource qualifiers, and - when both
// have an Org-ref - everything reported by GetOrgrefDifferences. If only one
// side has an Org-ref, a single "one OrgRef is missing" entry is added.
// `diffs` is left untouched when no difference is found.
// NOTE(review): the conditions guarding "origins differ" and "locations
// differ" (listing lines 1233, 1241) are missing from this extract.
1231 static void GetBiosourceDifferences(const CBioSource& first_biosrc, const CBioSource& second_biosrc, TInconsistecyDescriptionList& diffs)
1232 {
1234  diffs.push_back("origins differ");
1235  }
1236 
// Focus is compared only by presence of the field, not by its value.
1237  if (first_biosrc.IsSetIs_focus() != second_biosrc.IsSetIs_focus()) {
1238  diffs.push_back("focus differs");
1239  }
1240 
1242  diffs.push_back("locations differ");
1243  }
1244 
// An unset subsource list is compared as if it were an empty list.
1245  static const CBioSource::TSubtype empty_subtype;
1246 
1247  const CBioSource::TSubtype& first_subtype = first_biosrc.IsSetSubtype() ? first_biosrc.GetSubtype() : empty_subtype,
1248  & second_subtype = second_biosrc.IsSetSubtype() ? second_biosrc.GetSubtype() : empty_subtype;
1249  if (!IsSameSubtype(first_subtype, second_subtype)) {
1250  diffs.push_back("subsource qualifiers differ");
1251  }
1252 
1253  bool first_org_set = first_biosrc.IsSetOrg(),
1254  second_org_set = second_biosrc.IsSetOrg();
1255 
1256  if (first_org_set != second_org_set) {
1257  diffs.push_back("one OrgRef is missing");
1258  }
1259  else if (first_org_set && second_org_set) {
1260  GetOrgrefDifferences(first_biosrc.GetOrg(), second_biosrc.GetOrg(), diffs);
1261  }
1262 }
1263 
1264 
1265 DISCREPANCY_CASE(INCONSISTENT_BIOSOURCE, SEQUENCE, eDisc | eSubmitter | eSmart, "Inconsistent BioSource")
1266 {
1267  const CBioseq& bioseq = context.CurrentBioseq();
1268  if (bioseq.IsNa()) {
1269  const CSeqdesc* biosrc = context.GetBiosource();
1270  if (biosrc) {
1271  stringstream ss;
1272  ss << MSerial_AsnBinary << biosrc->GetSource();
1273  auto& node = m_Objs[ss.str()];
1274  node.Add(*context.SeqdescObjRef(*biosrc));
1275  node.Add(*context.BioseqObjRef());
1276  }
1277  }
1278 }
1279 
1280 
// Looks for the first pair of distinct serialized BioSources (the map keys)
// for which GetBiosourceDifferences reports at least one difference. If such
// a pair exists, every key's objects are exported as one numbered
// sub-category under a header that lists those differences; otherwise nothing
// is reported.
// NOTE(review): listing line 1294 (presumably the declaration of `diffs`) is
// missing from this extract.
1281 DISCREPANCY_SUMMARIZE(INCONSISTENT_BIOSOURCE)
1282 {
1283  auto& M = m_Objs.GetMap();
1284  string subtype;
// Pairwise scan over the keys; each key is deserialized back to a CBioSource.
1285  for (auto a = M.cbegin(); a != M.cend(); ++a) {
1286  stringstream ss_a(a->first);
1287  CBioSource bs_a;
1288  ss_a >> MSerial_AsnBinary >> bs_a;
1289  auto b = a;
1290  for (++b; b != M.cend(); ++b) {
1291  stringstream ss_b(b->first);
1292  CBioSource bs_b;
1293  ss_b >> MSerial_AsnBinary >> bs_b;
1295  GetBiosourceDifferences(bs_a, bs_b, diffs);
1296  if (!diffs.empty()) {
// Only the first differing pair contributes to the header text.
1297  subtype = "[n/2] inconsistent contig source[s][(] (" + NStr::Join(diffs, ", ") + ")";
1298  break;
1299  }
1300  }
1301  if (!subtype.empty()) {
1302  break;
1303  }
1304  }
1305  if (!subtype.empty()) {
1306  CReportNode rep;
1307  size_t subcat_index = 0;
1308  static size_t MAX_NUM_LEN = 10;
1309  for (auto& it : M) {
// Zero-pad the running index to MAX_NUM_LEN digits inside the "[*...*]" prefix.
1310  string subcat_num = NStr::SizetToString(subcat_index);
1311  subcat_num = string(MAX_NUM_LEN - subcat_num.size(), '0') + subcat_num;
1312  string subcat = "[*" + subcat_num + "*][n/2] contig[s] [has] identical sources that do not match another contig source";
1313  ++subcat_index;
1314  rep[subtype][subcat].Ext().Add(it.second->GetObjects());
1315  }
1316  m_ReportItems = rep.Export(*this)->GetSubitems();
1317  }
1318 }
1319 
1320 
1321 // TAX_LOOKUP_MISMATCH
1322 
1323 DISCREPANCY_CASE(TAX_LOOKUP_MISMATCH, BIOSRC, eDisc, "Find Tax Lookup Mismatches")
1324 {
1325  for (const CBioSource* biosrc : context.GetBiosources()) {
1326  if (biosrc->IsSetOrg()) {
1327  stringstream ss;
1328  ss << MSerial_AsnBinary << biosrc->GetOrg();
1329  m_Objs[ss.str()].Add(*context.BiosourceObjRef(*biosrc));
1330  }
1331  }
1332 }
1333 
1334 
1335 static const CDbtag* GetTaxonTag(const COrg_ref& org)
1336 {
1337  if (org.IsSetDb()) {
1338  for (const auto& db : org.GetDb()) {
1339  if (db->IsSetDb() && NStr::EqualNocase(db->GetDb(), "taxon")) {
1340  return db;
1341  }
1342  }
1343  }
1344  return nullptr;
1345 }
1346 
1347 
1348 static bool OrgDiffers(const COrg_ref& first, const COrg_ref& second)
1349 {
1350  bool first_set = first.IsSetTaxname(), second_set = second.IsSetTaxname();
1351  if (first_set != second_set || (first_set && first.GetTaxname() != second.GetTaxname())) {
1352  return true;
1353  }
1354  const CDbtag* first_db_tag = GetTaxonTag(first);
1355  const CDbtag* second_db_tag = GetTaxonTag(second);
1356  if (first_db_tag == nullptr || second_db_tag == nullptr) {
1357  return true;
1358  }
1359  return !first_db_tag->Equals(*second_db_tag);
1360 }
1361 
1362 
// Body of the helper that the TAX_LOOKUP_* summarize steps call as
// GetOrgRefs(org_refs): sends the Org-ref list through `taxon3` and returns
// the reply object.
// NOTE(review): the signature (listing line 1363) and the declaration of
// `taxon3` (listing line 1365) are missing from this extract.
1364 {
1366  CRef<CTaxon3_reply> reply = taxon3.SendOrgRefList(orgs);
1367  return reply;
1368 }
1369 
1370 
1371 DISCREPANCY_SUMMARIZE(TAX_LOOKUP_MISMATCH)
1372 {
1373  if (!m_Objs.empty()) {
1374  CReportNode rep;
1375  vector<CRef<COrg_ref>> org_refs;
1376  for (auto& it : m_Objs.GetMap()) {
1377  CRef<COrg_ref> oref(new COrg_ref());
1378  stringstream ss(it.first);
1379  ss >> MSerial_AsnBinary >> *oref;
1380  org_refs.push_back(oref);
1381  }
1382  CRef<CTaxon3_reply> reply = GetOrgRefs(org_refs);
1383  if (reply) {
1384  const auto& replies = reply->GetReply();
1385  auto rit = replies.cbegin();
1386  for (auto& it : m_Objs.GetMap()) {
1387  CRef<COrg_ref> oref(new COrg_ref());
1388  stringstream ss(it.first);
1389  ss >> MSerial_AsnBinary >> *oref;
1390  if ((*rit)->IsData() && OrgDiffers(*oref, (*rit)->GetData().GetOrg())) {
1391  rep["[n] tax name[s] [does] not match taxonomy lookup"].Add(it.second->GetObjects());
1392  }
1393  ++rit;
1394  }
1395  }
1396  m_ReportItems = rep.Export(*this)->GetSubitems();
1397  }
1398 }
1399 
1400 
1401 // TAX_LOOKUP_MISSING
1402 
1403 DISCREPANCY_CASE(TAX_LOOKUP_MISSING, BIOSRC, eDisc, "Find Missing Tax Lookup")
1404 {
1405  for (const CBioSource* biosrc : context.GetBiosources()) {
1406  if (biosrc->IsSetOrg()) {
1407  stringstream ss;
1408  ss << MSerial_AsnBinary << biosrc->GetOrg();
1409  m_Objs[ss.str()].Add(*context.BiosourceObjRef(*biosrc));
1410  }
1411  }
1412 }
1413 
1414 
1415 DISCREPANCY_SUMMARIZE(TAX_LOOKUP_MISSING)
1416 {
1417  if (!m_Objs.empty()) {
1418  CReportNode rep;
1419  vector<CRef<COrg_ref>> org_refs;
1420  for (auto& it : m_Objs.GetMap()) {
1421  CRef<COrg_ref> oref(new COrg_ref());
1422  stringstream ss(it.first);
1423  ss >> MSerial_AsnBinary >> *oref;
1424  org_refs.push_back(oref);
1425  }
1426  CRef<CTaxon3_reply> reply = GetOrgRefs(org_refs);
1427  if (reply) {
1428  const auto& replies = reply->GetReply();
1429  auto rit = replies.cbegin();
1430  for (auto& it : m_Objs.GetMap()) {
1431  if (!(*rit)->IsData() || (*rit)->IsError()) {
1432  rep["[n] tax name[s] [is] missing in taxonomy lookup"].Add(it.second->GetObjects());
1433  }
1434  ++rit;
1435  }
1436  }
1437  m_ReportItems = rep.Export(*this)->GetSubitems();
1438  }
1439 }
1440 
1441 
1442 // UNNECESSARY_ENVIRONMENTAL
1443 
// Reports BioSources for which the subsource scan sets `found` but none of the
// exemptions applies. Exemptions visible here: an "other" subsource or an
// "other" OrgMod containing "amplified with species-specific primers", and a
// taxname containing "uncultured", "enrichment culture", "metagenome",
// "environmental" or "unidentified" (all case-insensitive).
// NOTE(review): listing lines 1453 and 1461 (the first skip condition and the
// condition that sets `found`) are missing from this extract.
1444 DISCREPANCY_CASE(UNNECESSARY_ENVIRONMENTAL, BIOSRC, eOncaller, "Unnecessary environmental qualifier present")
1445 {
1446  for (const CBioSource* biosrc : context.GetBiosources()) {
1447  if (biosrc->IsSetSubtype()) {
1448  bool skip = false;
1449  bool found = false;
1450  for (const auto& subtype : biosrc->GetSubtype()) {
1451  if (subtype->IsSetSubtype()) {
1452  CSubSource::TSubtype st = subtype->GetSubtype();
1454  skip = true;
1455  break;
1456  }
1457  else if (st == CSubSource::eSubtype_other && NStr::FindNoCase(subtype->GetName(), "amplified with species-specific primers") != NPOS) {
1458  skip = true;
1459  break;
1460  }
1462  found = true;
1463  }
1464  }
1465  }
1466  if (found && !skip) {
1467  if (biosrc->IsSetOrg()) {
1468  if (biosrc->GetOrg().IsSetTaxname()) {
1469  const string& s = biosrc->GetOrg().GetTaxname();
1470  if (NStr::FindNoCase(s, "uncultured") != NPOS || NStr::FindNoCase(s, "enrichment culture") != NPOS || NStr::FindNoCase(s, "metagenome") != NPOS || NStr::FindNoCase(s, "environmental") != NPOS || NStr::FindNoCase(s, "unidentified") != NPOS) {
// `continue` moves straight to the next BioSource, so the assignment to
// `skip` just above has no further effect.
1471  skip = true;
1472  continue;
1473  }
1474  }
1475  if (biosrc->GetOrg().IsSetOrgname() && biosrc->GetOrg().GetOrgname().IsSetMod()) {
1476  for (const auto& it : biosrc->GetOrg().GetOrgname().GetMod()) {
1477  if (it->IsSetSubtype() && it->GetSubtype() == COrgMod::eSubtype_other && it->IsSetSubname() && NStr::FindNoCase(it->GetSubname(), "amplified with species-specific primers") != NPOS) {
1478  skip = true;
1479  break;
1480  }
1481  }
1482  }
1483  }
1484  if (!skip) {
1485  m_Objs["[n] biosource[s] [has] unnecessary environmental qualifier"].Add(*context.BiosourceObjRef(*biosrc));
1486  }
1487  }
1488  }
1489  }
1490 }
1491 
1492 
1493 // END_COLON_IN_COUNTRY
1494 
1495 DISCREPANCY_CASE(END_COLON_IN_COUNTRY, BIOSRC, eOncaller, "Country name end with colon")
1496 {
1497  for (const CBioSource* biosrc : context.GetBiosources()) {
1498  if (biosrc->IsSetSubtype()) {
1499  for (const auto& subtype : biosrc->GetSubtype()) {
1500  if (subtype->IsSetSubtype() && subtype->GetSubtype() == CSubSource::eSubtype_country) {
1501  const string& s = subtype->GetName();
1502  if (s.length() && s[s.length() - 1] == ':') {
1503  m_Objs["[n] country source[s] end[S] with a colon."].Add(*context.BiosourceObjRef(*biosrc, true));
1504  }
1505  }
1506  }
1507  }
1508  }
1509 }
1510 
1511 
// Body of the helper that the END_COLON_IN_COUNTRY autofix calls as
// RemoveCountryColon(<BioSource>): strips every trailing ':' from each country
// subsource of `src`. Returns true if at least one character was removed,
// false when there are no subsources or nothing needed changing.
// NOTE(review): the signature (listing line 1512) is missing from this extract.
1513 {
1514  if (!src.IsSetSubtype()) {
1515  return false;
1516  }
1517  bool fixed = false;
1518  for (const auto& subtype : src.GetSubtype()) {
1519  if (subtype->IsSetSubtype() && subtype->GetSubtype() == CSubSource::eSubtype_country) {
// The list is walked through the const accessor, so constness is cast away
// to edit the name in place.
1520  CSubSource& ss = const_cast<CSubSource&>(*subtype);
1521  string& s = ss.SetName();
1522  while (s.length() && s[s.length()-1] == ':') {
1523  s.resize(s.length()-1);
1524  fixed = true;
1525  }
1526  }
1527  }
1528  return fixed;
1529 }
1530 
1531 
1532 DISCREPANCY_AUTOFIX(END_COLON_IN_COUNTRY)
1533 {
1534  const CSeq_feat* feat = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
1535  const CSeqdesc* desc = dynamic_cast<const CSeqdesc*>(context.FindObject(*obj));
1536  if (feat) {
1537  if (RemoveCountryColon(const_cast<CSeq_feat*>(feat)->SetData().SetBiosrc())) {
1538  obj->SetFixed();
1539  return CRef<CAutofixReport>(new CAutofixReport("END_COLON_IN_COUNTRY: [n] country name[s] fixed", 1));
1540  }
1541  }
1542  if (desc) {
1543  if (RemoveCountryColon(const_cast<CSeqdesc*>(desc)->SetSource())) {
1544  obj->SetFixed();
1545  return CRef<CAutofixReport>(new CAutofixReport("END_COLON_IN_COUNTRY: [n] country name[s] fixed", 1));
1546  }
1547  }
1548  return CRef<CAutofixReport>();
1549 }
1550 
1551 
1552 // COUNTRY_COLON
1553 
1554 DISCREPANCY_CASE(COUNTRY_COLON, BIOSRC, eOncaller, "Country description should only have 1 colon")
1555 {
1556  for (const CBioSource* biosrc : context.GetBiosources()) {
1557  if (biosrc->IsSetSubtype()) {
1558  for (const auto& subtype : biosrc->GetSubtype()) {
1559  if (subtype->IsSetSubtype() && subtype->GetSubtype() == CSubSource::eSubtype_country) {
1560  const string& s = subtype->GetName();
1561  int count = 0;
1562  for (size_t i = 0; i < s.length(); i++) {
1563  if (s[i] == ':') {
1564  count++;
1565  if (count > 1) {
1566  m_Objs["[n] country source[s] [has] more than 1 colon."].Add(*context.BiosourceObjRef(*biosrc, true));
1567  break;
1568  }
1569  }
1570  }
1571  }
1572  }
1573  }
1574  }
1575 }
1576 
1577 
// Body of the helper that the COUNTRY_COLON autofix calls as
// ChangeCountryColonToComma(<BioSource>): in every country subsource of
// `src`, keeps the first ':' and replaces each later ':' with ','. Returns
// true if at least one character was replaced.
// NOTE(review): the signature (listing line 1578) is missing from this extract.
1579 {
1580  if (!src.IsSetSubtype()) {
1581  return false;
1582  }
1583  bool fixed = false;
1584  for (const auto& subtype : src.GetSubtype()) {
1585  if (subtype->IsSetSubtype() && subtype->GetSubtype() == CSubSource::eSubtype_country) {
// The list is walked through the const accessor, so constness is cast away
// to edit the name in place.
1586  CSubSource& ss = const_cast<CSubSource&>(*subtype);
1587  string& s = ss.SetName();
1588  int count = 0;
1589  for (size_t i = 0; i < s.length(); i++) {
1590  if (s[i] == ':') {
1591  count++;
// Only the second and later colons are rewritten.
1592  if (count > 1) {
1593  s[i] = ',';
1594  fixed = true;
1595  }
1596  }
1597  }
1598  }
1599  }
1600  return fixed;
1601 }
1602 
1603 
1604 DISCREPANCY_AUTOFIX(COUNTRY_COLON)
1605 {
1606  const CSeq_feat* feat = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
1607  const CSeqdesc* desc = dynamic_cast<const CSeqdesc*>(context.FindObject(*obj));
1608  if (feat) {
1609  if (ChangeCountryColonToComma(const_cast<CSeq_feat*>(feat)->SetData().SetBiosrc())) {
1610  obj->SetFixed();
1611  return CRef<CAutofixReport>(new CAutofixReport("COUNTRY_COLON: [n] country name[s] fixed", 1));
1612  }
1613  }
1614  if (desc) {
1615  if (ChangeCountryColonToComma(const_cast<CSeqdesc*>(desc)->SetSource())) {
1616  obj->SetFixed();
1617  return CRef<CAutofixReport>(new CAutofixReport("COUNTRY_COLON: [n] country name[s] fixed", 1));
1618  }
1619  }
1620  return CRef<CAutofixReport>();
1621 }
1622 
1623 
1624 // HUMAN_HOST
1625 
1626 DISCREPANCY_CASE(HUMAN_HOST, BIOSRC, eDisc | eOncaller, "\'Human\' in host should be \'Homo sapiens\'")
1627 {
1628  for (const CBioSource* biosrc : context.GetBiosources()) {
1629  if (biosrc->IsSetOrg() && biosrc->GetOrg().CanGetOrgname() && biosrc->GetOrg().GetOrgname().CanGetMod()) {
1630  for (const auto& it : biosrc->GetOrg().GetOrgname().GetMod()) {
1631  if (it->CanGetSubtype() && it->GetSubtype() == COrgMod::eSubtype_nat_host && NStr::FindNoCase(it->GetSubname(), "human") != NPOS) {
1632  m_Objs["[n] organism[s] [has] \'human\' host qualifiers"].Add(*context.BiosourceObjRef(*biosrc, true));
1633  }
1634  }
1635  }
1636  }
1637 }
1638 
1639 
1640 static bool FixHumanHost(CBioSource& src)
1641 {
1642  if (!src.IsSetOrg()) {
1643  return false;
1644  }
1645  bool fixed = false;
1646  for (const auto& it : src.GetOrg().GetOrgname().GetMod()) {
1647  if (it->CanGetSubtype() && it->GetSubtype() == COrgMod::eSubtype_nat_host && NStr::FindNoCase(it->GetSubname(), "human") != NPOS) {
1648  COrgMod& om = const_cast<COrgMod&>(*it);
1649  NStr::ReplaceInPlace(om.SetSubname(), "human", "Homo sapiens");
1650  fixed = true;
1651  }
1652  }
1653  return fixed;
1654 }
1655 
1656 
// Autofix body for HUMAN_HOST: applies FixHumanHost to the BioSource behind
// the reported object, which may be either a source feature or a source
// descriptor. Returns a one-item autofix report when something changed, an
// empty CRef otherwise.
// NOTE(review): the macro header line (listing line 1657) is missing from
// this extract.
1658 {
1659  const CSeq_feat* feat = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
1660  const CSeqdesc* desc = dynamic_cast<const CSeqdesc*>(context.FindObject(*obj));
1661  if (feat) {
1662  if (FixHumanHost(const_cast<CSeq_feat*>(feat)->SetData().SetBiosrc())) {
1663  obj->SetFixed();
1664  return CRef<CAutofixReport>(new CAutofixReport("HUMAN_HOST: [n] host qualifier[s] fixed", 1));
1665  }
1666  }
1667  if (desc) {
1668  if (FixHumanHost(const_cast<CSeqdesc*>(desc)->SetSource())) {
1669  obj->SetFixed();
1670  return CRef<CAutofixReport>(new CAutofixReport("HUMAN_HOST: [n] host qualifier[s] fixed", 1));
1671  }
1672  }
1673  return CRef<CAutofixReport>();
1674 }
1675 
1676 
1677 // CHECK_AUTHORITY
1678 
1679 DISCREPANCY_CASE(CHECK_AUTHORITY, BIOSRC, eDisc | eOncaller, "Authority and Taxname should match first two words")
1680 {
1681  for (const CBioSource* biosrc : context.GetBiosources()) {
1682  if (biosrc->IsSetOrg() && biosrc->GetOrg().CanGetOrgname() && biosrc->GetOrg().GetOrgname().CanGetMod() && biosrc->GetOrg().CanGetTaxname() && biosrc->GetOrg().GetTaxname().length()) {
1683  string tax1, tax2;
1684  for (const auto& it : biosrc->GetOrg().GetOrgname().GetMod()) {
1685  if (it->CanGetSubtype() && it->GetSubtype() == COrgMod::eSubtype_authority) {
1686  if (tax1.empty()) {
1687  list<CTempString> tmp;
1688  NStr::Split(biosrc->GetOrg().GetTaxname(), " ", tmp, NStr::fSplit_Tokenize);
1689  list<CTempString>::const_iterator p = tmp.cbegin();
1690  if (p != tmp.cend()) {
1691  tax1 = *p;
1692  p++;
1693  if (p != tmp.cend()) {
1694  tax2 = *p;
1695  }
1696  }
1697  }
1698  string aut1, aut2;
1699  list<CTempString> tmp;
1700  NStr::Split(it->GetSubname(), " ", tmp, NStr::fSplit_Tokenize);
1701  list<CTempString>::const_iterator p = tmp.cbegin();
1702  if (p != tmp.cend()) {
1703  aut1 = *p;
1704  p++;
1705  if (p != tmp.cend()) {
1706  aut2 = *p;
1707  }
1708  }
1709  if (aut1 != tax1 || aut2 != tax2) {
1710  m_Objs["[n] biosource[s] [has] taxname/authority conflict"].Add(*context.BiosourceObjRef(*biosrc));
1711  }
1712  }
1713  }
1714  }
1715  }
1716 
1717 }
1718 
1719 
1720 // TRINOMIAL_SHOULD_HAVE_QUALIFIER
1721 
// Pairs an OrgMod subtype with the abbreviation that introduces its value
// inside a taxname; each abbreviation begins with a space.
// The TRINOMIAL_SHOULD_HAVE_QUALIFIER check scans this table in order and
// stops at the first abbreviation that is followed by text, so " f. sp." is
// listed before " f.", which would otherwise match inside it.
1722 static const pair<int, string> srcqual_keywords[] = {
1723  { COrgMod::eSubtype_forma_specialis, " f. sp." } ,
1724  { COrgMod::eSubtype_forma, " f." } ,
1725  { COrgMod::eSubtype_sub_species, " subsp." } ,
1726  { COrgMod::eSubtype_variety, " var." } ,
1727  { COrgMod::eSubtype_pathovar, " pv." }
1728 };
1729 
1731 
1732 static string GetSrcQual(const CBioSource& bs, int qual)
1733 {
1734  if (bs.GetOrg().CanGetOrgname() && bs.GetOrg().GetOrgname().CanGetMod()) {
1735  for (const auto& it : bs.GetOrg().GetOrgname().GetMod()) {
1736  if (it->CanGetSubtype() && it->GetSubtype() == qual) {
1737  return it->GetSubname();
1738  }
1739  }
1740  }
1741  return kEmptyStr;
1742 }
1743 
1744 
1745 DISCREPANCY_CASE(TRINOMIAL_SHOULD_HAVE_QUALIFIER, BIOSRC, eDisc | eOncaller | eSmart, "Trinomial sources should have corresponding qualifier")
1746 {
1747  for (const CBioSource* biosrc : context.GetBiosources()) {
1748  if (biosrc->IsSetOrg() && biosrc->GetOrg().CanGetTaxname() && biosrc->GetOrg().GetTaxname().length() && NStr::FindNoCase(biosrc->GetOrg().GetTaxname(), " x ") == NPOS && !CDiscrepancyContext::HasLineage(*biosrc, context.GetLineage(), "Viruses")) {
1749  const string& taxname = biosrc->GetOrg().GetTaxname();
1750  for (size_t i = 0; i < srcqual_keywords_sz; i++) {
1751  size_t n = NStr::FindNoCase(taxname, srcqual_keywords[i].second);
1752  if (n != NPOS) {
1753  for (n += srcqual_keywords[i].second.length(); n < taxname.length(); n++) {
1754  if (taxname[n] != ' ') {
1755  break;
1756  }
1757  }
1758  if (n < taxname.length()) {
1759  string q = GetSrcQual(*biosrc, srcqual_keywords[i].first);
1760  string s = taxname.substr(n, q.length());
1761  if (!q.length() || NStr::CompareNocase(s, q)) {
1762  m_Objs["[n] trinomial source[s] lack[S] corresponding qualifier"].Add(*context.BiosourceObjRef(*biosrc));
1763  }
1764  break;
1765  }
1766  }
1767  }
1768  }
1769  }
1770 }
1771 
1772 
1773 // AMPLIFIED_PRIMERS_NO_ENVIRONMENTAL_SAMPLE
1774 
1775 DISCREPANCY_CASE(AMPLIFIED_PRIMERS_NO_ENVIRONMENTAL_SAMPLE, BIOSRC, eOncaller, "Species-specific primers, no environmental sample")
1776 {
1777  for (const CBioSource* biosrc : context.GetBiosources()) {
1778  if (!biosrc->HasSubtype(CSubSource::eSubtype_environmental_sample)) {
1779  bool has_primer_note = false;
1780  if (biosrc->CanGetSubtype()) {
1781  for (const auto& it : biosrc->GetSubtype()) {
1782  if (it->GetSubtype() == CSubSource::eSubtype_other && NStr::FindNoCase(it->GetName(), "amplified with species-specific primers") != NPOS) {
1783  has_primer_note = true;
1784  break;
1785  }
1786  }
1787  }
1788  if (!has_primer_note && biosrc->IsSetOrg() && biosrc->GetOrg().CanGetOrgname() && biosrc->GetOrg().GetOrgname().CanGetMod()) {
1789  for (const auto& it : biosrc->GetOrg().GetOrgname().GetMod()) {
1790  if (it->CanGetSubtype() && it->GetSubtype() == COrgMod::eSubtype_other && it->IsSetSubname() && NStr::FindNoCase(it->GetSubname(), "amplified with species-specific primers") != NPOS) {
1791  has_primer_note = true;
1792  break;
1793  }
1794  }
1795  }
1796  if (has_primer_note) {
1797  m_Objs["[n] biosource[s] [has] \'amplified with species-specific primers\' note but no environmental-sample qualifier."].Add(*context.BiosourceObjRef(*biosrc, true));
1798  }
1799  }
1800  }
1801 }
1802 
1803 
// Body of the helper that the AMPLIFIED_PRIMERS_NO_ENVIRONMENTAL_SAMPLE
// autofix calls as SetEnvSampleFixAmplifiedPrimers(<BioSource>). The visible
// part removes a stray '[' before or ']' after the text
// "amplified with species-specific primers" in "other" subsource notes and
// returns true when anything was changed.
// NOTE(review): the signature (listing line 1804) and listing lines 1807-1808
// (the statement that sets `change` before the loop) are missing from this
// extract.
1805 {
1806  bool change = false;
1809  change = true;
1810  }
1811  for (auto& s : src.SetSubtype()) {
1812  if (s->GetSubtype() == CSubSource::eSubtype_other && s->IsSetName()) {
1813  const string orig = s->GetName();
1814  NStr::ReplaceInPlace(s->SetName(), "[amplified with species-specific primers", "amplified with species-specific primers");
1815  NStr::ReplaceInPlace(s->SetName(), "amplified with species-specific primers]", "amplified with species-specific primers");
// Stops at the first note that was actually modified; later notes are not
// examined.
1816  if (!NStr::Equal(orig, s->GetName())) {
1817  change = true;
1818  break;
1819  }
1820  }
1821  }
1822 
1823  return change;
1824 }
1825 
1826 
1827 DISCREPANCY_AUTOFIX(AMPLIFIED_PRIMERS_NO_ENVIRONMENTAL_SAMPLE)
1828 {
1829  const CSeq_feat* feat = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
1830  const CSeqdesc* desc = dynamic_cast<const CSeqdesc*>(context.FindObject(*obj));
1831  if (feat) {
1832  if (SetEnvSampleFixAmplifiedPrimers(const_cast<CSeq_feat*>(feat)->SetData().SetBiosrc())) {
1833  obj->SetFixed();
1834  return CRef<CAutofixReport>(new CAutofixReport("AMPLIFIED_PRIMERS_NO_ENVIRONMENTAL_SAMPLE: Set environmental_sample, fixed amplified primers note for [n] source[s]", 1));
1835  }
1836  }
1837  if (desc) {
1838  if (SetEnvSampleFixAmplifiedPrimers(const_cast<CSeqdesc*>(desc)->SetSource())) {
1839  obj->SetFixed();
1840  return CRef<CAutofixReport>(new CAutofixReport("AMPLIFIED_PRIMERS_NO_ENVIRONMENTAL_SAMPLE: Set environmental_sample, fixed amplified primers note for [n] source[s]", 1));
1841  }
1842  }
1843  return CRef<CAutofixReport>();
1844 }
1845 
1846 
1847 // MISSING_PRIMER
1848 
1849 DISCREPANCY_CASE(MISSING_PRIMER, BIOSRC, eOncaller, "Missing values in primer set")
1850 {
1851  for (const CBioSource* biosrc : context.GetBiosources()) {
1852  if (biosrc->CanGetPcr_primers() && biosrc->GetPcr_primers().CanGet()) {
1853  bool report = false;
1854  for (const auto& pr : biosrc->GetPcr_primers().Get()) {
1855  if (pr->CanGetForward() != pr->CanGetReverse()) {
1856  report = true;
1857  break;
1858  }
1859  if (pr->CanGetForward()) {
1860  const CPCRPrimerSet& fwdset = pr->GetForward();
1861  const CPCRPrimerSet& revset = pr->GetReverse();
1862  CPCRPrimerSet::Tdata::const_iterator fwd = fwdset.Get().cbegin();
1863  CPCRPrimerSet::Tdata::const_iterator rev = revset.Get().cbegin();
1864  while (fwd != fwdset.Get().cend() && rev != revset.Get().cend()) {
1865  if (((*fwd)->CanGetName() && !(*fwd)->GetName().Get().empty()) != ((*rev)->CanGetName() && !(*rev)->GetName().Get().empty()) || ((*fwd)->CanGetSeq() && !(*fwd)->GetSeq().Get().empty()) != ((*rev)->CanGetSeq() && !(*rev)->GetSeq().Get().empty())) {
1866  report = true;
1867  break;
1868  }
1869  fwd++;
1870  rev++;
1871  }
1872  if (report) {
1873  break;
1874  }
1875  }
1876  }
1877  if (report) {
1878  m_Objs["[n] biosource[s] [has] primer set[s] with missing values"].Add(*context.BiosourceObjRef(*biosrc));
1879  }
1880  }
1881  }
1882 }
1883 
1884 
1885 // DUPLICATE_PRIMER_SET
1886 
1888 {
1889  size_t count_a = a.size();
1890  size_t count_b = b.size();
1891  if (count_a != count_b) {
1892  return false;
1893  }
1894  for (CPCRPrimerSet::Tdata::const_iterator it = a.cbegin(); it != a.cend(); it++) {
1895  CPCRPrimerSet::Tdata::const_iterator jt = b.cbegin();
1896  for (; jt != b.cend(); jt++) {
1897  if ((*it)->CanGetName() == (*jt)->CanGetName() && (*it)->CanGetSeq() == (*jt)->CanGetSeq()
1898  && (!(*it)->CanGetName() || (*it)->GetName().Get() == (*jt)->GetName().Get())
1899  && (!(*it)->CanGetSeq() || (*it)->GetSeq().Get() == (*jt)->GetSeq().Get())) {
1900  break;
1901  }
1902  }
1903  if (jt == b.cend()) {
1904  return false;
1905  }
1906  }
1907  return true;
1908 }
1909 
1910 
1911 static bool inline FindDuplicatePrimers(const CPCRReaction& a, const CPCRReaction& b)
1912 {
1913  return a.CanGetForward() == b.CanGetForward() && a.CanGetReverse() == b.CanGetReverse()
1914  && (!a.CanGetForward() || EqualPrimerSets(a.GetForward().Get(), b.GetForward().Get()))
1915  && (!a.CanGetReverse() || EqualPrimerSets(a.GetReverse().Get(), b.GetReverse().Get()));
1916 }
1917 
1918 
1919 DISCREPANCY_CASE(DUPLICATE_PRIMER_SET, BIOSRC, eOncaller, "Duplicate PCR primer pair")
1920 {
1921  for (const CBioSource* biosrc : context.GetBiosources()) {
1922  if (biosrc->CanGetPcr_primers() && biosrc->GetPcr_primers().CanGet()) {
1923  bool done = false;
1924  const CPCRReactionSet::Tdata data = biosrc->GetPcr_primers().Get();
1925  for (CPCRReactionSet::Tdata::const_iterator it = data.cbegin(); !done && it != data.cend(); it++) {
1926  CPCRReactionSet::Tdata::const_iterator jt = it;
1927  for (jt++; !done && jt != data.cend(); jt++) {
1928  if (FindDuplicatePrimers(**it, **jt)) {
1929  m_Objs["[n] BioSource[s] [has] duplicate primer pairs."].Add(*context.BiosourceObjRef(*biosrc));
1930  done = true;
1931  }
1932  }
1933  }
1934  }
1935  }
1936 }
1937 
1938 
1939 
1940 // METAGENOMIC
1941 
1942 DISCREPANCY_CASE(METAGENOMIC, BIOSRC, eDisc | eOncaller | eSmart, "Source has metagenomic qualifier")
1943 {
1944  for (const CBioSource* biosrc : context.GetBiosources()) {
1945  if (biosrc->CanGetSubtype()) {
1946  for (const auto& it : biosrc->GetSubtype()) {
1947  if (it->GetSubtype() == CSubSource::eSubtype_metagenomic) {
1948  m_Objs["[n] biosource[s] [has] metagenomic qualifier"].Add(*context.BiosourceObjRef(*biosrc));
1949  break;
1950  }
1951  }
1952  }
1953  }
1954 }
1955 
1956 
1957 // METAGENOME_SOURCE
1958 
1959 DISCREPANCY_CASE(METAGENOME_SOURCE, BIOSRC, eDisc | eOncaller | eSmart, "Source has metagenome_source qualifier")
1960 {
1961  for (const CBioSource* biosrc : context.GetBiosources()) {
1962  if (biosrc->IsSetOrg() && biosrc->GetOrg().CanGetOrgname() && biosrc->GetOrg().GetOrgname().CanGetMod() && biosrc->GetOrg().IsSetTaxname() && !biosrc->GetOrg().GetTaxname().empty()) {
1963  for (const auto& it : biosrc->GetOrg().GetOrgname().GetMod()) {
1964  if (it->CanGetSubtype() && it->GetSubtype() == COrgMod::eSubtype_metagenome_source) {
1965  m_Objs["[n] biosource[s] [has] metagenome_source qualifier"].Add(*context.BiosourceObjRef(*biosrc));
1966  break;
1967  }
1968  }
1969  }
1970  }
1971 }
1972 
1973 
1974 // DUP_SRC_QUAL
1975 
1976 static string GetOrgModName(const COrgMod& qual)
1977 {
1978  const COrgMod::TSubtype& subtype = qual.GetSubtype();
1979  return subtype == COrgMod::eSubtype_other ? "note-orgmod" : subtype == COrgMod::eSubtype_nat_host ? "host" : qual.GetSubtypeName(subtype, COrgMod::eVocabulary_raw);
1980 }
1981 
1982 
1983 static string GetSubtypeName(const CSubSource& qual)
1984 {
1985  const CSubSource::TSubtype& subtype = qual.GetSubtype();
1986  return subtype == CSubSource::eSubtype_other ? "note-subsrc" : qual.GetSubtypeName(subtype, CSubSource::eVocabulary_raw);
1987 }
1988 
1989 
// Top-level report label under which DUP_SRC_QUAL groups its per-value entries.
static const char* kDupSrc =
    "[n] source[s] [has] two or more qualifiers with the same value";
1991 
1992 
1993 DISCREPANCY_CASE1(DUP_SRC_QUAL, BIOSRC, eDisc | eOncaller | eSmart, "Each qualifier on a source should have different value",
1994  "DUP_SRC_QUAL_DATA")
1995 {
1996  for (const CBioSource* biosrc : context.GetBiosources()) {
1998  string collected_by;
1999  string identified_by;
2000  if (biosrc->CanGetSubtype()) {
2001  for (const auto& it : biosrc->GetSubtype()) {
2002  if (it->CanGetName()) {
2003  const string& s = it->GetName();
2004  if (it->CanGetSubtype()) {
2005  if (it->GetSubtype() == CSubSource::eSubtype_collected_by) {
2006  collected_by = s;
2007  }
2008  else if (it->GetSubtype() == CSubSource::eSubtype_identified_by) {
2009  identified_by = s;
2010  }
2011  }
2012  if (!s.empty()) {
2013  Map[s].push_back(GetSubtypeName(*it));
2014  }
2015  }
2016  }
2017  }
2018  if (biosrc->IsSetOrg() && biosrc->GetOrg().CanGetOrgname() && biosrc->GetOrg().GetOrgname().CanGetMod()) {
2019  for (const auto& it : biosrc->GetOrg().GetOrgname().GetMod()) {
2020  if (it->IsSetSubname()) {
2021  const string& s = it->GetSubname();
2022  if (it->CanGetSubtype()) {
2023  if (it->GetSubtype() == COrgMod::eSubtype_anamorph || it->GetSubtype() == COrgMod::eSubtype_common ||
2024  it->GetSubtype() == COrgMod::eSubtype_old_name || it->GetSubtype() == COrgMod::eSubtype_old_lineage ||
2025  it->GetSubtype() == COrgMod::eSubtype_gb_acronym || it->GetSubtype() == COrgMod::eSubtype_gb_anamorph ||
2026  it->GetSubtype() == COrgMod::eSubtype_gb_synonym) {
2027  continue;
2028  }
2029  }
2030  if (!s.empty()) {
2031  Map[s].push_back(GetOrgModName(*it));
2032  }
2033  }
2034  }
2035  }
2036  if (biosrc->IsSetOrg() && biosrc->GetOrg().CanGetTaxname()) {
2037  const string& s = biosrc->GetOrg().GetTaxname();
2038  if (!s.empty()) {
2039  Map[s].push_back("organism");
2040  }
2041  }
2042  if (biosrc->CanGetPcr_primers()) {
2043  for (const auto& it : biosrc->GetPcr_primers().Get()) {
2044  if (it->CanGetForward()) {
2045  for (const auto& pr : it->GetForward().Get()) {
2046  if (pr->CanGetName()) {
2047  Map[pr->GetName()].push_back("fwd-primer-name");
2048  }
2049  if (pr->CanGetSeq()) {
2050  Map[pr->GetSeq()].push_back("fwd-primer-seq");
2051  }
2052  }
2053  }
2054  if (it->CanGetReverse()) {
2055  for (const auto& pr : it->GetReverse().Get()) {
2056  if (pr->CanGetName()) {
2057  Map[pr->GetName()].push_back("rev-primer-name");
2058  }
2059  if (pr->CanGetSeq()) {
2060  Map[pr->GetSeq()].push_back("rev-primer-seq");
2061  }
2062  }
2063  }
2064  }
2065  }
2066  bool bad = false;
2067  for (const auto& it : Map) {
2068  if (it.second.size() > 1) {
2069  if (it.second.size() == 2 && it.first == collected_by && collected_by == identified_by) {
2070  continue; // there is no error if collected_by equals to identified_by
2071  }
2072  string s = "[n] biosource[s] [has] value\'";
2073  s += it.first;
2074  s += "\' for these qualifiers: ";
2075  for (size_t i = 0; i < it.second.size(); i++) {
2076  if (i) {
2077  s += ", ";
2078  }
2079  s += it.second[i];
2080  }
2081  m_Objs[kDupSrc][s].Add(*context.BiosourceObjRef(*biosrc));
2082  }
2083  }
2084  if (bad) {
2085  m_Objs[kDupSrc].Incr();
2086  }
2087  }
2088 }
2089 
2090 
2091 
2092 // UNUSUAL_ITS
2093 
2094 DISCREPANCY_CASE(UNUSUAL_ITS, SEQUENCE, eDisc | eOncaller, "Test Bioseqs for unusual rRNA / ITS")
2095 {
2096  const CSeqdesc* biosrc = context.GetBiosource();
2097  if (context.HasLineage(biosrc ? &biosrc->GetSource() : nullptr, "Microsporidia")) {
2098  bool has_unusual = false;
2099  for (const CSeq_feat& feat : context.GetFeat()) {
2100  if (feat.IsSetComment() && feat.IsSetData() && feat.GetData().IsRna()) {
2101  const CRNA_ref& rna = feat.GetData().GetRna();
2102  if (rna.IsSetType() && rna.GetType() == CRNA_ref::eType_miscRNA) {
2103  if (NStr::StartsWith(feat.GetComment(), "contains", NStr::eNocase)) {
2104  has_unusual = true;
2105  break;
2106  }
2107  }
2108  }
2109  }
2110  if (has_unusual) {
2111  m_Objs["[n] Bioseq[s] [has] unusual rRNA / ITS"].Add(*context.BioseqObjRef());
2112  }
2113  }
2114 }
2115 
2116 
2117 // SARS_QUALS
2118 
2119 #define SARS_TAX_ID 2697049
2120 
2121 DISCREPANCY_CASE(SARS_QUALS, BIOSRC, eOncaller, "SARS-CoV-2 isolate must have correct format")
2122 {
2123  for (const CBioSource* biosrc : context.GetBiosources()) {
2124  if (biosrc->IsSetOrg() && biosrc->GetOrg().CanGetDb()) {
2125  bool sars = false;
2126  for (const auto& db : biosrc->GetOrg().GetDb()) {
2127  if (db->CanGetTag() && db->GetTag().IsId() && db->GetTag().GetId() == SARS_TAX_ID && db->CanGetDb() && db->GetDb() == "taxon") {
2128  sars = true;
2129  break;
2130  }
2131  }
2132  if (sars) {
2133  string isolate;
2134  bool good = true;
2135  if (biosrc->IsSetOrg() && biosrc->GetOrg().IsSetOrgname() && biosrc->GetOrg().GetOrgname().IsSetMod()) {
2136  for (const auto& m : biosrc->GetOrg().GetOrgname().GetMod()) {
2137  if (m->IsSetSubtype() && m->GetSubtype() == COrgMod::eSubtype_isolate && m->IsSetSubname()) {
2138  isolate = m->GetSubname();
2139  break;
2140  }
2141  }
2142  }
2143  if (!isolate.length()) {
2144  isolate = "no isolate";
2145  good = false;
2146  }
2147  if (good && isolate.find("SARS-CoV-2") != 0) {
2148  good = false;
2149  }
2150  string year;
2151  if (good) {
2152  vector<string> arr;
2153  NStr::SplitByPattern(isolate, "/", arr);
2154  if (arr.size() == 5) {
2155  year = arr[4];
2156  }
2157  else {
2158  good = false;
2159  }
2160  }
2161  if (good) {
2162  string date, date0, date1;
2163  if (biosrc->IsSetSubtype()) {
2164  for (const auto& it : biosrc->GetSubtype()) {
2165  if (it->IsSetSubtype() && it->GetSubtype() == CSubSource::eSubtype_collection_date && it->IsSetName()) {
2166  date = it->GetName();
2167  break;
2168  }
2169  }
2170  }
2171  if (date.length() >= 4) {
2172  date0 = date.substr(0, 4);
2173  date1 = date.substr(date.length() - 4);
2174  }
2175  good = year == date0 || year == date1;
2176  }
2177  if (!good) {
2178  m_Objs["[n] SARS-CoV-2 biosource[s] [has] wrong isolate format"][isolate].Add(*context.BiosourceObjRef(*biosrc));
2179  }
2180  }
2181  }
2182  }
2183 }
2184 
2185 
User-defined methods of the data storage class.
USING_SCOPE(objects)
static const CDbtag * GetTaxonTag(const COrg_ref &org)
list< string > TInconsistecyDescriptionList
static bool IsSameDb(const COrg_ref::TDb &first, const COrg_ref::TDb &second)
static bool IsMissingRequiredClone(const CBioSource &biosource)
static bool SetCultureCollectionFromStrain(CBioSource &src)
static bool IsSameSubtype(const CBioSource::TSubtype &first, const CBioSource::TSubtype &second)
static void GetOrgrefDifferences(const COrg_ref &first_org, const COrg_ref &second_org, TInconsistecyDescriptionList &diffs)
static bool MatchExceptSpaceColon(const string &a, const string &b)
const string kAmplifiedWithSpeciesSpecificPrimers
const string kMissingViralQualsTop
static bool SetEnvSampleFixAmplifiedPrimers(CBioSource &src)
static bool FindDuplicatePrimers(const CPCRReaction &a, const CPCRReaction &b)
static void GetBiosourceDifferences(const CBioSource &first_biosrc, const CBioSource &second_biosrc, TInconsistecyDescriptionList &diffs)
static bool ChangeCountryColonToComma(CBioSource &src)
bool HasStrainForATCCCultureCollection(const COrgName::TMod &mods, const string &culture_collection)
static bool EqualPrimerSets(const CPCRPrimerSet::Tdata &a, const CPCRPrimerSet::Tdata &b)
bool HasCultureCollectionForATCCStrain(const COrgName::TMod &mods, const string &strain)
static bool FixHumanHost(CBioSource &src)
static bool OrgDiffers(const COrg_ref &first, const COrg_ref &second)
static CRef< CTaxon3_reply > GetOrgRefs(vector< CRef< COrg_ref >> &orgs)
static const size_t srcqual_keywords_sz
#define SARS_TAX_ID
static const char * kDupSrc
static string GetOrgModName(const COrgMod &qual)
static void GetOrgnameDifferences(const COrgName &first, const COrgName &second, TInconsistecyDescriptionList &diffs)
static string GetSubtypeName(const CSubSource &qual)
static string GetSrcQual(const CBioSource &bs, int qual)
bool HasAmplifiedWithSpeciesSpecificPrimerNote(const CBioSource &src)
static const pair< int, string > srcqual_keywords[]
static bool RemoveCountryColon(CBioSource &src)
#define bool
Definition: bool.h:34
bool HasSubtype(CSubSource::TSubtype subtype) const
Definition: BioSource.cpp:2040
bool IsNa(void) const
Definition: Bioseq.cpp:345
bool(T::* TIsSetFn)() const
int(T::* TGetIntFn)() const
static bool IsEqualVal(const T &first, const T &second, TIsSetFn is_set_fn, TGetRFn get_fn, const R &empty_val)
const R &(T::* TGetRFn)() const
static bool IsEqualInt(const T &first, const T &second, TIsSetFn is_set_fn, TGetIntFn get_fn, int not_set)
Definition: Dbtag.hpp:53
static bool HasLineage(const CBioSource &biosrc, const string &def_lineage, const string &type)
@OrgMod.hpp User-defined methods of the data storage class.
Definition: OrgMod.hpp:54
@ eVocabulary_raw
Definition: OrgMod.hpp:68
static string GetSubtypeName(TSubtype stype, EVocabulary vocabulary=eVocabulary_raw)
Definition: OrgMod.cpp:108
bool IsSetOrgMod(void) const
Definition: Org_ref.cpp:169
CPCRPrimerSet –.
CPCRReaction –.
Definition: PCRReaction.hpp:66
@RNA_ref.hpp User-defined methods of the data storage class.
Definition: RNA_ref.hpp:54
CRegexp –.
Definition: regexp.hpp:70
virtual vector< CRef< CReportItem > > GetSubitems() const =0
static void Add(TReportObjectList &list, TReportObjectSet &hash, CReportObj &obj, bool unique=true)
CReportNode & Ext(bool b=true)
TNodeMap & GetMap()
CRef< CReportItem > Export(CDiscrepancyCore &test, bool unique=true) const
CScope –.
Definition: scope.hpp:92
CSeq_feat_EditHandle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
static bool HasCultureNotes(const string &value)
Definition: SubSource.cpp:5128
@ eVocabulary_raw
Definition: SubSource.hpp:79
static string GetSubtypeName(CSubSource::TSubtype stype, EVocabulary vocabulary=eVocabulary_raw)
Definition: SubSource.cpp:151
static CRef< CDate > DateFromCollectionDate(const string &str) THROWS((CException))
Definition: SubSource.cpp:253
virtual CRef< CTaxon3_reply > SendOrgRefList(const vector< CRef< COrg_ref > > &list, COrg_ref::fOrgref_parts result_parts=COrg_ref::eOrgref_default, fT3reply_parts t3result_parts=eT3reply_default)
Definition: taxon3.cpp:190
size_type size() const
Definition: map.hpp:148
Definition: map.hpp:338
#define T(s)
Definition: common.h:230
vector< CRef< CReportObj > > TReportObjectList
@ eFatal
@ eDisc
@ eOncaller
@ eSubmitter
@ eSmart
#define DISCREPANCY_AUTOFIX(name)
#define DISCREPANCY_CASE1(name, type, group, descr,...)
#define DISCREPANCY_CASE(name, type, group, descr)
#define DISCREPANCY_SUMMARIZE(name)
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
int GetSubtype(CFieldNamePanel *field_name_panel, string &ncRNA_class)
CRange< Position > Map(const CRange< Position > &target, const CRange< Position > &range)
Definition: blast_aux.cpp:826
constexpr size_t ArraySize(const Element(&)[Size])
Definition: ncbimisc.hpp:1532
string
Definition: cgiapp.hpp:687
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
#define MSerial_AsnBinary
Definition: serialbase.hpp:697
#define ENUM_METHOD_NAME(EnumName)
Definition: serialbase.hpp:994
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
CSeq_feat_Handle GetSeq_featHandle(const CSeq_feat &feat, EMissing action=eMissing_Default)
Definition: scope.cpp:200
void Replace(const CSeq_feat &new_feat) const
Replace the feature with new Seq-feat object.
bool IsMatch(CTempString str, TMatch flags=fMatch_default)
Check existence substring which match a specified pattern.
Definition: regexp.cpp:193
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static list< string > & SplitByPattern(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Variation of Split() with fSplit_ByPattern flag applied by default.
Definition: ncbistr.cpp:3503
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2751
#define kEmptyStr
Definition: ncbistr.hpp:123
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3457
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2989
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3197
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2887
static string Join(const TContainer &arr, const CTempString &delim)
Join strings using the specified delimiter.
Definition: ncbistr.hpp:2697
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5411
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5352
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5383
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3401
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
static const char label[]
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
Definition: BioSource_.hpp:539
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
TOrigin GetOrigin(void) const
Get the Origin member data.
Definition: BioSource_.hpp:472
const Tdata & Get(void) const
Get the member data.
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
list< CRef< CSubSource > > TSubtype
Definition: BioSource_.hpp:145
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: BioSource_.hpp:527
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
bool IsSetOrigin(void) const
Check if a value has been assigned to Origin data member.
Definition: BioSource_.hpp:447
const TAttrib & GetAttrib(void) const
Get the Attrib member data.
Definition: SubSource_.hpp:397
TSubtype GetSubtype(void) const
Get the Subtype member data.
Definition: SubSource_.hpp:310
bool IsSetGenome(void) const
Check if a value has been assigned to Genome data member.
Definition: BioSource_.hpp:397
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: SubSource_.hpp:291
void SetOrg(TOrg &value)
Assign a value to Org data member.
Definition: BioSource_.cpp:108
void SetName(const TName &value)
Assign a value to Name data member.
Definition: SubSource_.hpp:359
const TName & GetName(void) const
Get the Name member data.
Definition: SubSource_.hpp:350
bool IsSetAttrib(void) const
attribution/source of this name Check if a value has been assigned to Attrib data member.
Definition: SubSource_.hpp:385
bool IsSetIs_focus(void) const
to distinguish biological focus Check if a value has been assigned to Is_focus data member.
Definition: BioSource_.hpp:552
list< CRef< CPCRReaction > > Tdata
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
Definition: SubSource_.hpp:338
TSubtype & SetSubtype(void)
Assign a value to Subtype data member.
Definition: BioSource_.hpp:545
list< CRef< CPCRPrimer > > Tdata
@ eSubtype_collection_date
DD-MMM-YYYY format.
Definition: SubSource_.hpp:114
@ eSubtype_collected_by
name of person who collected the sample
Definition: SubSource_.hpp:115
@ eSubtype_environmental_sample
Definition: SubSource_.hpp:111
@ eSubtype_identified_by
name of person who identified the sample
Definition: SubSource_.hpp:116
bool IsSetYear(void) const
full year (including 1900) Check if a value has been assigned to Year data member.
Definition: Date_std_.hpp:407
bool IsStd(void) const
Check if variant Std is selected.
Definition: Date_.hpp:320
TYear GetYear(void) const
Get the Year member data.
Definition: Date_std_.hpp:426
const TStd & GetStd(void) const
Get the variant data.
Definition: Date_.cpp:109
ESource_qual
Access to ESource_qual's attributes (values, names) as defined in spec.
const TMod & GetMod(void) const
Get the Mod member data.
Definition: OrgName_.hpp:839
bool CanGetMod(void) const
Check if it is safe to call GetMod method.
Definition: OrgName_.hpp:833
bool IsSetDb(void) const
ids in taxonomic or culture dbases Check if a value has been assigned to Db data member.
Definition: Org_ref_.hpp:479
const TLineage & GetLineage(void) const
Get the Lineage member data.
Definition: OrgName_.hpp:864
TSubtype GetSubtype(void) const
Get the Subtype member data.
Definition: OrgMod_.hpp:307
const TDiv & GetDiv(void) const
Get the Div member data.
Definition: OrgName_.hpp:1005
const TAttrib & GetAttrib(void) const
Get the Attrib member data.
Definition: OrgMod_.hpp:394
TMgcode GetMgcode(void) const
Get the Mgcode member data.
Definition: OrgName_.hpp:965
E_Choice Which(void) const
Which variant is currently selected.
Definition: OrgName_.hpp:686
TGcode GetGcode(void) const
Get the Gcode member data.
Definition: OrgName_.hpp:918
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: OrgMod_.hpp:288
const TSubname & GetSubname(void) const
Get the Subname member data.
Definition: OrgMod_.hpp:347
bool IsSetCommon(void) const
common name Check if a value has been assigned to Common data member.
Definition: Org_ref_.hpp:407
bool IsSetLineage(void) const
lineage with semicolon separators Check if a value has been assigned to Lineage data member.
Definition: OrgName_.hpp:852
vector< CRef< CDbtag > > TDb
Definition: Org_ref_.hpp:101
const TName & GetName(void) const
Get the Name member data.
Definition: OrgName_.hpp:771
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
bool IsSetMgcode(void) const
mitochondrial genetic code Check if a value has been assigned to Mgcode data member.
Definition: OrgName_.hpp:946
const TCommon & GetCommon(void) const
Get the Common member data.
Definition: Org_ref_.hpp:419
const TDb & GetDb(void) const
Get the Db member data.
Definition: Org_ref_.hpp:491
const TSyn & GetSyn(void) const
Get the Syn member data.
Definition: Org_ref_.hpp:516
bool IsSetDiv(void) const
GenBank division code Check if a value has been assigned to Div data member.
Definition: OrgName_.hpp:993
bool IsSetMod(void) const
Check if a value has been assigned to Mod data member.
Definition: OrgName_.hpp:827
list< CRef< COrgMod > > TMod
Definition: OrgName_.hpp:332
bool IsSetAttrib(void) const
attribution of name Check if a value has been assigned to Attrib data member.
Definition: OrgName_.hpp:780
bool CanGetOrgname(void) const
Check if it is safe to call GetOrgname method.
Definition: Org_ref_.hpp:535
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
bool IsSetSubname(void) const
Check if a value has been assigned to Subname data member.
Definition: OrgMod_.hpp:335
bool IsSetTaxname(void) const
preferred formal name Check if a value has been assigned to Taxname data member.
Definition: Org_ref_.hpp:360
bool IsSetAttrib(void) const
attribution/source of name Check if a value has been assigned to Attrib data member.
Definition: OrgMod_.hpp:382
bool IsSetGcode(void) const
genetic code (see CdRegion) Check if a value has been assigned to Gcode data member.
Definition: OrgName_.hpp:899
bool IsSetSyn(void) const
synonyms for taxname or common Check if a value has been assigned to Syn data member.
Definition: Org_ref_.hpp:504
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
Definition: OrgName_.hpp:759
list< string > TSyn
Definition: Org_ref_.hpp:102
const TAttrib & GetAttrib(void) const
Get the Attrib member data.
Definition: OrgName_.hpp:792
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
@ eSubtype_gb_acronym
used by taxonomy database
Definition: OrgMod_.hpp:115
@ eSubtype_gb_synonym
used by taxonomy database
Definition: OrgMod_.hpp:117
@ eSubtype_anamorph
Definition: OrgMod_.hpp:112
@ eSubtype_pathovar
Definition: OrgMod_.hpp:94
@ eSubtype_other
ASN5: old-name (254) will be added to next spec.
Definition: OrgMod_.hpp:125
@ eSubtype_authority
Definition: OrgMod_.hpp:107
@ eSubtype_sub_species
Definition: OrgMod_.hpp:105
@ eSubtype_nat_host
natural host of this specimen
Definition: OrgMod_.hpp:104
@ eSubtype_variety
Definition: OrgMod_.hpp:89
@ eSubtype_strain
Definition: OrgMod_.hpp:85
@ eSubtype_metagenome_source
Definition: OrgMod_.hpp:120
@ eSubtype_old_name
Definition: OrgMod_.hpp:124
@ eSubtype_specimen_voucher
Definition: OrgMod_.hpp:106
@ eSubtype_serotype
Definition: OrgMod_.hpp:90
@ eSubtype_bio_material
Definition: OrgMod_.hpp:119
@ eSubtype_gb_anamorph
used by taxonomy database
Definition: OrgMod_.hpp:116
@ eSubtype_culture_collection
Definition: OrgMod_.hpp:118
@ eSubtype_forma_specialis
Definition: OrgMod_.hpp:109
@ eSubtype_old_lineage
Definition: OrgMod_.hpp:123
@ eSubtype_isolate
Definition: OrgMod_.hpp:100
bool IsSetData(void) const
the specific data Check if a value has been assigned to Data data member.
Definition: Seq_feat_.hpp:913
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
bool IsBiosrc(void) const
Check if variant Biosrc is selected.
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
bool IsSource(void) const
Check if variant Source is selected.
Definition: Seqdesc_.hpp:1190
TSource & SetSource(void)
Select the variant.
Definition: Seqdesc_.cpp:572
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
where boath are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is orig
int i
yy_size_t n
int len
unique_ptr< CLocalTaxon > tax1
unsigned int a
Definition: ncbi_localip.c:102
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
static char tmp[2048]
Definition: utf8.c:42
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
Definition: pcre_exec.c:513
CRef< objects::CObjectManager > om
done
Definition: token1.c:1
C++ wrappers for the Perl-compatible regular expression (PCRE) library.
Modified on Thu Dec 07 10:06:06 2023 by modify_doxy.py rev. 669887