NCBI C++ ToolKit
sequence_tests.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: sequence_tests.cpp 99169 2023-02-21 20:00:56Z foleyjp $
2  * =========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * =========================================================================
25  *
26  * Authors: Colleen Bollin, based on similar discrepancy tests
27  *
28  */
29 
30 #include <ncbi_pch.hpp>
32 #include <objects/seq/MolInfo.hpp>
34 #include <objmgr/seqdesc_ci.hpp>
35 #include <objmgr/feat_ci.hpp>
36 #include <objmgr/seq_vector.hpp>
37 #include <objmgr/bioseq_ci.hpp>
41 #include <objects/seq/Seq_ext.hpp>
51 #include <objects/pub/Pub.hpp>
57 #include <util/xregexp/regexp.hpp>
59 
60 #include "discrepancy_core.hpp"
61 #include "utils.hpp"
63 
64 
68 
69 
70 // DUP_DEFLINE
71 
// Report labels for the DUP_DEFLINE test. kIdenticalDeflines and
// kSomeIdenticalDeflines are used by the summary code visible below; the uses
// of kUniqueDeflines and kAllUniqueDeflines are not visible in this listing.
72 const string kUniqueDeflines = "[n] definition line[s] [is] unique";
73 const string kIdenticalDeflines = "[n] definition line[s] [is] identical";
74 const string kAllUniqueDeflines = "All deflines are unique";
75 const string kSomeIdenticalDeflines = "Defline Problem Report";
76 
77 
78 DISCREPANCY_CASE(DUP_DEFLINE, SEQUENCE, eOncaller, "Definition lines should be unique")
79 {
80  const CBioseq& bioseq = context.CurrentBioseq();
81  if (bioseq.IsAa()) {
82  return;
83  }
84 
85  auto bsh = context.GetBioseqHandle(bioseq);
86  if (bsh) {
87  sequence::CDeflineGenerator deflineGenerator;
88  auto defline = deflineGenerator.GenerateDefline(bsh, 0);
89  if (!NStr::IsBlank(defline)) {
90  if (bioseq.IsSetDescr() && bioseq.GetDescr().IsSet()) {
91  for (const auto& desc : context.GetSeqdesc()) {
92  if (desc.IsTitle()) {
93  m_Objs[defline].Add(*context.SeqdescObjRef(desc));
94  return;
95  }
96  }
97  }
98  m_Objs[defline].Add(*context.BioseqObjRef());
99  }
100  }
101 }
102 
103 
// Body of the DUP_DEFLINE summary step: walks the per-defline groups in
// m_Objs and files every group with more than one object under
// kSomeIdenticalDeflines / kIdenticalDeflines + "[*<defline>*]".
// NOTE(review): this listing is missing the line that opens the definition
// (embedded line 104), the declaration of `tmp` (110), the body of the
// size() == 1 branch (114) and a statement after tmp.clear() (123).
// Confirm against the repository before editing.
105 {
106  if (m_Objs.empty()) {
107  return;
108  }
109  bool all_unique = true;
111  for (auto& it : m_Objs.GetMap()) {
112  TReportObjectList& list = it.second->GetObjects();
113  if (list.size() == 1) {
115  }
116  else if (list.size() > 1) {
// Two or more objects share this defline: report them as identical.
117  tmp[kSomeIdenticalDeflines][kIdenticalDeflines + "[*" + it.first + "*]"].Add(list);
118  all_unique = false;
119  }
120  }
121  if (all_unique) {
122  tmp.clear();
124  }
125  m_ReportItems = tmp.Export(*this)->GetSubitems();
126 }
127 
128 
129 // TERMINAL_NS
130 
131 DISCREPANCY_CASE(TERMINAL_NS, SEQUENCE, eDisc | eSmart | eBig | eFatal, "Ns at end of sequences")
132 {
133  const CBioseq& bioseq = context.CurrentBioseq();
134  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
135  const CSeqSummary& sum = context.GetSeqSummary();
136  if (sum.StartsWithGap || sum.EndsWithGap) {
137  m_Objs["[n] sequence[s] [has] terminal Ns"].Fatal().Add(*context.BioseqObjRef());
138  }
139  }
140 }
141 
142 
143 // SHORT_PROT_SEQUENCES
144 
// SHORT_PROT_SEQUENCES: looks at protein bioseqs whose instance length is set
// and below 50, fetches the MolInfo descriptor and reports the bioseq.
// NOTE(review): embedded listing line 150 is missing here. It presumably holds
// the condition on `molinfo` that opens the extra brace closed at line 152
// (the test description mentions partial proteins). Confirm against the
// repository.
145 DISCREPANCY_CASE(SHORT_PROT_SEQUENCES, SEQUENCE, eDisc | eOncaller | eSmart, "Protein sequences should be at least 50 aa, unless they are partial")
146 {
147  const CBioseq& bioseq = context.CurrentBioseq();
148  if (bioseq.CanGetInst() && bioseq.GetInst().IsAa() && bioseq.GetInst().IsSetLength() && bioseq.GetInst().GetLength() < 50) {
149  const CSeqdesc* molinfo = context.GetMolinfo();
151  m_Objs["[n] protein sequences are shorter than 50 aa."].Add(*context.BioseqObjRef(), false);
152  }
153  }
154 }
155 
156 
157 // COMMENT_PRESENT
158 
159 DISCREPANCY_CASE(COMMENT_PRESENT, DESC, eOncaller, "Comment descriptor present")
160 {
161  for (const auto& desc : context.GetSeqdesc()) {
162  if (desc.IsComment()) {
163  m_Objs[desc.GetComment()].Add(*context.SeqdescObjRef(desc));
164  }
165  }
166 }
167 
168 
169 DISCREPANCY_SUMMARIZE(COMMENT_PRESENT)
170 {
171  if (!m_Objs.empty()) {
172  CReportNode rep;
173  string label = m_Objs.GetMap().size() == 1 ? "[n] comment descriptor[s] were found (all same)" : "[n] comment descriptor[s] were found (some different)";
174  for (auto it : m_Objs.GetMap()) {
175  for (auto obj : it.second->GetObjects()) {
176  rep[label].Add(*obj);
177  }
178  }
179  m_ReportItems = rep.Export(*this)->GetSubitems();
180  }
181 }
182 
183 
184 // MRNA_ON_WRONG_SEQUENCE_TYPE
185 
// MRNA_ON_WRONG_SEQUENCE_TYPE: for a DNA bioseq whose MolInfo biomol is
// genomic and whose BioSource passes the checks below, every mRNA feature
// returned by context.FeatMRNAs() is reported.
// NOTE(review): part of the BioSource condition is missing from this listing
// (embedded line numbers jump from 197 to 200). Confirm the full condition
// against the repository before relying on this description.
186 DISCREPANCY_CASE(MRNA_ON_WRONG_SEQUENCE_TYPE, SEQUENCE, eDisc | eOncaller, "Eukaryotic sequences that are not genomic or macronuclear should not have mRNA features")
187 {
188  const CBioseq& bioseq = context.CurrentBioseq();
// Only DNA instances are considered.
189  if (!bioseq.IsSetInst() || !bioseq.GetInst().IsSetMol() || bioseq.GetInst().GetMol() != CSeq_inst::eMol_dna) {
190  return;
191  }
192  const CSeqdesc* molinfo = context.GetMolinfo();
// Only bioseqs with a MolInfo whose biomol is genomic are considered.
193  if (!molinfo || !molinfo->GetMolinfo().IsSetBiomol() || molinfo->GetMolinfo().GetBiomol() != CMolInfo::eBiomol_genomic) {
194  return;
195  }
196  const CSeqdesc* biosrc = context.GetBiosource();
197  if (!biosrc || !biosrc->GetSource().IsSetGenome() ||
200  !context.IsEukaryotic(&biosrc->GetSource())) {
201  return;
202  }
203  for (const CSeq_feat* feat : context.FeatMRNAs()) {
204  m_Objs["[n] mRNA[s] [is] located on eukaryotic sequence[s] that [does] not have genomic or plasmid source[s]"].Add(*context.SeqFeatObjRef(*feat));
205  }
206 }
207 
208 
209 // DISC_GAPS
210 
211 const string kSequencesWithGaps = "[n] sequence[s] contain[S] gaps";
212 
213 DISCREPANCY_CASE(GAPS, SEQUENCE, eDisc | eSubmitter | eSmart | eBig, "Sequences with gaps")
214 {
215  const CBioseq& bioseq = context.CurrentBioseq();
216  if (bioseq.CanGetInst() && bioseq.GetInst().IsSetRepr() && bioseq.GetInst().GetRepr() == CSeq_inst::eRepr_delta) {
217  const CSeqSummary& sum = context.GetSeqSummary();
218  bool has_gaps = !!sum.Gaps;
219  if (!has_gaps) {
220  const CSeq_annot* annot = nullptr;
221  for (auto it : bioseq.GetAnnot()) {
222  if (it->IsFtable()) {
223  annot = it;
224  break;
225  }
226  }
227  if (annot) {
228  for (const auto& feat : annot->GetData().GetFtable()) {
229  if (feat->IsSetData() && feat->GetData().GetSubtype() == CSeqFeatData::eSubtype_gap) {
230  has_gaps = true;
231  break;
232  }
233  }
234  }
235  }
236  if (has_gaps) {
237  m_Objs[kSequencesWithGaps].Add(*context.BioseqObjRef());
238  }
239  }
240 }
241 
242 
243 // BIOPROJECT_ID
244 
245 DISCREPANCY_CASE(BIOPROJECT_ID, SEQUENCE, eOncaller, "Sequences with BioProject IDs")
246 {
247  const CBioseq& bioseq = context.CurrentBioseq();
248  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
249  for (const auto& desc : context.GetAllSeqdesc()) {
250  if (desc.IsUser()) {
251  const CUser_object& user = desc.GetUser();
252  if (user.IsSetData() && user.IsSetType() && user.GetType().IsStr() && user.GetType().GetStr() == "DBLink") {
253  for (const auto& user_field : user.GetData()) {
254  if (user_field->IsSetLabel() && user_field->GetLabel().IsStr() && user_field->GetLabel().GetStr() == "BioProject" && user_field->IsSetData() && user_field->GetData().IsStrs()) {
255  const CUser_field::C_Data::TStrs& strs = user_field->GetData().GetStrs();
256  if (!strs.empty() && !strs[0].empty()) {
257  m_Objs["[n] sequence[s] contain[S] BioProject IDs"].Add(*context.BioseqObjRef());
258  return;
259  }
260  }
261  }
262  }
263  }
264  }
265  }
266 }
267 
268 
269 // MISSING_DEFLINES
270 
271 DISCREPANCY_CASE(MISSING_DEFLINES, SEQUENCE, eAll, "Missing definition lines")
272 {
273  const CBioseq& bioseq = context.CurrentBioseq();
274  if (bioseq.CanGetInst() && !bioseq.GetInst().IsAa() && !context.GetTitle()) {
275  m_Objs["[n] bioseq[s] [has] no definition line"].Add(*context.BioseqObjRef());
276  }
277 }
278 
279 
280 
281 // N_RUNS_14
282 
283 DISCREPANCY_CASE(N_RUNS_14, SEQUENCE, eDisc | eTSA, "Runs of more than 14 Ns")
284 {
285  const CBioseq& bioseq = context.CurrentBioseq();
286  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
287  const CSeqSummary& sum = context.GetSeqSummary();
288  if (sum.MaxN > 14) {
289  m_Objs["[n] sequence[s] [has] runs of 15 or more Ns"].Add(*context.BioseqObjRef());
290  }
291  }
292 }
293 
294 
295 // EXTERNAL_REFERENCE
296 
297 DISCREPANCY_CASE(EXTERNAL_REFERENCE, SEQUENCE, eDisc | eOncaller, "Sequence has external reference")
298 {
299  const CBioseq& bioseq = context.CurrentBioseq();
300  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
301  const CSeqSummary& sum = context.GetSeqSummary();
302  if (sum.HasRef) {
303  m_Objs["[n] sequence[s] [has] external references"].Add(*context.BioseqObjRef());
304  }
305  }
306 }
307 
308 
309 // 10_PERCENTN
310 
311 DISCREPANCY_CASE0(TEN_PERCENTN, "10_PERCENTN", SEQUENCE, eDisc | eSubmitter | eSmart | eTSA, "Greater than 10 percent Ns")
312 {
313  const double MIN_N_PERCENTAGE = 10.0;
314 
315  const CBioseq& bioseq = context.CurrentBioseq();
316  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
317  const CSeqSummary& sum = context.GetSeqSummary();
318  if (!sum.HasRef && sum.N * 100. / sum.Len > MIN_N_PERCENTAGE) {
319  m_Objs["[n] sequence[s] [has] > 10% Ns"].Add(*context.BioseqObjRef());
320  }
321  }
322 }
323 
324 
325 // FEATURE_COUNT
326 
327 DISCREPANCY_CASE(FEATURE_COUNT, FEAT, eOncaller | eSubmitter | eSmart, "Count features present or missing from sequences")
328 {
329  // context.SetGui(true); // for debug only!
330  for (const auto& feat : context.GetFeat()) {
331  if (!feat.IsSetData() || feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_prot) {
332  continue;
333  }
334  string key = feat.GetData().IsGene() ? "gene" : feat.GetData().GetKey(CSeqFeatData::eVocabulary_genbank);
335  m_Objs[key + ": [n] present"].Info().Incr();
336  }
337  if (CDiscrepancySet::IsGui() && context.IsBioseq()) {
338  const CBioseq& bioseq = context.CurrentBioseq();
339  bool na = false;
340  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
341  na = true;
342  }
343  CRef<CReportObj> rep(context.BioseqObjRef());
344  for (const auto& feat : context.GetAllFeat()) {
345  if (!feat.IsSetData() || feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_prot) {
346  continue;
347  }
348  string key = feat.GetData().IsGene() ? "gene" : feat.GetData().GetKey(CSeqFeatData::eVocabulary_genbank);
349  key = to_string(feat.GetData().GetSubtype()) + " " + key;
350  m_Objs[kEmptyCStr][key].Add(*rep, false);
351  }
352  m_Objs[kEmptyCStr][na ? "N" : "A"].Add(*rep);
353  }
354 }
355 
356 
// Summary for FEATURE_COUNT. In GUI mode, for every "<subtype> <key>" entry
// under the empty-string node (the "N" and "A" marker entries are skipped) it
// builds a per-bioseq occurrence count and files each bioseq under
// "<key>: [n] present" / "[n] bioseq[s] [has] [(]<count>[)] <key> features".
// The empty-string node is then erased and xSummarize() is called.
// NOTE(review): embedded listing lines 366, 369 and 374 are missing. The
// conditions that open the braces closed at lines 373 and 378 (around the
// "N" and "A" loops) are therefore not visible. Confirm against the
// repository.
357 DISCREPANCY_SUMMARIZE(FEATURE_COUNT)
358 {
359  if (CDiscrepancySet::IsGui()) {
360  for (auto& it : m_Objs[kEmptyCStr].GetMap()) {
361  if (it.first == "N" || it.first == "A") {
362  continue;
363  }
// Strip the "<subtype> " prefix that the collection step put before the key.
364  size_t n = it.first.find(' ');
365  string key = it.first.substr(n + 1);
367  string label = key + ": [n] present";
368  map<CReportObj*, size_t> obj2num;
370  for (auto& obj : m_Objs[kEmptyStr]["N"].GetObjects()) {
371  obj2num[&*obj] = 0;
372  }
373  }
375  for (auto& obj : m_Objs[kEmptyStr]["A"].GetObjects()) {
376  obj2num[&*obj] = 0;
377  }
378  }
// Count how many times each bioseq was recorded for this feature key.
379  for (auto& obj : m_Objs[kEmptyStr][it.first].GetObjects()) {
380  obj2num[&*obj]++;
381  }
382  for (auto& pp : obj2num) {
383  m_Objs[label]["[n] bioseq[s] [has] [(]" + to_string(pp.second) + "[)] " + key + " features"].Info().Add(*pp.first);
384  }
385  }
386  m_Objs.GetMap().erase(kEmptyCStr);
387  }
388  xSummarize();
389 }
390 
391 
392 // EXON_ON_MRNA
393 
394 DISCREPANCY_CASE(EXON_ON_MRNA, SEQUENCE, eOncaller | eSmart, "mRNA sequences should not have exons")
395 {
396  const CSeqdesc* molinfo = context.GetMolinfo();
397  if (molinfo && molinfo->GetMolinfo().IsSetBiomol() && molinfo->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_mRNA) {
398  if (context.FeatExons().size()) {
399  m_Objs["[n] mRNA bioseq[s] [has] exon features"].Add(*context.BioseqObjRef(CDiscrepancyContext::eFixSet));
400  }
401  }
402 }
403 
404 
// Autofix for EXON_ON_MRNA: iterates the features of the reported bioseq and
// removes those whose subtype is exon. The report object is marked fixed, and
// a report with the number of removed exons is returned (a null report when
// nothing was removed).
// NOTE(review): embedded listing line 412 is missing. It presumably declares
// the edit handle `eh` inside the loop; confirm against the repository.
// NOTE(review): the result of dynamic_cast is dereferenced without a null
// check.
405 DISCREPANCY_AUTOFIX(EXON_ON_MRNA)
406 {
407  unsigned int n = 0;
408  const CBioseq* seq = dynamic_cast<const CBioseq*>(context.FindObject(*obj));
409  CBioseq_EditHandle handle = context.GetBioseqHandle(*seq);
410  CFeat_CI ci(handle);
411  while (ci) {
413  if (ci->IsSetData() && ci->GetData().GetSubtype() == CSeqFeatData::eSubtype_exon) {
414  eh = CSeq_feat_EditHandle(context.GetScope().GetSeq_featHandle(ci->GetMappedFeature()));
415  }
// The iterator is advanced before the feature is removed.
416  ++ci;
417  if (eh) {
418  eh.Remove();
419  n++;
420  }
421  }
422  obj->SetFixed();
423  return CRef<CAutofixReport>(n ? new CAutofixReport("EXON_ON_MRNA: [n] exon[s] removed", n) : nullptr);
424 }
425 
426 
427 // INCONSISTENT_MOLINFO_TECH
428 
429 DISCREPANCY_CASE(INCONSISTENT_MOLINFO_TECH, SEQUENCE, eDisc | eSmart, "Inconsistent Molinfo Techniques")
430 {
431  const CBioseq& bioseq = context.CurrentBioseq();
432  if (bioseq.CanGetInst() && !bioseq.GetInst().IsAa()) {
433  string moltype;
434  const CSeqdesc* molinfo = context.GetMolinfo();
435  if (molinfo) {
436  if (molinfo->GetMolinfo().IsSetTech()) {
437  m_Objs[to_string(molinfo->GetMolinfo().GetTech())].Add(*context.BioseqObjRef());
438  }
439  else {
440  m_Objs[kEmptyStr].Add(*context.BioseqObjRef());
441  }
442  }
443  }
444 }
445 
446 
447 static const string kInconsistentMolinfoTechSummary = "Molinfo Technique Report";
448 static const string kInconsistentMolinfoTech = "[n] Molinfo[s] [is] missing field technique";
449 
450 DISCREPANCY_SUMMARIZE(INCONSISTENT_MOLINFO_TECH)
451 {
452  if (m_Objs.empty()) {
453  return;
454  }
455 
456  CReportNode report;
457 
458  CReportNode::TNodeMap& the_map = m_Objs.GetMap();
459 
460  bool same = true;
461  string tech;
462 
463  size_t num_of_missing = 0,
464  num_of_bioseqs = 0;
465 
466  for (auto it : the_map) {
467  num_of_bioseqs += it.second->GetObjects().size();
468  if (it.first.empty()) {
469  num_of_missing += it.second->GetObjects().size();
470  continue;
471  }
472  if (tech.empty()) {
473  tech = it.first;
474  }
475  else if (tech != it.first) {
476  same = false;
477  }
478  }
479  string summary = kInconsistentMolinfoTechSummary + " (";
480  if (num_of_missing == num_of_bioseqs || (same && !num_of_missing)) {
481  return;
482  }
483  summary += num_of_missing ? "some missing, " : "all present, ";
484  summary += same ? "all same)" : "some different)";
485  if (num_of_missing) {
486  if (num_of_missing == num_of_bioseqs) {
487  report[summary].SetCount(num_of_missing);
488  }
489  else {
490  report[summary][kInconsistentMolinfoTech].Add(the_map[kEmptyStr]->GetObjects());
491  }
492  }
493 
494  m_ReportItems = report.Export(*this)->GetSubitems();
495 }
496 
497 
498 // TITLE_ENDS_WITH_SEQUENCE
499 
/// Return true when @a ch is one of the uppercase nucleotide letters
/// A, T, G or C. Lowercase letters and every other character return false.
static bool IsATGC(char ch)
{
    return (ch == 'A' || ch == 'T' || ch == 'G' || ch == 'C');
}


/// Report whether @a title ends with an unbroken run of at least
/// @a min_seq_len uppercase nucleotide letters (see IsATGC).
///
/// @param title        Text to inspect; only its tail is examined.
/// @param min_seq_len  Minimum run length that counts as "sequence".
///                     The default, 19, was just copied from C-toolkit.
/// @return true if the trailing run of A/T/G/C is at least @a min_seq_len
///         characters long.
static bool EndsWithSequence(const std::string& title, std::size_t min_seq_len = 19)
{
    std::size_t count = 0;
    // Walk backwards from the end; stop at the first non-nucleotide character
    // or as soon as enough characters have been seen.
    for (auto it = title.rbegin(); it != title.rend() && count < min_seq_len; ++it) {
        if (!IsATGC(*it)) {
            break;
        }
        ++count;
    }
    return count >= min_seq_len;
}
525 
526 
527 DISCREPANCY_CASE(TITLE_ENDS_WITH_SEQUENCE, DESC, eDisc | eSubmitter | eSmart | eBig, "Sequence characters at end of defline")
528 {
529  for (auto& desc : context.GetSeqdesc()) {
530  if (desc.IsTitle() && EndsWithSequence(desc.GetTitle())) {
531  m_Objs["[n] defline[s] appear[S] to end with sequence characters"].Add(*context.SeqdescObjRef(desc));
532  }
533  }
534 }
535 
536 
537 // FEATURE_MOLTYPE_MISMATCH
538 
539 DISCREPANCY_CASE(FEATURE_MOLTYPE_MISMATCH, SEQUENCE, eOncaller, "Sequences with rRNA or misc_RNA features should be genomic DNA")
540 {
541  bool is_dna = false;
542  bool is_genomic = false;
543  const CBioseq& bioseq = context.CurrentBioseq();
544  if (bioseq.CanGetInst() && bioseq.GetInst().IsSetMol() && bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna) {
545  is_dna = true;
546  }
547  auto molinfo = context.GetMolinfo();
548  if (molinfo && molinfo->GetMolinfo().IsSetBiomol() && molinfo->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_genomic) {
549  is_genomic = true;
550  }
551  if (!is_genomic || !is_dna) {
552  const CSeq_annot* annot = nullptr;
553  if (bioseq.IsSetAnnot()) {
554  for (auto& annot_it : bioseq.GetAnnot()) {
555  if (annot_it->IsFtable()) {
556  annot = annot_it;
557  break;
558  }
559  }
560  }
561  if (annot) {
562  for (auto& feat : annot->GetData().GetFtable()) {
563  if (feat->IsSetData()) {
564  CSeqFeatData::ESubtype subtype = feat->GetData().GetSubtype();
565  if (subtype == CSeqFeatData::eSubtype_rRNA || subtype == CSeqFeatData::eSubtype_otherRNA) {
566  m_Objs["[n] sequence[s] [has] rRNA or misc_RNA features but [is] not genomic DNA"].Add(*context.BioseqObjRef(CDiscrepancyContext::eFixSelf));
567  break;
568  }
569  }
570  }
571  }
572  }
573 }
574 
// Autofix for FEATURE_MOLTYPE_MISMATCH: sets the instance molecule of the
// reported bioseq to DNA, finds the bioseq's MolInfo descriptor (creating and
// appending one when none exists), marks the report object fixed and returns
// a report for one bioseq.
// NOTE(review): embedded listing line 598 is missing. The statement that
// actually updates `molinfo` (presumably setting the biomol to genomic, per
// the report text) is not visible. Confirm against the repository.
// NOTE(review): the result of dynamic_cast is dereferenced without a null
// check.
575 DISCREPANCY_AUTOFIX(FEATURE_MOLTYPE_MISMATCH)
576 {
577  const CBioseq* seq = dynamic_cast<const CBioseq*>(context.FindObject(*obj));
578  CBioseq_EditHandle edit_handle = context.GetBioseqHandle(*seq);
579  edit_handle.SetInst_Mol(CSeq_inst::eMol_dna);
580  CSeq_descr& descrs = edit_handle.SetDescr();
581  CMolInfo* molinfo = nullptr;
582  if (descrs.IsSet()) {
583  for (auto descr : descrs.Set()) {
584  if (descr->IsMolinfo()) {
585  molinfo = &(descr->SetMolinfo());
586  break;
587  }
588  }
589  }
// No MolInfo descriptor yet: create one and append it to the descriptor set.
590  if (molinfo == nullptr) {
591  CRef<CSeqdesc> new_descr(new CSeqdesc);
592  molinfo = &(new_descr->SetMolinfo());
593  descrs.Set().push_back(new_descr);
594  }
// At this point molinfo was assigned from the address of a reference in one
// of the branches above.
595  if (molinfo == nullptr) {
596  return CRef<CAutofixReport>();
597  }
599  obj->SetFixed();
600  return CRef<CAutofixReport>(new CAutofixReport("FEATURE_MOLTYPE_MISMATCH: Moltype was set to genomic for [n] bioseq[s]", 1));
601 }
602 
603 
604 // INCONSISTENT_DBLINK
605 
// Node names used by the INCONSISTENT_DBLINK test. kMissingDBLink is a report
// label. kDBLinkObjectList and kDBLinkCollect name bookkeeping nodes that the
// summary step erases before reporting. kDBLinkFieldCountTop is only erased
// in the code visible in this listing.
606 const string kMissingDBLink = "[n] Bioseq [is] missing DBLink object";
607 const string kDBLinkObjectList = "DBLink Objects";
608 const string kDBLinkFieldCountTop = "DBLink Fields";
609 const string kDBLinkCollect = "DBLink Collection";
610 
611 string GetFieldValueAsString(const CUser_field& field)
612 {
613  string value;
614 
615  if (field.GetData().IsStr()) {
616  value = field.GetData().GetStr();
617  } else if (field.GetData().IsStrs()) {
618  for (const string& s : field.GetData().GetStrs()) {
619  if (!NStr::IsBlank(value)) {
620  value += "; ";
621  }
622  value += s;
623  }
624  }
625  return value;
626 }
627 
628 
// Names of the bookkeeping sub-nodes that AddUserObjectFieldItems keeps in
// its `previously_seen` node. These are const objects, like the other label
// constants in this file, rather than references bound to temporaries.
const std::string kPreviouslySeenFields = "Previously Seen Fields";
const std::string kPreviouslySeenFieldsThis = "Previously Seen Fields This";
const std::string kPreviouslySeenObjects = "Previously Seen Objects";
632 
// Parameter list and body of the helper that the call sites in this file
// invoke as AddUserObjectFieldItems. It records in `collector`, per field
// name, which user-object descriptors have the field (and with what value)
// and which are missing it, using `previously_seen` as bookkeeping between
// calls.
//   desc            user-object descriptor to record, or null to record
//                   `rep_seq` as missing every previously seen field
//   rep_seq         report object used for the "missing" entries when desc
//                   is null
//   collector       node that receives the per-field entries
//   previously_seen bookkeeping node (kPreviouslySeenFields,
//                   kPreviouslySeenFieldsThis, kPreviouslySeenObjects)
//   object_name     name used in labels and as the bookkeeping key
//   field_prefix    text put in front of each field name in `collector`
// NOTE(review): the line holding the return type and function name (embedded
// line 633) is missing from this listing.
634 (//CConstRef<CSeqdesc> desc,
635  const CSeqdesc* desc,
636  //CConstRef<CBioseq> seq,
637  //const CSeqSummary* info,
638  CReportObj& rep_seq,
639  CReportNode& collector,
640  CReportNode& previously_seen,
641  CDiscrepancyContext& context,
642  const string& object_name,
643  const string& field_prefix = kEmptyStr)
644 {
645  if (!desc) {
646  // add missing for all previously seen fields
647  for (auto& obj : previously_seen[kPreviouslySeenFields].GetMap()) {
648  for (auto& z : obj.second->GetMap()) {
649  collector[field_prefix + z.first][" [n] " + object_name + "[s] [is] missing field " + field_prefix + z.first]
650  //.Add(*context.NewBioseqObj(seq, info, eKeepRef), false);
651  //.Add(*context.NewBioseqObj(seq, info));
652  .Add(rep_seq);
653  }
654  }
655  return;
656  }
657 
// True when an object with this name was recorded by an earlier call.
658  bool already_seen = previously_seen[kPreviouslySeenObjects].Exist(object_name);
659  for (auto& f : desc->GetUser().GetData()) {
660  if (f->IsSetLabel() && f->GetLabel().IsStr() && f->IsSetData()) {
661  string field_name = field_prefix + f->GetLabel().GetStr();
662  // add missing field to all previous objects that do not have this field
663  if (already_seen && !collector.Exist(field_name)) {
664  for (auto& ro : previously_seen[kPreviouslySeenObjects][object_name].GetObjects()) {
665  string missing_label = "[n] " + object_name + "[s] [is] missing field " + field_name;
666  //CRef<CDiscrepancyObject> seq_disc_obj(dynamic_cast<CDiscrepancyObject*>(ro->GetNCPointer()));
667  //collector[field_name][missing_label].Add(*seq_disc_obj, false);
668  //collector[field_name][missing_label].Add(*seq_disc_obj);
669  collector[field_name][missing_label].Add(*ro);
670  }
671  }
// Record this descriptor's value for the field, and remember the field both
// for this object (kPreviouslySeenFieldsThis) and for objects with this name.
672  collector[field_name]["[n] " + object_name + "[s] [has] field " + field_name + " value '" + GetFieldValueAsString(*f) + "'"].Add(*context.SeqdescObjRef(*desc), false);
673  previously_seen[kPreviouslySeenFieldsThis][f->GetLabel().GetStr()].Add(*context.SeqdescObjRef(*desc), false);
674  previously_seen[kPreviouslySeenFields][object_name][f->GetLabel().GetStr()].Add(*context.SeqdescObjRef(*desc), false);
675  }
676  }
677 
678  // add missing for all previously seen fields not on this object
679  for (auto& z : previously_seen[kPreviouslySeenFields][object_name].GetMap()) {
680  if (!previously_seen[kPreviouslySeenFieldsThis].Exist(z.first)) {
681  collector[field_prefix + z.first][" [n] " + object_name + "[s] [is] missing field " + field_prefix + z.first].Add(*context.SeqdescObjRef(*desc));
682  }
683  }
684 
685  // maintain object list for missing fields
686  //CRef<CDiscrepancyObject> this_disc_obj(context.NewSeqdescObj(d, context.GetCurrentBioseqLabel(), eKeepRef));
687  CRef<CDiscrepancyObject> this_disc_obj(context.SeqdescObjRef(*desc));
688  //previously_seen[kPreviouslySeenObjects][object_name].Add(*this_disc_obj, false);
689  previously_seen[kPreviouslySeenObjects][object_name].Add(*this_disc_obj);
// The per-object field list is reset so the next call starts empty.
690  previously_seen[kPreviouslySeenFieldsThis].clear();
691 }
692 
693 
// INCONSISTENT_DBLINK: for nucleotide bioseqs, passes matching user-object
// descriptors to AddUserObjectFieldItems. When none matched, the bioseq is
// reported under kMissingDBLink and recorded as missing every previously
// seen DBLink field.
// NOTE(review): embedded listing line 703 is missing. It presumably holds
// the test on `user` that selects DBLink objects and opens the brace closed
// at line 706. Confirm against the repository.
694 DISCREPANCY_CASE(INCONSISTENT_DBLINK, SEQUENCE, eDisc | eSubmitter | eSmart | eBig, "Inconsistent DBLink fields")
695 {
696  const CBioseq& bioseq = context.CurrentBioseq();
697  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
698  bool found = false;
699  auto rep_seq = context.BioseqObjRef();
700  for (auto& desc : context.GetAllSeqdesc()) {
701  if (desc.IsUser()) {
702  const CUser_object& user = desc.GetUser();
704  found = true;
705  AddUserObjectFieldItems(&desc, *rep_seq, m_Objs[kDBLinkCollect], m_Objs[kDBLinkObjectList], context, "DBLink object");
706  }
707  }
708  }
709  if (!found) {
710  m_Objs[kMissingDBLink].Add(*rep_seq);
711  AddUserObjectFieldItems(nullptr, *rep_seq, m_Objs[kDBLinkCollect], m_Objs[kDBLinkObjectList], context, "DBLink object");
712  }
713  }
714 }
715 
716 
717 void AnalyzeField(CReportNode& node, bool& all_present, bool& all_same)
718 {
719  all_present = true;
720  all_same = true;
721  size_t num_values = 0;
722  string value;
723  bool first = true;
724  for (auto& s : node.GetMap()) {
725  if (NStr::Find(s.first, " missing field ") != NPOS) {
726  all_present = false;
727  } else {
728  SIZE_TYPE pos = NStr::Find(s.first, " value '");
729  if (pos != NPOS) {
730  if (first) {
731  value = s.first.substr(pos);
732  num_values++;
733  first = false;
734  } else if (!NStr::Equal(s.first.substr(pos), value)) {
735  num_values++;
736  }
737  }
738  }
739  if (num_values > 1) {
740  all_same = false;
741  if (!all_present) {
742  // have all the info we need
743  break;
744  }
745  }
746  }
747 }
748 
749 
750 void AnalyzeFieldReport(CReportNode& node, bool& all_present, bool& all_same)
751 {
752  all_present = true;
753  all_same = true;
754  for (auto& s : node.GetMap()) {
755  bool this_present = true;
756  bool this_same = true;
757  AnalyzeField(*s.second, this_present, this_same);
758  all_present &= this_present;
759  all_same &= this_same;
760  if (!all_present && !all_same) {
761  break;
762  }
763  }
764 }
765 
766 
// Build the parenthesised summary suffix used in report headings, e.g.
// "(all present, all same)" or "(some missing, inconsistent)".
std::string GetSummaryLabel(bool all_present, bool all_same)
{
    const char* presence = all_present ? "all present" : "some missing";
    const char* agreement = all_same ? "all same" : "inconsistent";
    return std::string("(") + presence + ", " + agreement + ")";
}
784 
785 
786 void CopyNode(CReportNode& new_home, CReportNode& original)
787 {
788  for (auto& s : original.GetMap()){
789  for (auto q : s.second->GetObjects()) {
790  new_home[s.first].Add(*q);
791  }
792  }
793  for (auto& q : original.GetObjects()) {
794  new_home.Add(*q);
795  }
796 }
797 
798 
// Return the label under which a DBLink field is reported: the five known
// field names get a leading-space prefix, any other name is returned
// unchanged.
// NOTE(review): in this listing every branch returns the same single-space
// prefix. The listing collapses whitespace elsewhere (indentation), so the
// original literals may have held different numbers of spaces. Confirm
// against the repository before simplifying this chain.
799 string AdjustDBLinkFieldName(const string& orig_field_name)
800 {
801  if (NStr::Equal(orig_field_name, "BioSample")) {
802  return " " + orig_field_name;
803  } else if (NStr::Equal(orig_field_name, "ProbeDB")) {
804  return " " + orig_field_name;
805  } else if (NStr::Equal(orig_field_name, "Sequence Read Archive")) {
806  return " " + orig_field_name;
807  } else if (NStr::Equal(orig_field_name, "BioProject")) {
808  return " " + orig_field_name;
809  } else if (NStr::Equal(orig_field_name, "Assembly")) {
810  return " " + orig_field_name;
811  } else {
812  return orig_field_name;
813  }
814 }
815 
816 
// Summary for INCONSISTENT_DBLINK. The steps, in order:
//  1. drop the bookkeeping nodes kDBLinkObjectList and kDBLinkFieldCountTop;
//  2. analyse the collected fields; if every field is present everywhere with
//     a single value, clear m_Objs and report nothing;
//  3. move every remaining top-level node except kDBLinkCollect under a
//     "DBLink Report (...)" heading, with a space in front of its name;
//  4. re-file each collected field under that heading with a label from
//     AdjustDBLinkFieldName plus its own "(present, same)" summary;
//  5. drop kDBLinkCollect and call xSummarize().
817 DISCREPANCY_SUMMARIZE(INCONSISTENT_DBLINK)
818 {
819  m_Objs.GetMap().erase(kDBLinkObjectList);
820  m_Objs.GetMap().erase(kDBLinkFieldCountTop);
821  if (m_Objs.empty()) {
822  return;
823  }
824 
825  // add top-level category, rename field values
826  bool all_present = true;
827  bool all_same = true;
828  AnalyzeFieldReport(m_Objs[kDBLinkCollect], all_present, all_same);
829  if (all_present && all_same) {
830  m_Objs.clear();
831  return;
832  }
833  string top_label = "DBLink Report " + GetSummaryLabel(all_present, all_same);
834 
// Nodes are erased and m_Objs[top_label] is inserted into while this loop
// walks the same map; the top_label node itself is skipped by the test below.
835  CReportNode::TNodeMap::iterator it = m_Objs.GetMap().begin();
836  while (it != m_Objs.GetMap().end()) {
837  if (!NStr::Equal(it->first, top_label)
838  && !NStr::Equal(it->first, kDBLinkCollect)) {
839  CopyNode(m_Objs[top_label][" " + it->first], *it->second);
840  it = m_Objs.GetMap().erase(it);
841  } else {
842  ++it;
843  }
844  }
845 
846  for (auto& it2 : m_Objs[kDBLinkCollect].GetMap()) {
847  bool this_present = true;
848  bool this_same = true;
849  AnalyzeField(*it2.second, this_present, this_same);
850  string new_label = AdjustDBLinkFieldName(it2.first) + " " + GetSummaryLabel(this_present, this_same);
851  for (auto& s : it2.second->GetMap()){
852  for (auto& q : s.second->GetObjects()) {
853  m_Objs[top_label][new_label][s.first].Add(*q);
854  }
855  }
856  }
857  m_Objs.GetMap().erase(kDBLinkCollect);
858 
859  xSummarize();
860 }
861 
862 
863 // INCONSISTENT_STRUCTURED_COMMENTS
// Node names and the field-name prefix used by the
// INCONSISTENT_STRUCTURED_COMMENTS test. All of the named nodes are erased
// from m_Objs by the summary step before the report is produced.
864 const string kStructuredCommentsSeqs = "sequences";
865 const string kStructuredCommentObservedPrefixes = "observed prefixes";
866 const string kStructuredCommentObservedPrefixesThis = "observed prefixes this";
867 const string kStructuredCommentReport = "collection";
868 const string kStructuredCommentPrevious = "previous";
869 const string kStructuredCommentFieldPrefix = "structured comment field ";
870 
// INCONSISTENT_STRUCTURED_COMMENTS: for nucleotide bioseqs, records the
// fields of each matching user-object descriptor by prefix (a blank prefix
// becomes "unnamed") and keeps the set of prefixes seen on earlier bioseqs.
// A bioseq lacking a prefix seen earlier is reported as missing it, and a
// prefix first seen now is reported as missing on all earlier bioseqs.
// NOTE(review): embedded listing lines 879-880 and 906 are missing. The test
// that selects structured comments, the declaration of `prefix`, and a
// statement before line 907 are not visible. Confirm against the repository.
871 DISCREPANCY_CASE(INCONSISTENT_STRUCTURED_COMMENTS, SEQUENCE, eDisc | eSubmitter | eSmart | eBig, "Inconsistent structured comments")
872 {
873  const CBioseq& bioseq = context.CurrentBioseq();
874  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
875  auto rep_seq = context.BioseqObjRef();
876  for (auto& desc : context.GetAllSeqdesc()) {
877  if (desc.IsUser()) {
878  const CUser_object& user = desc.GetUser();
881  if (NStr::IsBlank(prefix)) {
882  prefix = "unnamed";
883  }
884  m_Objs[kStructuredCommentObservedPrefixesThis][prefix].Add(*context.SeqdescObjRef(desc));
885  AddUserObjectFieldItems(&desc, *rep_seq, m_Objs[kStructuredCommentReport], m_Objs[kStructuredCommentPrevious], context, prefix + " structured comment", kStructuredCommentFieldPrefix);
886  }
887  }
888  }
889  //report prefixes seen previously, not found on this sequence
890  for (auto& it : m_Objs[kStructuredCommentObservedPrefixes].GetMap()) {
891  if (!m_Objs[kStructuredCommentObservedPrefixesThis].Exist(it.first)) {
892  m_Objs["[n] Bioseq[s] [is] missing " + it.first + " structured comment"].Add(*rep_seq);
893  AddUserObjectFieldItems(nullptr, *rep_seq, m_Objs[kStructuredCommentReport], m_Objs[kStructuredCommentPrevious], context, it.first + " structured comment", kStructuredCommentFieldPrefix);
894  }
895  }
896  // report prefixes found on this sequence but not on previous sequences
897  for (auto& it : m_Objs[kStructuredCommentObservedPrefixesThis].GetMap()) {
898  if (!m_Objs[kStructuredCommentObservedPrefixes].Exist(it.first)) {
899  for (auto ro : m_Objs[kStructuredCommentsSeqs].GetObjects()) {
900  m_Objs["[n] Bioseq[s] [is] missing " + it.first + " structured comment"].Add(*ro);
901  AddUserObjectFieldItems(nullptr, *ro, m_Objs[kStructuredCommentReport], m_Objs[kStructuredCommentPrevious], context, it.first + " structured comment", kStructuredCommentFieldPrefix);
902  }
903  }
// Remember that this prefix has now been observed.
904  m_Objs[kStructuredCommentObservedPrefixes][it.first].Add(*context.BioseqObjRef());
905  }
// Remember this bioseq so later bioseqs can report it as missing a new prefix.
907  m_Objs[kStructuredCommentsSeqs].Add(*context.BioseqObjRef());
908  }
909 }
910 
911 
// Summary for INCONSISTENT_STRUCTURED_COMMENTS. The steps, in order:
//  1. drop the bookkeeping nodes and the collected StructuredCommentPrefix /
//     StructuredCommentSuffix fields;
//  2. analyse the collected fields and return if every field is present
//     everywhere with a single value;
//  3. move every remaining top-level node except the collection node under a
//     "Structured Comment Report (...)" heading, with a space in front of
//     its name;
//  4. re-file each collected field under that heading with its own summary
//     suffix, replacing "[n]" by "All" in sub-labels of fields that are
//     present and identical everywhere;
//  5. drop the collection node and call xSummarize().
// NOTE(review): unlike the INCONSISTENT_DBLINK summary, the early return in
// step 2 does not clear m_Objs. Confirm that this is intended.
912 DISCREPANCY_SUMMARIZE(INCONSISTENT_STRUCTURED_COMMENTS)
913 {
914  m_Objs.GetMap().erase(kStructuredCommentObservedPrefixesThis);
915  m_Objs.GetMap().erase(kStructuredCommentsSeqs);
916  m_Objs.GetMap().erase(kStructuredCommentObservedPrefixes);
917  m_Objs.GetMap().erase(kStructuredCommentPrevious);
918 
919  m_Objs[kStructuredCommentReport].GetMap().erase(kStructuredCommentFieldPrefix + "StructuredCommentPrefix");
920  m_Objs[kStructuredCommentReport].GetMap().erase(kStructuredCommentFieldPrefix + "StructuredCommentSuffix");
921 
922  if (m_Objs.empty()) {
923  return;
924  }
925 
926  // add top-level category, rename field values
927  bool all_present = true;
928  bool all_same = true;
929  AnalyzeFieldReport(m_Objs[kStructuredCommentReport], all_present, all_same);
930  if (all_present && all_same) {
931  return;
932  }
933 
934  string top_label = "Structured Comment Report " + GetSummaryLabel(all_present, all_same);
935 
// Nodes are erased and m_Objs[top_label] is inserted into while this loop
// walks the same map; the top_label node itself is skipped by the test below.
936  CReportNode::TNodeMap::iterator it = m_Objs.GetMap().begin();
937  while (it != m_Objs.GetMap().end()) {
938  if (!NStr::Equal(it->first, top_label)
939  && !NStr::Equal(it->first, kStructuredCommentReport)) {
940  CopyNode(m_Objs[top_label][" " + it->first], *it->second);
941  it = m_Objs.GetMap().erase(it);
942  } else {
943  ++it;
944  }
945  }
946 
947  for (auto& it2 : m_Objs[kStructuredCommentReport].GetMap()) {
948  bool this_present = true;
949  bool this_same = true;
950  AnalyzeField(*it2.second, this_present, this_same);
951  string new_label = it2.first + " " + GetSummaryLabel(this_present, this_same);
952  for (auto& s : it2.second->GetMap()) {
953  string sub_label = s.first;
954  if (this_present && this_same) {
955  NStr::ReplaceInPlace(sub_label, "[n]", "All");
956  }
957  for (auto& q : s.second->GetObjects()) {
958  m_Objs[top_label][new_label][sub_label].Add(*q);
959  }
960  }
961  }
962  m_Objs.GetMap().erase(kStructuredCommentReport);
963 
964  xSummarize();
965 }
966 
967 
968 // MISSING_STRUCTURED_COMMENT
969 
// MISSING_STRUCTURED_COMMENT: a nucleotide bioseq is reported unless one of
// its user-object descriptors satisfies a test that makes the function
// return early.
// NOTE(review): embedded listing line 977, which holds that test on `user`
// and opens the brace closed at line 979, is missing. Confirm against the
// repository.
970 DISCREPANCY_CASE(MISSING_STRUCTURED_COMMENT, SEQUENCE, eDisc | eTSA, "Structured comment not included")
971 {
972  const CBioseq& bioseq = context.CurrentBioseq();
973  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
974  for (auto& desc : context.GetAllSeqdesc()) {
975  if (desc.IsUser()) {
976  const CUser_object& user = desc.GetUser();
978  return;
979  }
980  }
981  }
982  m_Objs["[n] sequence[s] [does] not include structured comments."].Add(*context.BioseqObjRef());
983  }
984 }
985 
986 
987 // MISSING_PROJECT
988 
// MISSING_PROJECT test body.
// Scans the user-object descriptors of the current Bioseq. It returns without
// reporting when a user object has a field labelled "BioProject" (inside the
// branch opened on the missing line), or when a user object's type string is
// "GenomeProjectsDB". Otherwise the Bioseq is added under the
// "does not include project" label.
// NOTE(review): listing line 996 (between 995 and 997, presumably the `if`
// paired with the `else if` at 1005) is absent from this extract -- confirm
// against the repository copy.
989 DISCREPANCY_CASE(MISSING_PROJECT, SEQUENCE, eDisc | eTSA, "Project not included")
990 {
991  const CBioseq& bioseq = context.CurrentBioseq();
992  if (bioseq.CanGetInst()) {
993  for (auto& desc : context.GetAllSeqdesc()) {
994  if (desc.IsUser()) {
995  const CUser_object& user = desc.GetUser();
997  if (user.IsSetData()) {
998  for (auto& it : user.GetData()) {
999  if (it->IsSetLabel() && it->GetLabel().IsStr() && NStr::Equal(it->GetLabel().GetStr(), "BioProject")) {
1000  return;
1001  }
1002  }
1003  }
1004  }
1005  else if (user.IsSetType() && user.GetType().IsStr() && NStr::Equal(user.GetType().GetStr(), "GenomeProjectsDB")) {
1006  return;
1007  }
1008  }
1009  }
1010  m_Objs["[n] sequence[s] [does] not include project."].Add(*context.BioseqObjRef());
1011  }
1012 }
1013 
1014 
1015 // COUNT_UNVERIFIED
1016 
// COUNT_UNVERIFIED test body.
// Scans the user-object descriptors of the current Bioseq; on the first one
// that satisfies the (missing) condition, the Bioseq is added under the
// "unverified" label and the scan stops.
// The second argument to Add() is `false` -- presumably the `unique` flag of
// the report-node Add(); verify against CReportNode.
// NOTE(review): listing line 1024 (between 1023 and 1025, presumably the test
// that identifies an "unverified" user object) is absent from this extract.
1017 DISCREPANCY_CASE(COUNT_UNVERIFIED, SEQUENCE, eOncaller, "Count number of unverified sequences")
1018 {
1019  const CBioseq& bioseq = context.CurrentBioseq();
1020  if (bioseq.CanGetInst()) {
1021  for (auto& desc : context.GetAllSeqdesc()) {
1022  if (desc.IsUser()) {
1023  const CUser_object& user = desc.GetUser();
1025  m_Objs["[n] sequence[s] [is] unverified"].Add(*context.BioseqObjRef(), false);
1026  return;
1027  }
1028  }
1029  }
1030  }
1031 }
1032 
1033 
1034 // DEFLINE_PRESENT
1035 
1036 const string kDeflineExists = "[n] Bioseq[s] [has] definition line";
1037 
1038 DISCREPANCY_CASE(DEFLINE_PRESENT, SEQUENCE, eDisc, "Test defline existence")
1039 {
1040  const CBioseq& bioseq = context.CurrentBioseq();
1041  if (bioseq.CanGetInst() && !bioseq.GetInst().IsAa() && context.GetTitle()) {
1042  m_Objs[kDeflineExists].Add(*context.BioseqObjRef());
1043  }
1044 }
1045 
1046 
1047 // UNUSUAL_NT
1048 
1049 DISCREPANCY_CASE(UNUSUAL_NT, SEQUENCE, eDisc | eSubmitter | eSmart, "Sequence contains unusual nucleotides")
1050 {
1051  const CBioseq& bioseq = context.CurrentBioseq();
1052  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
1053  const CSeqSummary& sum = context.GetSeqSummary();
1054  if (sum.Other) {
1055  m_Objs["[n] sequence[s] contain[S] nucleotides that are not ATCG or N"].Add(*context.BioseqObjRef());
1056  }
1057  }
1058 }
1059 
1060 
1061 // TAXNAME_NOT_IN_DEFLINE
1062 
1063 const string kNoTaxnameInDefline = "[n] defline[s] [does] not contain the complete taxname";
1064 
1065 DISCREPANCY_CASE(TAXNAME_NOT_IN_DEFLINE, SEQUENCE, eDisc | eOncaller, "Complete taxname should be present in definition line")
1066 {
1067  const CBioseq& bioseq = context.CurrentBioseq();
1068  if (bioseq.CanGetInst() && !bioseq.GetInst().IsAa()) {
1069  const CSeqdesc* source = context.GetBiosource();
1070  const CSeqdesc* title = context.GetTitle();
1071  if (source && source->IsSource() && source->GetSource().IsSetOrg() && source->GetSource().GetOrg().IsSetTaxname() && title) {
1072  string taxname = source->GetSource().GetOrg().GetTaxname();
1073  if (NStr::EqualNocase(taxname, "Human immunodeficiency virus 1")) {
1074  taxname = "HIV-1";
1075  }
1076  else if (NStr::EqualNocase(taxname, "Human immunodeficiency virus 2")) {
1077  taxname = "HIV-2";
1078  }
1079  bool no_taxname_in_defline = false;
1080  SIZE_TYPE taxname_pos = NStr::FindNoCase(title->GetTitle(), taxname);
1081  if (taxname_pos == NPOS) {
1082  no_taxname_in_defline = true;
1083  }
1084  else {
1085  //capitalization must match for all but the first letter
1086  no_taxname_in_defline = NStr::CompareCase(title->GetTitle().c_str() + taxname_pos, 1, taxname.size() - 1, taxname.c_str() + 1) != 0;
1087  if (taxname_pos > 0 && !isspace(title->GetTitle()[taxname_pos - 1]) && !ispunct(title->GetTitle()[taxname_pos - 1])) {
1088  no_taxname_in_defline = true;
1089  }
1090  }
1091  if (no_taxname_in_defline) {
1092  m_Objs[kNoTaxnameInDefline].Add(*context.SeqdescObjRef(*title));
1093  }
1094  }
1095  }
1096 }
1097 
1098 
1099 // HAS_PROJECT_ID
1100 
1101 static string GetProjectID(const CUser_object& user)
1102 {
1103  string res;
1104  if (user.IsSetData()) {
1105  for (auto field: user.GetData()) {
1106  if (field->IsSetData() && field->GetData().IsInt() && field->IsSetLabel() && field->GetLabel().IsStr() && field->GetLabel().GetStr() == "ProjectID") {
1107  return NStr::IntToString(field->GetData().GetInt());
1108  }
1109  }
1110  }
1111  return res;
1112 }
1113 
1114 
1115 DISCREPANCY_CASE(HAS_PROJECT_ID, SEQUENCE, eOncaller | eSmart, "Sequences with project IDs (looks for genome project IDs)")
1116 {
1117  const CBioseq& bioseq = context.CurrentBioseq();
1118  if (bioseq.CanGetInst()) {
1119  for (auto& desc : context.GetAllSeqdesc()) {
1120  if (desc.IsUser()) {
1121  const CUser_object& user = desc.GetUser();
1122  if (user.IsSetType() && user.GetType().IsStr() && user.GetType().GetStr() == "GenomeProjectsDB") {
1123  string proj_id = GetProjectID(user);
1124  if (!proj_id.empty()) {
1125  m_Objs[proj_id][bioseq.IsNa() ? "N" : "A"].Add(*context.BioseqObjRef());
1126  }
1127  }
1128  }
1129  }
1130  }
1131 }
1132 
1133 
1134 DISCREPANCY_SUMMARIZE(HAS_PROJECT_ID)
1135 {
1136  if (m_Objs.empty()) {
1137  return;
1138  }
1139  CReportNode res;
1140  string all = "[n] sequence[s] [has] project IDs ";
1141  string prots = "[n] protein sequence[s] [has] project IDs ";
1142  string nucs = "[n] nucleotide sequence[s] [has] project IDs ";
1143  auto& projects = m_Objs.GetMap();
1144  all += projects.size() > 1 ? "(some different)" : "(all same)";
1145  size_t count_prots = 0;
1146  size_t count_nucs = 0;
1147  for (auto it: projects) {
1148  auto& M = it.second->GetMap();
1149  if (M.find("A") != M.end()) {
1150  count_prots++;
1151  }
1152  if (M.find("N") != M.end()) {
1153  count_nucs++;
1154  }
1155  }
1156  prots += count_prots > 1 ? "(some different)" : "(all same)";
1157  nucs += count_nucs > 1 ? "(some different)" : "(all same)";
1158  for (auto it : projects) {
1159  auto& M = it.second->GetMap();
1160  if (M.find("A") != M.end()) {
1161  for (auto obj : M["A"]->GetObjects()) {
1162  res[all][prots].Add(*obj);
1163  }
1164  }
1165  if (M.find("N") != M.end()) {
1166  for (auto obj : M["N"]->GetObjects()) {
1167  res[all][nucs].Add(*obj);
1168  }
1169  }
1170  }
1171 
1172  m_ReportItems = res.Export(*this)->GetSubitems();
1173 }
1174 
1175 
1176 // MULTIPLE_CDS_ON_MRNA
1177 
1178 DISCREPANCY_CASE(MULTIPLE_CDS_ON_MRNA, SEQUENCE, eOncaller | eSubmitter | eSmart, "Multiple CDS on mRNA")
1179 {
1180  const CSeqdesc* molinfo = context.GetMolinfo();
1181  if (molinfo && molinfo->GetMolinfo().IsSetBiomol() && molinfo->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_mRNA) {
1182  auto& cds = context.FeatCDS();
1183  if (cds.size() < 2) {
1184  return;
1185  }
1186  size_t count_pseudo = 0;
1187  size_t count_disrupt = 0;
1188  for (auto feat : cds) {
1189  if (feat->IsSetComment() && NStr::Find(feat->GetComment(), "coding region disrupted by sequencing gap") != NPOS) {
1190  count_disrupt++;
1191  }
1192  if (context.IsPseudo(*feat)) {
1193  count_pseudo++;
1194  }
1195  }
1196  if (count_disrupt != cds.size() && count_pseudo != cds.size()) {
1197  m_Objs["[n] mRNA bioseq[s] [has] multiple CDS features"].Add(*context.BioseqObjRef());
1198  }
1199  }
1200 }
1201 
1202 
1203 // MRNA_SEQUENCE_MINUS_STRAND_FEATURES
1204 
1205 static const string kMrnaSequenceMinusStrandFeatures = "[n] mRNA sequences have features on the complement strand.";
1206 
1207 DISCREPANCY_CASE(MRNA_SEQUENCE_MINUS_STRAND_FEATURES, SEQUENCE, eOncaller, "mRNA sequences have CDS/gene on the complement strand")
1208 {
1209  const CSeqdesc* molinfo = context.GetMolinfo();
1210  if (molinfo && molinfo->GetMolinfo().IsSetBiomol() && molinfo->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_mRNA) {
1211  auto& cds = context.FeatCDS();
1212  size_t count_plus = 0;
1213  size_t count_minus = 0;
1214  for (auto& feat : cds) {
1215  if (feat->GetLocation().GetStrand() != eNa_strand_minus || feat->GetData().GetSubtype() == CSeqFeatData::eSubtype_primer_bind) {
1216  count_plus++;
1217  }
1218  else {
1219  count_minus++;
1220  }
1221  }
1222  if (count_minus) {
1223  if (!count_plus) {
1224  m_Objs[kMrnaSequenceMinusStrandFeatures].Add(*context.BioseqObjRef(CDiscrepancyContext::eFixSet));
1225  }
1226  else {
1227  m_Objs[kMrnaSequenceMinusStrandFeatures].Add(*context.BioseqObjRef());
1228  }
1229  }
1230  }
1231 }
1232 
1233 
1234 DISCREPANCY_AUTOFIX(MRNA_SEQUENCE_MINUS_STRAND_FEATURES)
1235 {
1236  unsigned int n = 0;
1237  const CBioseq* seq = dynamic_cast<const CBioseq*>(context.FindObject(*obj));
1238  CBioseq_EditHandle bioseq(context.GetBioseqHandle(*seq));
1239  vector<CSeq_feat*> features;
1240  CFeat_CI feat_ci(bioseq, CSeqFeatData::e_Cdregion);
1241  for (; feat_ci; ++feat_ci) {
1242  features.push_back(const_cast<CSeq_feat*>(&*feat_ci->GetSeq_feat()));
1243  }
1244 
1245  CRef<objects::CSeq_inst> new_inst(new objects::CSeq_inst());
1246  new_inst->Assign(bioseq.GetInst());
1247  ReverseComplement(*new_inst, &context.GetScope());
1248  bioseq.SetInst(*new_inst);
1249 
1250  for (auto& feat : features) {
1251  edit::ReverseComplementFeature(*feat, context.GetScope());
1252  ++n;
1253  }
1254  obj->SetFixed();
1255  return CRef<CAutofixReport>(n ? new CAutofixReport("MRNA_SEQUENCE_MINUS_STRAND_FEATURES: [n] sequence[s] [is] converted to reverse complement[s]", n) : nullptr);
1256 }
1257 
1258 
1259 // LOW_QUALITY_REGION
1260 
1261 DISCREPANCY_CASE(LOW_QUALITY_REGION, SEQUENCE, eDisc | eSubmitter | eSmart, "Sequence contains regions of low quality")
1262 {
1263  const size_t MAX_N_IN_SEQ = 7; // 25% of the sequence
1264  const CBioseq& bioseq = context.CurrentBioseq();
1265  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
1266  const CSeqSummary& sum = context.GetSeqSummary();
1267  if (sum.MinQ > MAX_N_IN_SEQ) {
1268  m_Objs["[n] sequence[s] contain[S] low quality region"].Add(*context.BioseqObjRef());
1269  }
1270  }
1271 }
1272 
1273 
1274 // DEFLINE_ON_SET
1275 
1276 DISCREPANCY_CASE(DEFLINE_ON_SET, SEQ_SET, eOncaller, "Titles on sets")
1277 {
1278  const CBioseq_set& set = context.CurrentBioseq_set();
1279  if (set.IsSetDescr()) {
1280  for (const auto& descr : set.GetDescr().Get()) {
1281  if (descr->IsTitle()) {
1282  m_Objs["[n] title[s] on sets were found"].Add(*context.SeqdescObjRef(*descr));
1283  }
1284  }
1285  }
1286 }
1287 
1288 
1289 DISCREPANCY_SUMMARIZE(DEFLINE_ON_SET)
1290 {
1291  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1292 }
1293 
1294 
1295 // MITOCHONDRION_REQUIRED
1296 
1297 DISCREPANCY_CASE(MITOCHONDRION_REQUIRED, SEQUENCE, eDisc | eOncaller, "If D-loop or control region misc_feat is present, source must be mitochondrial")
1298 {
1299  const CSeqdesc* biosrc = context.GetBiosource();
1300  if (!biosrc || biosrc->GetSource().GetGenome() != CBioSource::eGenome_mitochondrion) {
1301  auto& all = context.FeatAll();
1302  bool has_D_loop = false;
1303  bool has_misc_feat_with_control_region = false;
1304  for (auto& feat : all) {
1305  if (feat->IsSetData()) {
1306  if (feat->GetData().GetSubtype() == CSeqFeatData::eSubtype_D_loop) {
1307  has_D_loop = true;
1308  break;
1309  }
1310  else if (feat->GetData().GetSubtype() == CSeqFeatData::eSubtype_misc_feature) {
1311  if (feat->IsSetComment() && NStr::FindNoCase(feat->GetComment(), "control region") != NPOS) {
1312  has_misc_feat_with_control_region = true;
1313  break;
1314  }
1315  }
1316  }
1317  }
1318  if (has_D_loop || has_misc_feat_with_control_region) {
1319  m_Objs["[n] bioseq[s] [has] D-loop or control region misc_feature, but [is] do not have mitochondrial source"].Add(*context.BioseqObjRef(CDiscrepancyContext::eFixSet));
1320  }
1321  }
1322 }
1323 
1324 
1325 DISCREPANCY_SUMMARIZE(MITOCHONDRION_REQUIRED)
1326 {
1327  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1328 }
1329 
1330 
// Disabled (#if 0) helper. As visible here it looks up the first Source
// descriptor on the Bioseq, obtains a mutable reference to it, and returns
// true; it returns false when no Source descriptor is found.
// NOTE(review): listing line 1338 (between 1337 and 1339, presumably the
// statement that uses `edit_biosrc`) is absent from this extract.
1331 #if 0
1332 static bool FixGenome(const CBioseq& bioseq, CScope& scope)
1333 {
1334  CBioseq_Handle seq_h = scope.GetBioseqHandle(bioseq);
1335  CSeqdesc_CI biosrc(seq_h, CSeqdesc::e_Source);
1336  if (biosrc) {
1337  CSeqdesc& edit_biosrc = const_cast<CSeqdesc&>(*biosrc);
1339  return true;
1340  }
1341 
1342  return false;
1343 }
1344 #endif
1345 
1346 
// Autofix for MITOCHONDRION_REQUIRED.
// Resolves the report object to a Bioseq and looks up its first Source
// descriptor. When one exists the object is marked fixed and a report with
// count 1 is returned; otherwise an empty CRef is returned.
// NOTE(review): listing line 1354 (between 1353 and 1355, presumably the
// statement that modifies `edit_biosrc`) is absent from this extract.
// NOTE(review): `seq` is dereferenced without checking the dynamic_cast
// result -- confirm FindObject always yields a CBioseq here.
1347 DISCREPANCY_AUTOFIX(MITOCHONDRION_REQUIRED)
1348 {
1349  const CBioseq* seq = dynamic_cast<const CBioseq*>(context.FindObject(*obj));
1350  CBioseq_EditHandle seq_h = context.GetBioseqHandle(*seq);
1351  CSeqdesc_CI biosrc(seq_h, CSeqdesc::e_Source);
1352  if (biosrc) {
1353  CSeqdesc& edit_biosrc = const_cast<CSeqdesc&>(*biosrc);
1355  obj->SetFixed();
1356  return CRef<CAutofixReport>(new CAutofixReport("MITOCHONDRION_REQUIRED: Genome was set to mitochondrion for [n] bioseq[s]", 1));
1357  }
1358  return CRef<CAutofixReport>();
1359 }
1360 
1361 
1362 
1363 // SEQ_SHORTER_THAN_50bp
1364 
1365 static bool IsMolProd(int biomol) { return biomol == CMolInfo::eBiomol_mRNA || biomol == CMolInfo::eBiomol_ncRNA || biomol == CMolInfo::eBiomol_rRNA || biomol == CMolInfo::eBiomol_pre_RNA || biomol == CMolInfo::eBiomol_tRNA; }
1366 
1367 DISCREPANCY_CASE(SEQ_SHORTER_THAN_50bp, SEQUENCE, eDisc | eSubmitter | eSmart | eBig, "Find Short Sequences")
1368 {
1369  const CBioseq& bioseq = context.CurrentBioseq();
1370  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa() && bioseq.IsSetLength() && bioseq.GetLength() < 50) {
1371  if (context.InGenProdSet() && bioseq.IsSetInst() && bioseq.GetInst().IsSetMol() && bioseq.GetInst().GetMol() == objects::CSeq_inst::eMol_rna) {
1372  const CSeqdesc* molinfo = context.GetMolinfo();
1373  if (molinfo && molinfo->GetMolinfo().IsSetBiomol() && IsMolProd(molinfo->GetMolinfo().GetBiomol())) {
1374  return;
1375  }
1376  }
1377  m_Objs["[n] sequence[s] [is] shorter than 50 nt"].Add(*context.BioseqObjRef());
1378  }
1379 }
1380 
1381 
1382 // SEQ_SHORTER_THAN_200bp
1383 
// SEQ_SHORTER_THAN_200bp test body.
// Reports nucleotide Bioseqs shorter than 200. RNA Bioseqs inside a gen-prod
// set are skipped when their MolInfo biomol is accepted by IsMolProd().
// The Bioseq is added with a fix mode held in `fix`.
// NOTE(review): listing lines 1394 (presumably the declaration of `fix`) and
// 1398 (the body of the IsFtable() branch) are absent from this extract, so
// how annotations influence `fix` cannot be read from here.
1384 DISCREPANCY_CASE(SEQ_SHORTER_THAN_200bp, SEQUENCE, eDisc | eSubmitter | eSmart | eBig | eTSA, "Short Contig")
1385 {
1386  const CBioseq& bioseq = context.CurrentBioseq();
1387  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa() && bioseq.IsSetLength() && bioseq.GetLength() < 200) {
1388  if (context.InGenProdSet() && bioseq.IsSetInst() && bioseq.GetInst().IsSetMol() && bioseq.GetInst().GetMol() == objects::CSeq_inst::eMol_rna) {
1389  const CSeqdesc* molinfo = context.GetMolinfo();
1390  if (molinfo && molinfo->GetMolinfo().IsSetBiomol() && IsMolProd(molinfo->GetMolinfo().GetBiomol())) {
1391  return;
1392  }
1393  }
1395  if (bioseq.IsSetAnnot()) {
1396  for (auto& annot_it : bioseq.GetAnnot()) {
1397  if (annot_it->IsFtable()) {
1399  }
1400  }
1401  }
1402  m_Objs["[n] contig[s] [is] shorter than 200 nt"].Add(*context.BioseqObjRef(fix));
1403  }
1404 }
1405 
1406 
1407 DISCREPANCY_SUMMARIZE(SEQ_SHORTER_THAN_200bp)
1408 {
1409  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1410 }
1411 
1412 
1413 DISCREPANCY_AUTOFIX(SEQ_SHORTER_THAN_200bp)
1414 {
1415  const CBioseq* seq = dynamic_cast<const CBioseq*>(context.FindObject(*obj));
1416  CBioseq_EditHandle bioseq_edit(context.GetBioseqHandle(*seq));
1417  bioseq_edit.Remove();
1418  obj->SetFixed();
1419  return CRef<CAutofixReport>(new CAutofixReport("SEQ_SHORTER_THAN_200bp: [n] short bioseq[s] [is] removed", 1));
1420 }
1421 
1422 
1423 // RNA_PROVIRAL
1424 
1425 DISCREPANCY_CASE(RNA_PROVIRAL, SEQUENCE, eOncaller, "RNA bioseqs are proviral")
1426 {
1427  const CBioseq& bioseq = context.CurrentBioseq();
1428  if (bioseq.CanGetInst() && bioseq.GetInst().IsSetMol() && bioseq.GetInst().GetMol() == CSeq_inst::eMol_rna) {
1429  const CSeqdesc* biosrc = context.GetBiosource();
1430  if (biosrc && biosrc->GetSource().IsSetOrg() && biosrc->GetSource().IsSetGenome() && biosrc->GetSource().GetGenome() == CBioSource::eGenome_proviral) {
1431  m_Objs["[n] RNA bioseq[s] [is] proviral"].Add(*context.BioseqObjRef());
1432  }
1433  }
1434 }
1435 
1436 
1438 {
1439  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1440 }
1441 
1442 
1443 // SMALL_GENOME_SET_PROBLEM
1444 
1445 typedef bool (CBioSource::*FnIsSet)() const;
1446 typedef const string& (CBioSource::*FnGet)() const;
1447 
1448 static bool CompareOrGetString(const CBioSource& bio_src, FnIsSet is_set_fn, FnGet get_fn, string& val)
1449 {
1450  bool ret = true;
1451  if ((bio_src.*is_set_fn)()) {
1452  if (val.empty()) {
1453  val = (bio_src.*get_fn)();
1454  }
1455  else if (val != (bio_src.*get_fn)()) {
1456  ret = false;
1457  }
1458  }
1459  return ret;
1460 }
1461 
1462 
1463 static bool CompareOrgModValue(const CBioSource& bio_src, COrgMod::TSubtype subtype, string& val)
1464 {
1465  bool ret = true;
1466  if (bio_src.IsSetOrgMod()) {
1467  for (auto& mod : bio_src.GetOrgname().GetMod()) {
1468  if (mod->IsSetSubtype() && mod->GetSubtype() == subtype && mod->IsSetSubname()) {
1469  if (val.empty()) {
1470  val = mod->GetSubname();
1471  }
1472  else {
1473  if (mod->GetSubname() != val) {
1474  ret = false;
1475  }
1476  }
1477  break;
1478  }
1479  }
1480  }
1481  return ret;
1482 }
1483 
1484 
1485 static bool IsSegmentSubtype(const CBioSource& bio_src)
1486 {
1487  bool ret = false;
1488  if (bio_src.IsSetSubtype()) {
1489  for (const auto& subtype : bio_src.GetSubtype()) {
1490  if (subtype->IsSetSubtype() && subtype->GetSubtype() == CSubSource::eSubtype_segment) {
1491  ret = true;
1492  break;
1493  }
1494  }
1495  }
1496  return ret;
1497 }
1498 
1499 
1500 DISCREPANCY_CASE(SMALL_GENOME_SET_PROBLEM, SEQ_SET, eOncaller, "Problems with small genome sets")
1501 {
1502  const CBioseq_set& set = context.CurrentBioseq_set();
1503  if (set.IsSetClass() && set.GetClass() == CBioseq_set::eClass_small_genome_set) {
1504  string taxname, isolate, strain;
1505  bool all_taxname_same = true, all_isolate_same = true, all_strain_same = true;
1506  for (auto& descr_bio_src : context.GetSetBiosources()) {
1507  const CBioSource& bio_src = descr_bio_src->GetSource();
1508  if (context.HasLineage(bio_src, "", "Viruses")) {
1509  if (!IsSegmentSubtype(bio_src)) {
1510  m_Objs["[n] biosource[s] should have segment qualifier but [does] not"].Add(*context.SeqdescObjRef(*descr_bio_src));
1511  }
1512  }
1513  if (all_taxname_same) {
1514  all_taxname_same = CompareOrGetString(bio_src, &CBioSource::IsSetTaxname, &CBioSource::GetTaxname, taxname);
1515  }
1516  if (all_isolate_same) {
1517  all_isolate_same = CompareOrgModValue(bio_src, COrgMod::eSubtype_isolate, isolate);
1518  }
1519  if (all_strain_same) {
1520  all_strain_same = CompareOrgModValue(bio_src, COrgMod::eSubtype_strain, strain);
1521  }
1522  }
1523  if (!all_taxname_same) {
1524  m_Objs["Not all biosources have same taxname"];
1525  }
1526  if (!all_isolate_same) {
1527  m_Objs["Not all biosources have same isolate"];
1528  }
1529  if (!all_strain_same) {
1530  m_Objs["Not all biosources have same strain"];
1531  }
1532  }
1533 }
1534 
1535 
1536 DISCREPANCY_SUMMARIZE(SMALL_GENOME_SET_PROBLEM)
1537 {
1538  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1539 }
1540 
1541 
1542 // UNWANTED_SET_WRAPPER
1543 
// Returns true when @p feat has a qualifier named "satellite"
// (case-insensitive) whose value starts with "microsatellite"
// (case-insensitive); false otherwise.
// NOTE(review): listing line 1546 (between 1545 and 1547, presumably an outer
// condition on the feature that opens the block closed at 1554) is absent
// from this extract.
1544 static bool IsMicroSatellite(const CSeq_feat& feat)
1545 {
1547  if (feat.IsSetQual()) {
1548  for (auto& qual : feat.GetQual()) {
1549  if (qual->IsSetQual() && qual->IsSetVal() && NStr::EqualNocase("satellite", qual->GetQual()) && NStr::StartsWith(qual->GetVal(), "microsatellite", NStr::eNocase)) {
1550  return true;
1551  }
1552  }
1553  }
1554  }
1555  return false;
1556 }
1557 
1558 
// UNWANTED_SET_WRAPPER test body.
// Step 1: if the BioSource has a `rearranged` SubSource, propagate eHasRearranged.
// Step 2: for each feature from context.GetFeat(), propagate eHasSatFeat when
//         IsMicroSatellite() accepts it, eHasNonSatFeat otherwise.
// Step 3: when the current node is not a Bioseq and the set class is
//         eco/mut/phy/pop, read the accumulated flags and (under the missing
//         condition) report the set as an unwanted wrapper.
// NOTE(review): listing line 1584 (between 1583 and 1585, presumably the test
// on `flags`) is absent from this extract.
1559 DISCREPANCY_CASE(UNWANTED_SET_WRAPPER, FEAT, eOncaller, "Set wrapper on microsatellites or rearranged genes")
1560 {
1561  const CSeqdesc* biosrc = context.GetBiosource();
1562  if (biosrc && biosrc->GetSource().IsSetSubtype()) {
1563  for (auto& subtype : biosrc->GetSource().GetSubtype()) {
1564  if (subtype->IsSetSubtype() && subtype->GetSubtype() == CSubSource::eSubtype_rearranged) {
1565  context.PropagateFlags(CDiscrepancyContext::eHasRearranged);
1566  break;
1567  }
1568  }
1569  }
1570  for (auto& feat : context.GetFeat()) {
1571  if (IsMicroSatellite(feat)) {
1572  context.PropagateFlags(CDiscrepancyContext::eHasSatFeat);
1573  }
1574  else {
1575  context.PropagateFlags(CDiscrepancyContext::eHasNonSatFeat);
1576  }
1577  }
1578  if (!context.IsBioseq()) {
1579  const CBioseq_set& set = context.CurrentBioseq_set();
1580  if (set.IsSetClass()) {
1581  CBioseq_set::EClass bio_set_class = set.GetClass();
1582  if (bio_set_class == CBioseq_set::eClass_eco_set || bio_set_class == CBioseq_set::eClass_mut_set || bio_set_class == CBioseq_set::eClass_phy_set || bio_set_class == CBioseq_set::eClass_pop_set) {
1583  unsigned char flags = context.ReadFlags();
1585  m_Objs["[n] unwanted set wrapper[s]"].Add(*context.BioseqSetObjRef());
1586  }
1587  }
1588  }
1589  }
1590 }
1591 
1592 
1593 // FLATFILE_FIND
1595 {
1596  const char* m_misspell;
1597  const char* m_correct;
1599 };
1600 
1602  { "Agricultutral", "agricultural", false },
1603  { "Bacilllus", "Bacillus", false },
1604  { "Enviromental", "Environmental", false },
1605  { "Insitiute", "institute", false },
1606  { "Instutite", "institute", false },
1607  { "Instutute", "Institute", false },
1608  { "P.R.Chian", "P.R. China", false },
1609  { "PRChian", "PR China", false },
1610  { "Scieces", "Sciences", false },
1611  { "agricultral", "agricultural", false },
1612  { "agriculturral", "agricultural", false },
1613  { "biotechnlogy", "biotechnology", false },
1614  { "Biotechnlogy", "Biotechnology", false },
1615  { "biotechnolgy", "biotechnology", false },
1616  { "biotechology", "biotechnology", false },
1617  { "caputre", "capture", true },
1618  { "casette", "cassette", true },
1619  { "catalize", "catalyze", false },
1620  { "charaterization", "characterization", false },
1621  { "clonging", "cloning", false },
1622  { "consevered", "conserved", false },
1623  { "cotaining", "containing", false },
1624  { "cytochome", "cytochrome", true },
1625  { "diveristy", "diversity", true },
1626  { "enivronment", "environment", false },
1627  { "enviroment", "environment", false },
1628  { "genone", "genome", true },
1629  { "homologue", "homolog", true },
1630  { "hypotethical", "hypothetical", false },
1631  { "hypotetical", "hypothetical", false },
1632  { "hypothetcial", "hypothetical", false },
1633  { "hypothteical", "hypothetical", false },
1634  { "indepedent", "independent", false },
1635  { "insititute", "institute", false },
1636  { "insitute", "institute", false },
1637  { "institue", "institute", false },
1638  { "instute", "institute", false },
1639  { "muesum", "museum", true },
1640  { "musuem", "museum", true },
1641  { "nuclear shutting", "nuclear shuttling", true },
1642  { "phylogentic", "phylogenetic", false },
1643  { "protien", "protein", false },
1644  { "puatative", "putative", false },
1645  { "putaitve", "putative", false },
1646  { "putaive", "putative", false },
1647  { "putataive", "putative", false },
1648  { "putatitve", "putative", false },
1649  { "putatuve", "putative", false },
1650  { "putatvie", "putative", false },
1651  { "pylogeny", "phylogeny", false },
1652  { "resaerch", "research", false },
1653  { "reseach", "research", false },
1654  { "reserach", "research", true },
1655  { "reserch", "research", false },
1656  { "ribosoml", "ribosomal", false },
1657  { "ribossomal", "ribosomal", false },
1658  { "scencies", "sciences", false },
1659  { "scinece", "science", false },
1660  { "simmilar", "similar", false },
1661  { "structual", "structural", false },
1662  { "subitilus", "subtilis", false },
1663  { "sulfer", "sulfur", false },
1664  { "technlogy", "technology", false },
1665  { "technolgy", "technology", false },
1666  { "Technlogy", "Technology", false },
1667  { "Veterinry", "Veterinary", false },
1668  { "Argricultural", "Agricultural", false },
1669  { "transcirbed", "transcribed", false },
1670  { "transcirption", "transcription", true },
1671  { "uiniversity", "university", false },
1672  { "uinversity", "university", false },
1673  { "univercity", "university", false },
1674  { "univerisity", "university", false },
1675  { "univeristy", "university", false },
1676  { "univesity", "university", false },
1677  { "unversity", "university", true },
1678  { "uviversity", "university", false },
1679  { "anaemia", nullptr, false },
1680  { "haem", nullptr, false },
1681  { "haemagglutination", nullptr, false },
1682  { "heam", nullptr, false },
1683  { "mithocon", nullptr, false },
1684 };
1685 
// Number of entries in kSpellFixes; used to size the per-pattern hit arrays.
1686 static const size_t kSpellFixesSize = ArraySize(kSpellFixes);
// Top-level report bucket names: hits whose table entry has a correction go
// under kFixable, the others under kNonFixable.
1687 static const string kFixable = "Fixable";
1688 static const string kNonFixable = "Non-fixable";
1689 
1690 
// Runs the multi-pattern search built from FLATFILE_FIND.inc over @p str and
// sets result[n] = true for every index n the search reports.
// Entries of @p result are never cleared here, so callers reset the array
// themselves; the callers in this file pass arrays of kSpellFixesSize elements.
1691 static void FindFlatfileText(const char* str, bool *result)
1692 {
// The generated tables (s_compact, s_hits_init_*, s_states) come from this include.
1693 #include "FLATFILE_FIND.inc"
1694  static constexpr TLocalFSM s_FSM{s_compact, s_hits_init_1, s_hits_init_2, s_states, nullptr};
1695 
1696  CMultipatternSearch::Search(str, s_FSM, [result](size_t n){ result[n] = true; });
1697 }
1698 
1699 
1700 /// Checking that FLATFILE_FIND.inc is in sync with kSpellFixes
1701 /// If the array is changed, need to regenerate FLATFILE_FIND.inc:
1702 /// multipattern.exe -i FLATFILE_FIND.txt > FLATFILE_FIND.inc
1704 {
1705  bool Found[kSpellFixesSize];
1706  string error = "String not found: ";
1707  for (size_t i = 0; i < kSpellFixesSize; i++) {
1708  fill(Found, Found + kSpellFixesSize, 0);
1709  FindFlatfileText(kSpellFixes[i].m_misspell, Found);
1710  if (!Found[i]) {
1713  }
1714  }
1715 }
1716 
1717 
1718 DISCREPANCY_CASE1(FLATFILE_FIND, SEQUENCE, eOncaller, "Flatfile representation of object contains suspect text",
1719  "FLATFILE_FIND_ONCALLER",
1720  "FLATFILE_FIND_ONCALLER_UNFIXABLE",
1721  "FLATFILE_FIND_ONCALLER_FIXABLE"
1722  )
1723 {
1724  bool Found[kSpellFixesSize];
1725  for (auto& desc : context.GetAllSeqdesc()) {
1726  fill(Found, Found + kSpellFixesSize, 0);
1727  for (CStdTypeConstIterator<string> it(desc); it; ++it) {
1728  FindFlatfileText(it->c_str(), Found);
1729  }
1730  for (size_t i = 0; i < kSpellFixesSize; i++) {
1731  if (Found[i]) {
1732  string subitem = string("[n] object[s] contain[S] ") + kSpellFixes[i].m_misspell;
1733  bool autofix = kSpellFixes[i].m_correct != nullptr;
1734  const string& fixable = (autofix ? kFixable : kNonFixable);
1735  m_Objs[fixable][subitem].Add(*context.SeqdescObjRef(desc, &desc));
1736  }
1737  }
1738  }
1739  for (auto& feat: context.FeatAll()) {
1740  fill(Found, Found + kSpellFixesSize, 0);
1741  for (CStdTypeConstIterator<string> it(*feat); it; ++it) {
1742  FindFlatfileText(it->c_str(), Found);
1743  }
1744  for (size_t i = 0; i < kSpellFixesSize; i++) {
1745  if (Found[i]) {
1746  string subitem = string("[n] object[s] contain[S] ") + kSpellFixes[i].m_misspell;
1747  bool autofix = kSpellFixes[i].m_correct != nullptr;
1748  const string& fixable = (autofix ? kFixable : kNonFixable);
1749  m_Objs[fixable][subitem].Add(*context.SeqFeatObjRef(*feat, feat));
1750  }
1751  }
1752  }
1753 }
1754 
1755 
1757 {
1758  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1759 }
1760 
1761 
1762 static bool FixTextInObject(CSerialObject* obj, size_t misspell_idx)
1763 {
1764  bool ret = false;
1765  const SpellFixData& fix_data = kSpellFixes[misspell_idx];
1766  for (CStdTypeIterator<string> it(*obj); it; ++it) {
1767  if (NStr::Find(*it, fix_data.m_misspell) != NPOS) {
1768  NStr::ReplaceInPlace(*it, fix_data.m_misspell, fix_data.m_correct, 0, -1);
1769  ret = true;
1770  }
1771  }
1772  return ret;
1773 }
1774 
1775 
1776 DISCREPANCY_AUTOFIX(FLATFILE_FIND)
1777 {
1778  unsigned int n = 0;
1779  bool Found[kSpellFixesSize];
1780  const CSeq_feat* feat = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
1781  const CSeqdesc* desc = dynamic_cast<const CSeqdesc*>(context.FindObject(*obj));
1782  fill(Found, Found + kSpellFixesSize, 0);
1783  if (feat) {
1784  for (CStdTypeConstIterator<string> it(*feat); it; ++it) {
1785  FindFlatfileText(it->c_str(), Found);
1786  }
1787  for (size_t i = 0; i < kSpellFixesSize; i++) {
1788  if (Found[i]) {
1789  if (FixTextInObject(const_cast<CSeq_feat*>(feat), i)) {
1790  ++n;
1791  }
1792  }
1793  }
1794  }
1795  if (desc) {
1796  for (CStdTypeConstIterator<string> it(*desc); it; ++it) {
1797  FindFlatfileText(it->c_str(), Found);
1798  }
1799  for (size_t i = 0; i < kSpellFixesSize; i++) {
1800  if (Found[i]) {
1801  if (FixTextInObject(const_cast<CSeqdesc*>(desc), i)) {
1802  ++n;
1803  }
1804  }
1805  }
1806  }
1807  obj->SetFixed();
1808  return CRef<CAutofixReport>(new CAutofixReport("FLATFILE_FIND: [n] suspect text[s] [is] fixed", n));
1809 }
1810 
1811 
1812 // ALL_SEQS_SHORTER_THAN_20kb
1813 
1814 static const size_t MIN_SEQUENCE_LEN = 20000;
1815 
1816 
1817 DISCREPANCY_CASE(ALL_SEQS_SHORTER_THAN_20kb, SEQUENCE, eDisc | eSubmitter | eSmart | eBig, "Short sequences test")
1818 {
1819  if (context.CurrentBioseqSummary().Len > MIN_SEQUENCE_LEN) {
1820  m_Objs[kEmptyStr];
1821  }
1822 }
1823 
1824 
1825 DISCREPANCY_SUMMARIZE(ALL_SEQS_SHORTER_THAN_20kb)
1826 {
1827  if (m_Objs.GetMap().find(kEmptyStr) == m_Objs.GetMap().end()) {
1828  // no sequences longer than 20000 nt
1829  m_Objs["No sequences longer than 20,000 nt found"];
1830  }
1831  else {
1832  m_Objs.GetMap().erase(kEmptyStr);
1833  }
1834  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1835 }
1836 
1837 
1838 
// ALL_SEQS_CIRCULAR (collection step). Uses three counter nodes in m_Objs:
//   "N" - a nucleotide Bioseq that is not circular was seen;
//   "C" - circular nucleotide Bioseqs counted;
//   "F" - set when a circular Bioseq meets one of the conditions below
//         (the summary step turns it into error severity).
// Once "N" is non-zero nothing else is recorded. Circular Bioseqs whose
// BioSource genome is plasmid or chromosome are ignored.
// NOTE(review): listing lines 1857-1858 (between 1856 and 1859, presumably a
// further condition on the accession) are absent from this extract.
1839 DISCREPANCY_CASE(ALL_SEQS_CIRCULAR, SEQUENCE, eDisc | eSubmitter | eSmart, "All sequences circular")
1840 {
1841  const CBioseq& bioseq = context.CurrentBioseq();
1842  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
1843  if (m_Objs["N"].GetCount()) {
1844  return;
1845  }
1846  if (bioseq.GetInst().CanGetTopology() && bioseq.GetInst().GetTopology() == CSeq_inst::eTopology_circular) {
1847  const CSeqdesc* biosrc = context.GetBiosource();
1848  if (biosrc && biosrc->GetSource().IsSetGenome() && (biosrc->GetSource().GetGenome() == CBioSource::eGenome_plasmid || biosrc->GetSource().GetGenome() == CBioSource::eGenome_chromosome)) {
1849  return;
1850  }
1851  m_Objs["C"].Incr();
// "F" is only evaluated until it has been set once.
1852  if (!m_Objs["F"].GetCount()) {
1853  if (bioseq.IsSetId()) {
1854  for (auto id : bioseq.GetId()) {
1855  const CTextseq_id* txt = id->GetTextseq_Id();
1856  if (txt && txt->IsSetAccession()) {
1859  m_Objs["F"].Incr();
1860  return;
1861  }
1862  }
1863  }
1864  }
// Second way to set "F": a MolInfo descriptor with tech wgs, tsa or targeted.
1865  if (bioseq.IsSetDescr() && bioseq.GetDescr().IsSet()) {
1866  for (const auto& descr : bioseq.GetDescr().Get()) {
1867  if (descr->IsMolinfo() && descr->GetMolinfo().CanGetTech()) {
1868  if (descr->GetMolinfo().GetTech() == CMolInfo::eTech_wgs || descr->GetMolinfo().GetTech() == CMolInfo::eTech_tsa || descr->GetMolinfo().GetTech() == CMolInfo::eTech_targeted) {
1869  m_Objs["F"].Incr();
1870  return;
1871  }
1872  }
1873  }
1874  }
1875  }
1876  }
1877  else {
1878  m_Objs["N"].Incr();
1879  }
1880  }
1881 }
1882 
1883 
1884 DISCREPANCY_SUMMARIZE(ALL_SEQS_CIRCULAR)
1885 {
1886  CReportNode rep;
1887  if (m_Objs["C"].GetCount() && !m_Objs["N"].GetCount()) {
1888  rep["All ([n]) sequences are circular"].Severity(m_Objs["F"].GetCount() ? CReportItem::eSeverity_error : CReportItem::eSeverity_warning).SetCount(m_Objs["C"].GetCount());
1889  m_ReportItems = rep.Export(*this, false)->GetSubitems();
1890  }
1891 }
1892 
1893 
1894 // SUSPICIOUS_SEQUENCE_ID
1895 
1896 static bool SuspiciousId(const string& s)
1897 {
1898  static CRegexp regexp("chromosome|plasmid|mito|chloroplast|apicoplast|plastid|^chr|^lg|\\bNW_|\\bNZ_|\\bNM_|\\bNC_|\\bAC_|CP\\d\\d\\d\\d\\d\\d", CRegexp::fCompile_ignore_case);
1899  return regexp.IsMatch(s);
1900 }
1901 
1902 DISCREPANCY_CASE(SUSPICIOUS_SEQUENCE_ID, SEQUENCE, eOncaller | eSubmitter | eSmart | eBig, "Suspicious sequence identifiers")
1903 {
1904  const CBioseq& bioseq = context.CurrentBioseq();
1905  if (bioseq.CanGetId()) {
1906  bool report = false;
1907  for (const auto& id : bioseq.GetId()) {
1908  if (id->IsLocal()) {
1909  if (id->GetLocal().IsStr() && SuspiciousId(id->GetLocal().GetStr())) {
1910  report = true;
1911  break;
1912  }
1913  }
1914  else if (id->IsGeneral()) {
1915  if (id->GetGeneral().IsSetDb() && SuspiciousId(id->GetGeneral().GetDb())) {
1916  report = true;
1917  break;
1918  }
1919  if (id->GetGeneral().IsSetTag() && id->GetGeneral().GetTag().IsStr() && SuspiciousId(id->GetGeneral().GetTag().GetStr())) {
1920  report = true;
1921  break;
1922  }
1923  }
1924  }
1925  if (report) {
1926  m_Objs["[n] sequence[s] [has] suspicious identifiers"].Add(*context.BioseqSetObjRef());
1927  }
1928  }
1929 }
1930 
1931 
1932 DISCREPANCY_SUMMARIZE(SUSPICIOUS_SEQUENCE_ID)
1933 {
1934  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1935 }
1936 
1937 
1938 // CHROMOSOME_PRESENT
1939 
1941 
1943 {
1944  if (Qualifier == CSubSource::eSubtype_plasmid_name) {
1945  return true; // always OK by this test; might be handled elsewhere
1946  }
1947 
1948  switch (Location)
1949  {
1951  return false;
1952  case CBioSource::eGenome_unknown: // not present
1954  switch (Qualifier)
1955  {
1958  return false;
1959 // case eSubtype_unknown: // not present
1960  default:
1961  return true;
1962  }
1963  case CBioSource::eGenome_plasmid: // always OK by this test; might be handled elsewhere
1964  default:
1965  return true;
1966  }
1967 
1968 }
1969 
// CHROMOSOME_PRESENT test body.
// For every Source descriptor on the direct members of the current set, takes
// the genome location (skipping plasmid sources) and checks it against each
// SubSource qualifier with s_areCompatible(). With no SubSources it checks
// against eSubtype_unknown instead. Any incompatible pair creates the
// (object-less) "one or more chromosomes are present" node.
// NOTE(review): listing lines 1985 (presumably the declaration of `Location`)
// and 1996 (presumably the declaration of `Qualifier`) are absent from this
// extract, so their initial values cannot be read from here.
1970 DISCREPANCY_CASE(CHROMOSOME_PRESENT, SEQ_SET, eSubmitter | eSmart, "Chromosome present")
1971 {
1972  const CBioseq_set& set = context.CurrentBioseq_set();
1973  if (set.IsSetSeq_set()) {
1974  for (const auto& se : set.GetSeq_set()) {
1975  if (!se->IsSetDescr()) {
1976  continue;
1977  }
1978 
1979  for (const auto& descr : se->GetDescr().Get()) {
1980  if (!descr->IsSource()) {
1981  continue;
1982  }
1983  const CBioSource& bio_src = descr->GetSource();
1984 
1986  if (bio_src.IsSetGenome()) {
1987  Location = static_cast<CBioSource::EGenome>(bio_src.GetGenome());
1988  }
1989  // shortcut
1990  if (Location == CBioSource::eGenome_plasmid) {
1991  continue; // always OK by this test; might be handled elsewhere
1992  }
1993 
1994  if (bio_src.IsSetSubtype()) {
1995  for (const auto& subtype : bio_src.GetSubtype()) {
1997  if (subtype->IsSetSubtype()) {
1998  Qualifier = static_cast<CSubSource::ESubtype>(subtype->GetSubtype());
1999  }
2000  if (!s_areCompatible(Location, Qualifier)) {
2001  m_Objs["one or more chromosomes are present"];
2002  }
2003  }
2004  } else {
2005  if (!s_areCompatible(Location, eSubtype_unknown)) {
2006  m_Objs["one or more chromosomes are present"];
2007  }
2008  }
2009  }
2010  }
2011  }
2012 }
2013 
2014 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
#define bool
Definition: bool.h:34
const string & GetTaxname(void) const
Definition: BioSource.cpp:340
bool IsSetOrgMod(void) const
Definition: BioSource.cpp:415
const COrgName & GetOrgname(void) const
Definition: BioSource.cpp:410
bool IsSetTaxname(void) const
Definition: BioSource.cpp:335
CBioseq_EditHandle –.
CBioseq_Handle –.
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
bool IsSetLength(void) const
Definition: Bioseq.cpp:355
bool IsNa(void) const
Definition: Bioseq.cpp:345
bool IsAa(void) const
Definition: Bioseq.cpp:350
static string GetStructuredCommentPrefix(const CUser_object &user, bool normalize=true)
CRef< CDiscrepancyObject > SeqdescObjRef(const CSeqdesc &desc, const CObject *fix=nullptr, const CObject *more=nullptr)
static bool IsGui()
CFeat_CI –.
Definition: feat_ci.hpp:64
void Search(const char *input, VoidCall1 found_callback) const
CRef –.
Definition: ncbiobj.hpp:618
CRegexp –.
Definition: regexp.hpp:70
virtual vector< CRef< CReportItem > > GetSubitems() const =0
static void Add(TReportObjectList &list, TReportObjectSet &hash, CReportObj &obj, bool unique=true)
TReportObjectList & GetObjects()
CReportNode & Severity(CReportItem::ESeverity s)
void SetCount(size_t n)
TNodeMap & GetMap()
CRef< CReportItem > Export(CDiscrepancyCore &test, bool unique=true) const
static bool Exist(TReportObjectSet &hash, CReportObj &obj)
CScope –.
Definition: scope.hpp:92
static EFeatureLocationAllowed AllowedFeatureLocation(ESubtype subtype)
@ eFeatureLocationAllowed_NucOnly
@ eFeatureLocationAllowed_ProtOnly
@ eFeatureLocationAllowed_Any
ESubtype GetSubtype(void) const
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
CSeq_feat_EditHandle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
static bool IsAa(EMol mol)
Definition: Seq_inst.hpp:99
static bool IsNa(EMol mol)
Definition: Seq_inst.hpp:90
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
Base class for all serializable objects.
Definition: serialbase.hpp:150
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
Template class for iteration on objects of class C.
Definition: iterator.hpp:673
@ eObjectType_StructuredComment
EObjectType GetObjectType() const
void erase(iterator pos)
Definition: map.hpp:167
size_type size() const
Definition: map.hpp:148
Definition: map.hpp:338
Definition: set.hpp:45
char value[7]
Definition: config.c:431
API (CDeflineGenerator) for computing sequences' titles ("definitions").
static uch flags
vector< CRef< CReportObj > > TReportObjectList
@ eFatal
@ eAll
@ eTSA
@ eBig
@ eDisc
@ eOncaller
@ eSubmitter
@ eSmart
#define DISCREPANCY_AUTOFIX(name)
#define DISCREPANCY_CASE1(name, type, group, descr,...)
#define DISCREPANCY_CASE0(name, sname, type, group, descr)
#define DISCREPANCY_CASE(name, type, group, descr)
#define DISCREPANCY_SUMMARIZE(name)
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
vector< CConstRef< CObject > > GetObjects(CSeq_entry_Handle seh, const string &field, CFieldNamePanel::EFieldType field_type, int subtype, const string &ncRNA_class, CConstRef< objects::CSeq_submit > submit, CRef< CEditingActionConstraint > constraint, vector< CSeq_entry_Handle > *descr_context=nullptr)
void ReverseComplement(const BidirectionalIterator &first, const BidirectionalIterator &last)
constexpr size_t ArraySize(const Element(&)[Size])
Definition: ncbimisc.hpp:1532
string
Definition: cgiapp.hpp:687
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
@ eUnknown
Definition: app_popup.hpp:72
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
@ eAcc_wgs
Definition: Seq_id.hpp:264
@ eAcc_division_mask
Definition: Seq_id.hpp:273
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void SetDescr(TDescr &v) const
void SetInst_Mol(TInst_Mol v) const
const CSeqFeatData & GetData(void) const
void Remove(void) const
Remove the feature from Seq-annot.
void SetInst(TInst &v) const
void Remove(ERemoveMode mode=eRemoveSeq_entry) const
bool IsSetData(void) const
const TInst & GetInst(void) const
const CSeq_feat & GetMappedFeature(void) const
Feature mapped to the master sequence.
CConstRef< CSeq_feat > GetSeq_feat(void) const
Get current seq-feat.
bool IsMatch(CTempString str, TMatch flags=fMatch_default)
Check existence substring which match a specified pattern.
Definition: regexp.cpp:193
@ fCompile_ignore_case
Definition: regexp.hpp:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
#define kEmptyStr
Definition: ncbistr.hpp:123
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2989
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5083
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2887
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5411
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5352
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5383
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3401
const char *const kEmptyCStr
Empty "C" string (points to a '\0').
Definition: ncbistr.cpp:68
static int CompareCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive compare of a substring with another string.
Definition: ncbistr.cpp:135
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
static const char label[]
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
Definition: BioSource_.hpp:539
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: BioSource_.hpp:527
void SetGenome(TGenome value)
Assign a value to Genome data member.
Definition: BioSource_.hpp:428
bool IsSetGenome(void) const
Check if a value has been assigned to Genome data member.
Definition: BioSource_.hpp:397
EGenome
biological context
Definition: BioSource_.hpp:97
const TStr & GetStr(void) const
Get the variant data.
bool IsSetData(void) const
the object itself Check if a value has been assigned to Data data member.
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool IsSetType(void) const
type of object within class Check if a value has been assigned to Type data member.
bool IsStrs(void) const
Check if variant Strs is selected.
const TStrs & GetStrs(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
bool IsStr(void) const
Check if variant Str is selected.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TData & GetData(void) const
Get the Data member data.
const TType & GetType(void) const
Get the Type member data.
vector< CStringUTF8 > TStrs
const TMod & GetMod(void) const
Get the Mod member data.
Definition: OrgName_.hpp:839
@ eSubtype_strain
Definition: OrgMod_.hpp:85
@ eSubtype_isolate
Definition: OrgMod_.hpp:100
bool IsSetData(void) const
the specific data Check if a value has been assigned to Data data member.
Definition: Seq_feat_.hpp:913
bool IsSetQual(void) const
qualifiers Check if a value has been assigned to Qual data member.
Definition: Seq_feat_.hpp:1135
const TQual & GetQual(void) const
Get the Qual member data.
Definition: Seq_feat_.hpp:1147
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
const TAccession & GetAccession(void) const
Get the Accession member data.
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eClass_pop_set
population study
@ eClass_phy_set
phylogenetic study
@ eClass_mut_set
set of mutations
@ eClass_eco_set
ecological sample study
@ eClass_small_genome_set
viral segments or mitochondrial minicircles
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
bool IsSetCompleteness(void) const
Check if a value has been assigned to Completeness data member.
Definition: MolInfo_.hpp:569
const TUser & GetUser(void) const
Get the variant data.
Definition: Seqdesc_.cpp:384
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
TTopology GetTopology(void) const
Get the Topology member data.
Definition: Seq_inst_.hpp:733
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
bool IsSetRepr(void) const
Check if a value has been assigned to Repr data member.
Definition: Seq_inst_.hpp:546
bool IsSetMol(void) const
Check if a value has been assigned to Mol data member.
Definition: Seq_inst_.hpp:593
bool CanGetTopology(void) const
Check if it is safe to call GetTopology method.
Definition: Seq_inst_.hpp:714
const TTitle & GetTitle(void) const
Get the variant data.
Definition: Seqdesc_.hpp:1032
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
bool IsSetBiomol(void) const
Check if a value has been assigned to Biomol data member.
Definition: MolInfo_.hpp:422
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:366
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
bool IsSetInst(void) const
the sequence data Check if a value has been assigned to Inst data member.
Definition: Bioseq_.hpp:324
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
TMol GetMol(void) const
Get the Mol member data.
Definition: Seq_inst_.hpp:612
bool IsSetLength(void) const
length of sequence in residues Check if a value has been assigned to Length data member.
Definition: Seq_inst_.hpp:640
TSource & SetSource(void)
Select the variant.
Definition: Seqdesc_.cpp:572
bool IsSetDescr(void) const
descriptors Check if a value has been assigned to Descr data member.
Definition: Bioseq_.hpp:303
bool IsSet(void) const
Check if a value has been assigned to data member.
Definition: Seq_descr_.hpp:154
TBiomol GetBiomol(void) const
Get the Biomol member data.
Definition: MolInfo_.hpp:447
void SetBiomol(TBiomol value)
Assign a value to Biomol data member.
Definition: MolInfo_.hpp:453
bool CanGetId(void) const
Check if it is safe to call GetId method.
Definition: Bioseq_.hpp:284
bool IsSetTech(void) const
Check if a value has been assigned to Tech data member.
Definition: MolInfo_.hpp:472
const TFtable & GetFtable(void) const
Get the variant data.
Definition: Seq_annot_.hpp:621
TCompleteness GetCompleteness(void) const
Get the Completeness member data.
Definition: MolInfo_.hpp:594
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_annot_.hpp:873
bool IsSetId(void) const
equivalent identifiers Check if a value has been assigned to Id data member.
Definition: Bioseq_.hpp:278
Tdata & Set(void)
Assign a value to data member.
Definition: Seq_descr_.hpp:172
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
const TMolinfo & GetMolinfo(void) const
Get the variant data.
Definition: Seqdesc_.cpp:588
TMolinfo & SetMolinfo(void)
Select the variant.
Definition: Seqdesc_.cpp:594
bool CanGetInst(void) const
Check if it is safe to call GetInst method.
Definition: Bioseq_.hpp:330
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eCompleteness_unknown
Definition: MolInfo_.hpp:155
@ eCompleteness_complete
complete biological entity
Definition: MolInfo_.hpp:156
@ eTech_targeted
targeted locus sets/studies
Definition: MolInfo_.hpp:147
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eBiomol_pre_RNA
precursor RNA of any sort really
Definition: MolInfo_.hpp:102
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
where boath are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is smart and slim</td> n<td> orig</td> n</tr> n<tr> n<td> last_modified</td> n<td> optional</td> n<td> Integer</td> n<td class=\"description\"> The blob last modification If provided then the exact match will be requested with n the Cassandra storage corresponding field value</td> n<td> Positive integer Not provided means that the most recent match will be selected</td> n<td></td> n</tr> n<tr> n<td> use_cache</td> n<td> optional</td> n<td> String</td> n<td class=\"description\"> The option controls if the Cassandra LMDB cache and or database should be used It n affects the seq id resolution step and the blob properties lookup step The following n options are BIOSEQ_INFO and BLOB_PROP at all
int i
yy_size_t n
void ReverseComplementFeature(CSeq_feat &feat, CScope &scope)
Definition: loc_edit.cpp:1068
static MDB_envinfo info
Definition: mdb_load.c:37
Simultaneous search of multiple RegEx patterns in the input string.
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Definition: pointer.h:1149
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int ispunct(Uchar c)
Definition: ncbictype.hpp:68
Int mod(Int i, Int j)
Definition: njn_integer.hpp:67
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
static char tmp[2048]
Definition: utf8.c:42
static const char * prefix[]
Definition: pcregrep.c:405
static const string kFixable
const string kStructuredCommentReport
const string & kPreviouslySeenFields
static const string kMrnaSequenceMinusStrandFeatures
USING_SCOPE(objects)
static bool IsSegmentSubtype(const CBioSource &bio_src)
static bool s_areCompatible(CBioSource::EGenome Location, CSubSource::ESubtype Qualifier)
void UnitTest_FLATFILE_FIND()
Checking that FLATFILE_FIND.inc is in sync with kSpellFixes If the array is changed,...
string AdjustDBLinkFieldName(const string &orig_field_name)
static const CSubSource::ESubtype eSubtype_unknown
const string kMissingDBLink
const string kStructuredCommentObservedPrefixes
const string kSomeIdenticalDeflines
const string &(CBioSource::* FnGet)() const
string GetFieldValueAsString(const CUser_field &field)
const string & kPreviouslySeenObjects
static const string kNonFixable
static bool SuspiciousId(const string &s)
static const string kInconsistentMolinfoTech
void AddUserObjectFieldItems(const CSeqdesc *desc, CReportObj &rep_seq, CReportNode &collector, CReportNode &previously_seen, CDiscrepancyContext &context, const string &object_name, const string &field_prefix=kEmptyStr)
const string kDBLinkObjectList
static const string kInconsistentMolinfoTechSummary
const string kStructuredCommentObservedPrefixesThis
static bool IsATGC(char ch)
string GetSummaryLabel(bool all_present, bool all_same)
static const size_t MIN_SEQUENCE_LEN
const string & kPreviouslySeenFieldsThis
static bool EndsWithSequence(const string &title)
static bool FixTextInObject(CSerialObject *obj, size_t misspell_idx)
void AnalyzeFieldReport(CReportNode &node, bool &all_present, bool &all_same)
const string kSequencesWithGaps
const string kIdenticalDeflines
const string kDeflineExists
static void FindFlatfileText(const char *str, bool *result)
static bool IsMolProd(int biomol)
void AnalyzeField(CReportNode &node, bool &all_present, bool &all_same)
static bool CompareOrGetString(const CBioSource &bio_src, FnIsSet is_set_fn, FnGet get_fn, string &val)
void CopyNode(CReportNode &new_home, CReportNode &original)
const string kStructuredCommentPrevious
const string kNoTaxnameInDefline
static SpellFixData kSpellFixes[]
static bool CompareOrgModValue(const CBioSource &bio_src, COrgMod::TSubtype subtype, string &val)
const string kStructuredCommentFieldPrefix
static const size_t kSpellFixesSize
const string kDBLinkFieldCountTop
static string GetProjectID(const CUser_object &user)
const string kUniqueDeflines
const string kAllUniqueDeflines
static bool IsMicroSatellite(const CSeq_feat &feat)
const string kDBLinkCollect
bool(CBioSource::* FnIsSet)() const
const string kStructuredCommentsSeqs
static const char * str(char *buf, int n)
Definition: stats.c:84
const char * m_misspell
const char * m_correct
else result
Definition: token2.c:20
C++ wrappers for the Perl-compatible regular expression (PCRE) library.
static const char *const features[]
#define const
Definition: zconf.h:230
Modified on Sat Sep 23 04:26:53 2023 by modify_doxy.py rev. 669887