NCBI C++ ToolKit
sequence_tests.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: sequence_tests.cpp 101470 2023-12-18 15:47:26Z kans $
2  * =========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * =========================================================================
25  *
26  * Authors: Colleen Bollin, based on similar discrepancy tests
27  *
28  */
29 
30 #include <ncbi_pch.hpp>
32 #include <objects/seq/MolInfo.hpp>
34 #include <objmgr/seqdesc_ci.hpp>
35 #include <objmgr/feat_ci.hpp>
36 #include <objmgr/seq_vector.hpp>
37 #include <objmgr/bioseq_ci.hpp>
41 #include <objects/seq/Seq_ext.hpp>
51 #include <objects/pub/Pub.hpp>
57 #include <util/xregexp/regexp.hpp>
60 
61 #include "discrepancy_core.hpp"
62 #include "utils.hpp"
64 
65 
69 
70 
71 // DUP_DEFLINE
72 
// Report labels for the DUP_DEFLINE test.
// NOTE(review): "[n]", "[s]" and "[is]" look like placeholders expanded by the
// reporting layer (count / plural forms) -- confirm against the report code.
73 const string kUniqueDeflines = "[n] definition line[s] [is] unique";
74 const string kIdenticalDeflines = "[n] definition line[s] [is] identical";
75 const string kAllUniqueDeflines = "All deflines are unique";
76 const string kSomeIdenticalDeflines = "Defline Problem Report";
77 
78 
79 DISCREPANCY_CASE(DUP_DEFLINE, SEQUENCE, eOncaller, "Definition lines should be unique")
80 {
81  const CBioseq& bioseq = context.CurrentBioseq();
82  if (bioseq.IsAa()) {
83  return;
84  }
85 
86  auto bsh = context.GetBioseqHandle(bioseq);
87  if (bsh) {
88  sequence::CDeflineGenerator deflineGenerator;
89  auto defline = deflineGenerator.GenerateDefline(bsh, 0);
90  if (!NStr::IsBlank(defline)) {
91  if (bioseq.IsSetDescr() && bioseq.GetDescr().IsSet()) {
92  for (const auto& desc : context.GetSeqdesc()) {
93  if (desc.IsTitle()) {
94  m_Objs[defline].Add(*context.SeqdescObjRef(desc));
95  return;
96  }
97  }
98  }
99  m_Objs[defline].Add(*context.BioseqObjRef());
100  }
101  }
102 }
103 
104 
// Body of what appears to be the DUP_DEFLINE summarize step.
// NOTE(review): this extract is missing listing lines 105 (the header of this
// body), 111 (presumably the declaration of `tmp`), 115 and 124; the comments
// below describe only the statements that are visible.
106 {
// Nothing was collected, so there is nothing to report.
107  if (m_Objs.empty()) {
108  return;
109  }
110  bool all_unique = true;
// Each map entry is keyed by a definition line (see the collector, which
// indexes m_Objs by the generated defline).
112  for (auto& it : m_Objs.GetMap()) {
113  TReportObjectList& list = it.second->GetObjects();
// NOTE(review): the body of the single-object branch (listing line 115) is not visible.
114  if (list.size() == 1) {
116  }
// Several objects share this definition line: group them under the
// "identical" label, tagged with the defline text itself.
117  else if (list.size() > 1) {
118  tmp[kSomeIdenticalDeflines][kIdenticalDeflines + "[*" + it.first + "*]"].Add(list);
119  all_unique = false;
120  }
121  }
// No duplicates were seen: discard what was accumulated in tmp.
// NOTE(review): listing line 124, which follows the clear(), is not visible.
122  if (all_unique) {
123  tmp.clear();
125  }
126  m_ReportItems = tmp.Export(*this)->GetSubitems();
127 }
128 
129 
130 // TERMINAL_NS
131 
132 DISCREPANCY_CASE(TERMINAL_NS, SEQUENCE, eDisc | eSmart | eBig | eFatal, "Ns at end of sequences")
133 {
134  const CBioseq& bioseq = context.CurrentBioseq();
135  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
136  const CSeqSummary& sum = context.GetSeqSummary();
137  if (sum.StartsWithGap || sum.EndsWithGap) {
138  m_Objs["[n] sequence[s] [has] terminal Ns"].Fatal().Add(*context.BioseqObjRef());
139  }
140  }
141 }
142 
143 
144 // SHORT_PROT_SEQUENCES
145 
// SHORT_PROT_SEQUENCES: looks at protein bioseqs whose declared length is
// below 50 and records them under the "shorter than 50 aa" label.
// NOTE(review): listing line 151 is missing from this extract. It presumably
// holds a condition on `molinfo` that opens the scope closed on listing line
// 153 (the test description mentions an exemption for partial proteins) --
// confirm against the original file.
146 DISCREPANCY_CASE(SHORT_PROT_SEQUENCES, SEQUENCE, eDisc | eOncaller | eSmart, "Protein sequences should be at least 50 aa, unless they are partial")
147 {
148  const CBioseq& bioseq = context.CurrentBioseq();
// Only amino-acid instances with an explicit length under 50 are considered.
149  if (bioseq.CanGetInst() && bioseq.GetInst().IsAa() && bioseq.GetInst().IsSetLength() && bioseq.GetInst().GetLength() < 50) {
150  const CSeqdesc* molinfo = context.GetMolinfo();
152  m_Objs["[n] protein sequences are shorter than 50 aa."].Add(*context.BioseqObjRef(), false);
153  }
154  }
155 }
156 
157 
158 // COMMENT_PRESENT
159 
160 DISCREPANCY_CASE(COMMENT_PRESENT, DESC, eOncaller, "Comment descriptor present")
161 {
162  for (const auto& desc : context.GetSeqdesc()) {
163  if (desc.IsComment()) {
164  m_Objs[desc.GetComment()].Add(*context.SeqdescObjRef(desc));
165  }
166  }
167 }
168 
169 
170 DISCREPANCY_SUMMARIZE(COMMENT_PRESENT)
171 {
172  if (!m_Objs.empty()) {
173  CReportNode rep;
174  string label = m_Objs.GetMap().size() == 1 ? "[n] comment descriptor[s] were found (all same)" : "[n] comment descriptor[s] were found (some different)";
175  for (auto it : m_Objs.GetMap()) {
176  for (auto obj : it.second->GetObjects()) {
177  rep[label].Add(*obj);
178  }
179  }
180  m_ReportItems = rep.Export(*this)->GetSubitems();
181  }
182 }
183 
184 
185 // MRNA_ON_WRONG_SEQUENCE_TYPE
186 
// MRNA_ON_WRONG_SEQUENCE_TYPE: records every mRNA feature found on a bioseq
// that passes the filters below.
// NOTE(review): listing lines 199-200 are missing from this extract; they
// continue the condition that starts on listing line 198 (presumably checks
// on the genome value of the biosource) -- confirm against the original file.
187 DISCREPANCY_CASE(MRNA_ON_WRONG_SEQUENCE_TYPE, SEQUENCE, eDisc | eOncaller, "Eukaryotic sequences that are not genomic or macronuclear should not have mRNA features")
188 {
189  const CBioseq& bioseq = context.CurrentBioseq();
// Only DNA instances are examined.
190  if (!bioseq.IsSetInst() || !bioseq.GetInst().IsSetMol() || bioseq.GetInst().GetMol() != CSeq_inst::eMol_dna) {
191  return;
192  }
// Only sequences whose MolInfo biomol is genomic are examined.
193  const CSeqdesc* molinfo = context.GetMolinfo();
194  if (!molinfo || !molinfo->GetMolinfo().IsSetBiomol() || molinfo->GetMolinfo().GetBiomol() != CMolInfo::eBiomol_genomic) {
195  return;
196  }
// A biosource with a genome value is required, and the source must be
// eukaryotic according to the context.
197  const CSeqdesc* biosrc = context.GetBiosource();
198  if (!biosrc || !biosrc->GetSource().IsSetGenome() ||
201  !context.IsEukaryotic(&biosrc->GetSource())) {
202  return;
203  }
// Every mRNA feature on a sequence that got this far is reported.
204  for (const CSeq_feat* feat : context.FeatMRNAs()) {
205  m_Objs["[n] mRNA[s] [is] located on eukaryotic sequence[s] that [does] not have genomic or plasmid source[s]"].Add(*context.SeqFeatObjRef(*feat));
206  }
207 }
208 
209 
210 // DISC_GAPS
211 
212 const string kSequencesWithGaps = "[n] sequence[s] contain[S] gaps";
213 
214 DISCREPANCY_CASE(GAPS, SEQUENCE, eDisc | eSubmitter | eSmart | eBig, "Sequences with gaps")
215 {
216  const CBioseq& bioseq = context.CurrentBioseq();
217  if (bioseq.CanGetInst() && bioseq.GetInst().IsSetRepr() && bioseq.GetInst().GetRepr() == CSeq_inst::eRepr_delta) {
218  const CSeqSummary& sum = context.GetSeqSummary();
219  bool has_gaps = !!sum.Gaps;
220  if (!has_gaps) {
221  const CSeq_annot* annot = nullptr;
222  for (auto it : bioseq.GetAnnot()) {
223  if (it->IsFtable()) {
224  annot = it;
225  break;
226  }
227  }
228  if (annot) {
229  for (const auto& feat : annot->GetData().GetFtable()) {
230  if (feat->IsSetData() && feat->GetData().GetSubtype() == CSeqFeatData::eSubtype_gap) {
231  has_gaps = true;
232  break;
233  }
234  }
235  }
236  }
237  if (has_gaps) {
238  m_Objs[kSequencesWithGaps].Add(*context.BioseqObjRef());
239  }
240  }
241 }
242 
243 
244 // BIOPROJECT_ID
245 
246 DISCREPANCY_CASE(BIOPROJECT_ID, SEQUENCE, eOncaller, "Sequences with BioProject IDs")
247 {
248  const CBioseq& bioseq = context.CurrentBioseq();
249  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
250  for (const auto& desc : context.GetAllSeqdesc()) {
251  if (desc.IsUser()) {
252  const CUser_object& user = desc.GetUser();
253  if (user.IsSetData() && user.IsSetType() && user.GetType().IsStr() && user.GetType().GetStr() == "DBLink") {
254  for (const auto& user_field : user.GetData()) {
255  if (user_field->IsSetLabel() && user_field->GetLabel().IsStr() && user_field->GetLabel().GetStr() == "BioProject" && user_field->IsSetData() && user_field->GetData().IsStrs()) {
256  const CUser_field::C_Data::TStrs& strs = user_field->GetData().GetStrs();
257  if (!strs.empty() && !strs[0].empty()) {
258  m_Objs["[n] sequence[s] contain[S] BioProject IDs"].Add(*context.BioseqObjRef());
259  return;
260  }
261  }
262  }
263  }
264  }
265  }
266  }
267 }
268 
269 
270 // MISSING_DEFLINES
271 
272 DISCREPANCY_CASE(MISSING_DEFLINES, SEQUENCE, eAll, "Missing definition lines")
273 {
274  const CBioseq& bioseq = context.CurrentBioseq();
275  if (bioseq.CanGetInst() && !bioseq.GetInst().IsAa() && !context.GetTitle()) {
276  m_Objs["[n] bioseq[s] [has] no definition line"].Add(*context.BioseqObjRef());
277  }
278 }
279 
280 
281 
282 // N_RUNS_14
283 
284 DISCREPANCY_CASE(N_RUNS_14, SEQUENCE, eDisc | eTSA, "Runs of more than 14 Ns")
285 {
286  const CBioseq& bioseq = context.CurrentBioseq();
287  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
288  const CSeqSummary& sum = context.GetSeqSummary();
289  if (sum.MaxN > 14) {
290  m_Objs["[n] sequence[s] [has] runs of 15 or more Ns"].Add(*context.BioseqObjRef());
291  }
292  }
293 }
294 
295 
296 // EXTERNAL_REFERENCE
297 
298 DISCREPANCY_CASE(EXTERNAL_REFERENCE, SEQUENCE, eDisc | eOncaller, "Sequence has external reference")
299 {
300  const CBioseq& bioseq = context.CurrentBioseq();
301  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
302  const CSeqSummary& sum = context.GetSeqSummary();
303  if (sum.HasRef) {
304  m_Objs["[n] sequence[s] [has] external references"].Add(*context.BioseqObjRef());
305  }
306  }
307 }
308 
309 
310 // 10_PERCENTN
311 
312 DISCREPANCY_CASE0(TEN_PERCENTN, "10_PERCENTN", SEQUENCE, eDisc | eSubmitter | eSmart | eTSA, "Greater than 10 percent Ns")
313 {
314  const double MIN_N_PERCENTAGE = 10.0;
315 
316  const CBioseq& bioseq = context.CurrentBioseq();
317  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
318  const CSeqSummary& sum = context.GetSeqSummary();
319  if (!sum.HasRef && sum.N * 100. / sum.Len > MIN_N_PERCENTAGE) {
320  m_Objs["[n] sequence[s] [has] > 10% Ns"].Add(*context.BioseqObjRef());
321  }
322  }
323 }
324 
325 
326 // FEATURE_COUNT
327 
328 DISCREPANCY_CASE(FEATURE_COUNT, FEAT, eOncaller | eSubmitter | eSmart, "Count features present or missing from sequences")
329 {
330  // context.SetGui(true); // for debug only!
331  for (const auto& feat : context.GetFeat()) {
332  if (!feat.IsSetData() || feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_prot) {
333  continue;
334  }
335  string key = feat.GetData().IsGene() ? "gene" : feat.GetData().GetKey(CSeqFeatData::eVocabulary_genbank);
336  m_Objs[key + ": [n] present"].Info().Incr();
337  }
338  if (CDiscrepancySet::IsGui() && context.IsBioseq()) {
339  const CBioseq& bioseq = context.CurrentBioseq();
340  bool na = false;
341  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
342  na = true;
343  }
344  CRef<CReportObj> rep(context.BioseqObjRef());
345  for (const auto& feat : context.GetAllFeat()) {
346  if (!feat.IsSetData() || feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_prot) {
347  continue;
348  }
349  string key = feat.GetData().IsGene() ? "gene" : feat.GetData().GetKey(CSeqFeatData::eVocabulary_genbank);
350  key = to_string(feat.GetData().GetSubtype()) + " " + key;
351  m_Objs[kEmptyCStr][key].Add(*rep, false);
352  }
353  m_Objs[kEmptyCStr][na ? "N" : "A"].Add(*rep);
354  }
355 }
356 
357 
// FEATURE_COUNT summary.  In GUI mode it converts the per-bioseq bookkeeping
// stored under the empty-string node into "bioseq has (k) <key> features"
// items, then removes that node; xSummarize() is called in every case.
// NOTE(review): listing lines 367, 370 and 375 are missing from this extract.
// Lines 370 and 375 presumably open the conditional scopes that are closed on
// listing lines 374 and 379 -- confirm against the original file.
358 DISCREPANCY_SUMMARIZE(FEATURE_COUNT)
359 {
360  if (CDiscrepancySet::IsGui()) {
361  for (auto& it : m_Objs[kEmptyCStr].GetMap()) {
// "N" and "A" hold the bioseq lists themselves, not a feature key.
362  if (it.first == "N" || it.first == "A") {
363  continue;
364  }
// Feature keys were stored by the collector as "<subtype number> <key>";
// keep only the part after the first space.
365  size_t n = it.first.find(' ');
366  string key = it.first.substr(n + 1);
368  string label = key + ": [n] present";
// Per-bioseq counter; entries seeded below start at zero.
369  map<CReportObj*, size_t> obj2num;
371  for (auto& obj : m_Objs[kEmptyStr]["N"].GetObjects()) {
372  obj2num[&*obj] = 0;
373  }
374  }
376  for (auto& obj : m_Objs[kEmptyStr]["A"].GetObjects()) {
377  obj2num[&*obj] = 0;
378  }
379  }
// One increment per occurrence of the bioseq under this feature key.
380  for (auto& obj : m_Objs[kEmptyStr][it.first].GetObjects()) {
381  obj2num[&*obj]++;
382  }
383  for (auto& pp : obj2num) {
384  m_Objs[label]["[n] bioseq[s] [has] [(]" + to_string(pp.second) + "[)] " + key + " features"].Info().Add(*pp.first);
385  }
386  }
// The bookkeeping node is dropped before summarizing.
387  m_Objs.GetMap().erase(kEmptyCStr);
388  }
389  xSummarize();
390 }
391 
392 
393 // EXON_ON_MRNA
394 
395 DISCREPANCY_CASE(EXON_ON_MRNA, SEQUENCE, eOncaller | eSmart, "mRNA sequences should not have exons")
396 {
397  const CSeqdesc* molinfo = context.GetMolinfo();
398  if (molinfo && molinfo->GetMolinfo().IsSetBiomol() && molinfo->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_mRNA) {
399  if (context.FeatExons().size()) {
400  m_Objs["[n] mRNA bioseq[s] [has] exon features"].Add(*context.BioseqObjRef(CDiscrepancyContext::eFixSet));
401  }
402  }
403 }
404 
405 
// EXON_ON_MRNA autofix: walks the features of the reported bioseq, removes
// those of subtype exon, marks the report object fixed, and returns a report
// carrying the number removed (or a null report when nothing was removed).
// NOTE(review): listing line 413 is missing from this extract; it presumably
// declares `eh` fresh on every iteration -- confirm against the original file.
406 DISCREPANCY_AUTOFIX(EXON_ON_MRNA)
407 {
408  unsigned int n = 0;
// NOTE(review): the dynamic_cast result is dereferenced without a null check.
409  const CBioseq* seq = dynamic_cast<const CBioseq*>(context.FindObject(*obj));
410  CBioseq_EditHandle handle = context.GetBioseqHandle(*seq);
411  CFeat_CI ci(handle);
412  while (ci) {
414  if (ci->IsSetData() && ci->GetData().GetSubtype() == CSeqFeatData::eSubtype_exon) {
415  eh = CSeq_feat_EditHandle(context.GetScope().GetSeq_featHandle(ci->GetMappedFeature()));
416  }
// The iterator is advanced before the feature is removed.
417  ++ci;
418  if (eh) {
419  eh.Remove();
420  n++;
421  }
422  }
423  obj->SetFixed();
424  return CRef<CAutofixReport>(n ? new CAutofixReport("EXON_ON_MRNA: [n] exon[s] removed", n) : nullptr);
425 }
426 
427 
428 // INCONSISTENT_MOLINFO_TECH
429 
430 DISCREPANCY_CASE(INCONSISTENT_MOLINFO_TECH, SEQUENCE, eDisc | eSmart, "Inconsistent Molinfo Techniques")
431 {
432  const CBioseq& bioseq = context.CurrentBioseq();
433  if (bioseq.CanGetInst() && !bioseq.GetInst().IsAa()) {
434  string moltype;
435  const CSeqdesc* molinfo = context.GetMolinfo();
436  if (molinfo) {
437  if (molinfo->GetMolinfo().IsSetTech()) {
438  m_Objs[to_string(molinfo->GetMolinfo().GetTech())].Add(*context.BioseqObjRef());
439  }
440  else {
441  m_Objs[kEmptyStr].Add(*context.BioseqObjRef());
442  }
443  }
444  }
445 }
446 
447 
448 static const string kInconsistentMolinfoTechSummary = "Molinfo Technique Report";
449 static const string kInconsistentMolinfoTech = "[n] Molinfo[s] [is] missing field technique";
450 
451 DISCREPANCY_SUMMARIZE(INCONSISTENT_MOLINFO_TECH)
452 {
453  if (m_Objs.empty()) {
454  return;
455  }
456 
457  CReportNode report;
458 
459  CReportNode::TNodeMap& the_map = m_Objs.GetMap();
460 
461  bool same = true;
462  string tech;
463 
464  size_t num_of_missing = 0,
465  num_of_bioseqs = 0;
466 
467  for (auto it : the_map) {
468  num_of_bioseqs += it.second->GetObjects().size();
469  if (it.first.empty()) {
470  num_of_missing += it.second->GetObjects().size();
471  continue;
472  }
473  if (tech.empty()) {
474  tech = it.first;
475  }
476  else if (tech != it.first) {
477  same = false;
478  }
479  }
480  string summary = kInconsistentMolinfoTechSummary + " (";
481  if (num_of_missing == num_of_bioseqs || (same && !num_of_missing)) {
482  return;
483  }
484  summary += num_of_missing ? "some missing, " : "all present, ";
485  summary += same ? "all same)" : "some different)";
486  if (num_of_missing) {
487  if (num_of_missing == num_of_bioseqs) {
488  report[summary].SetCount(num_of_missing);
489  }
490  else {
491  report[summary][kInconsistentMolinfoTech].Add(the_map[kEmptyStr]->GetObjects());
492  }
493  }
494 
495  m_ReportItems = report.Export(*this)->GetSubitems();
496 }
497 
498 
499 // TITLE_ENDS_WITH_SEQUENCE
500 
/// True only for the upper-case nucleotide letters A, T, G and C.
static bool IsATGC(char ch)
{
    switch (ch) {
    case 'A':
    case 'T':
    case 'G':
    case 'C':
        return true;
    default:
        return false;
    }
}


/// True when the title ends in an unbroken run of at least 19 characters
/// accepted by IsATGC.
static bool EndsWithSequence(const string& title)
{
    static const size_t MIN_TITLE_SEQ_LEN = 19; // 19 was just copied from C-toolkit

    // Walk backwards from the end, stopping at the first character that is
    // not a nucleotide letter or once the run is long enough.
    size_t run = 0;
    size_t pos = title.size();
    while (pos > 0 && run < MIN_TITLE_SEQ_LEN && IsATGC(title[pos - 1])) {
        ++run;
        --pos;
    }

    return run >= MIN_TITLE_SEQ_LEN;
}
526 
527 
528 DISCREPANCY_CASE(TITLE_ENDS_WITH_SEQUENCE, DESC, eDisc | eSubmitter | eSmart | eBig, "Sequence characters at end of defline")
529 {
530  for (auto& desc : context.GetSeqdesc()) {
531  if (desc.IsTitle() && EndsWithSequence(desc.GetTitle())) {
532  m_Objs["[n] defline[s] appear[S] to end with sequence characters"].Add(*context.SeqdescObjRef(desc));
533  }
534  }
535 }
536 
537 
538 // FEATURE_MOLTYPE_MISMATCH
539 
540 DISCREPANCY_CASE(FEATURE_MOLTYPE_MISMATCH, SEQUENCE, eOncaller, "Sequences with rRNA or misc_RNA features should be genomic DNA")
541 {
542  bool is_dna = false;
543  bool is_genomic = false;
544  const CBioseq& bioseq = context.CurrentBioseq();
545  if (bioseq.CanGetInst() && bioseq.GetInst().IsSetMol() && bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna) {
546  is_dna = true;
547  }
548  auto molinfo = context.GetMolinfo();
549  if (molinfo && molinfo->GetMolinfo().IsSetBiomol() && molinfo->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_genomic) {
550  is_genomic = true;
551  }
552  if (!is_genomic || !is_dna) {
553  const CSeq_annot* annot = nullptr;
554  if (bioseq.IsSetAnnot()) {
555  for (auto& annot_it : bioseq.GetAnnot()) {
556  if (annot_it->IsFtable()) {
557  annot = annot_it;
558  break;
559  }
560  }
561  }
562  if (annot) {
563  for (auto& feat : annot->GetData().GetFtable()) {
564  if (feat->IsSetData()) {
565  CSeqFeatData::ESubtype subtype = feat->GetData().GetSubtype();
566  if (subtype == CSeqFeatData::eSubtype_rRNA || subtype == CSeqFeatData::eSubtype_otherRNA) {
567  m_Objs["[n] sequence[s] [has] rRNA or misc_RNA features but [is] not genomic DNA"].Add(*context.BioseqObjRef(CDiscrepancyContext::eFixSelf));
568  break;
569  }
570  }
571  }
572  }
573  }
574 }
575 
// FEATURE_MOLTYPE_MISMATCH autofix: sets the instance molecule type of the
// reported bioseq to DNA, finds its MolInfo descriptor (creating one when
// none exists), marks the report object fixed and returns a report for one
// bioseq.
// NOTE(review): listing line 599 is missing from this extract; given the
// report text it presumably sets the MolInfo biomol to genomic -- confirm
// against the original file.
576 DISCREPANCY_AUTOFIX(FEATURE_MOLTYPE_MISMATCH)
577 {
// NOTE(review): the dynamic_cast result is dereferenced without a null check.
578  const CBioseq* seq = dynamic_cast<const CBioseq*>(context.FindObject(*obj));
579  CBioseq_EditHandle edit_handle = context.GetBioseqHandle(*seq);
580  edit_handle.SetInst_Mol(CSeq_inst::eMol_dna);
581  CSeq_descr& descrs = edit_handle.SetDescr();
582  CMolInfo* molinfo = nullptr;
// Reuse the first existing MolInfo descriptor, if there is one.
583  if (descrs.IsSet()) {
584  for (auto descr : descrs.Set()) {
585  if (descr->IsMolinfo()) {
586  molinfo = &(descr->SetMolinfo());
587  break;
588  }
589  }
590  }
// Otherwise append a new descriptor holding a MolInfo.
591  if (molinfo == nullptr) {
592  CRef<CSeqdesc> new_descr(new CSeqdesc);
593  molinfo = &(new_descr->SetMolinfo());
594  descrs.Set().push_back(new_descr);
595  }
// Defensive check; the block above always assigns molinfo.
596  if (molinfo == nullptr) {
597  return CRef<CAutofixReport>();
598  }
600  obj->SetFixed();
601  return CRef<CAutofixReport>(new CAutofixReport("FEATURE_MOLTYPE_MISMATCH: Moltype was set to genomic for [n] bioseq[s]", 1));
602 }
603 
604 
605 // INCONSISTENT_DBLINK
606 
// Node names used by the INCONSISTENT_DBLINK test: the "missing DBLink"
// report label, plus bookkeeping nodes that the summarize step erases or
// rewrites before reporting.
// NOTE(review): in this extract kDBLinkFieldCountTop is only ever erased,
// never written -- it may be a leftover.
607 const string kMissingDBLink = "[n] Bioseq [is] missing DBLink object";
608 const string kDBLinkObjectList = "DBLink Objects";
609 const string kDBLinkFieldCountTop = "DBLink Fields";
610 const string kDBLinkCollect = "DBLink Collection";
611 
612 string GetFieldValueAsString(const CUser_field& field)
613 {
614  string value;
615 
616  if (field.GetData().IsStr()) {
617  value = field.GetData().GetStr();
618  } else if (field.GetData().IsStrs()) {
619  for (const string& s : field.GetData().GetStrs()) {
620  if (!NStr::IsBlank(value)) {
621  value += "; ";
622  }
623  value += s;
624  }
625  }
626  return value;
627 }
628 
629 
// Keys of the bookkeeping sub-nodes kept in the "previously seen" node by
// AddUserObjectFieldItems.  Declared as plain `const string` objects, like
// every other label constant in this file, instead of `const string&`
// references bound to temporaries.
const string kPreviouslySeenFields = "Previously Seen Fields";
const string kPreviouslySeenFieldsThis = "Previously Seen Fields This";
const string kPreviouslySeenObjects = "Previously Seen Objects";
633 
// Parameter list and body of a helper that the DISCREPANCY_CASE blocks in
// this file call as AddUserObjectFieldItems.
// NOTE(review): listing line 634 (return type and function name) and line 642
// (presumably the `context` parameter that the body uses) are missing from
// this extract.
//
// desc            user-object descriptor to record, or null to record the
//                 object as missing on rep_seq
// rep_seq         report object used for the "missing field" entries when
//                 desc is null
// collector       node receiving the per-field "has field ... value" and
//                 "missing field" entries
// previously_seen bookkeeping node holding the fields and objects seen so far
// object_name     label of the object kind, used in messages and as a key
// field_prefix    text prepended to every field name in the collector
635 (//CConstRef<CSeqdesc> desc,
636  const CSeqdesc* desc,
637  //CConstRef<CBioseq> seq,
638  //const CSeqSummary* info,
639  CReportObj& rep_seq,
640  CReportNode& collector,
641  CReportNode& previously_seen,
643  const string& object_name,
644  const string& field_prefix = kEmptyStr)
645 {
// No descriptor: rep_seq is reported as missing every field recorded so far,
// for every object name under kPreviouslySeenFields.
646  if (!desc) {
647  // add missing for all previously seen fields
648  for (auto& obj : previously_seen[kPreviouslySeenFields].GetMap()) {
649  for (auto& z : obj.second->GetMap()) {
650  collector[field_prefix + z.first][" [n] " + object_name + "[s] [is] missing field " + field_prefix + z.first]
651  //.Add(*context.NewBioseqObj(seq, info, eKeepRef), false);
652  //.Add(*context.NewBioseqObj(seq, info));
653  .Add(rep_seq);
654  }
655  }
656  return;
657  }
658 
// Whether an object with this name has been recorded by an earlier call.
659  bool already_seen = previously_seen[kPreviouslySeenObjects].Exist(object_name);
660  for (auto& f : desc->GetUser().GetData()) {
661  if (f->IsSetLabel() && f->GetLabel().IsStr() && f->IsSetData()) {
662  string field_name = field_prefix + f->GetLabel().GetStr();
663  // add missing field to all previous objects that do not have this field
664  if (already_seen && !collector.Exist(field_name)) {
665  for (auto& ro : previously_seen[kPreviouslySeenObjects][object_name].GetObjects()) {
666  string missing_label = "[n] " + object_name + "[s] [is] missing field " + field_name;
667  //CRef<CDiscrepancyObject> seq_disc_obj(dynamic_cast<CDiscrepancyObject*>(ro->GetNCPointer()));
668  //collector[field_name][missing_label].Add(*seq_disc_obj, false);
669  //collector[field_name][missing_label].Add(*seq_disc_obj);
670  collector[field_name][missing_label].Add(*ro);
671  }
672  }
// Record the field's value, and remember the field both for this object
// (kPreviouslySeenFieldsThis) and for the object kind (kPreviouslySeenFields).
673  collector[field_name]["[n] " + object_name + "[s] [has] field " + field_name + " value '" + GetFieldValueAsString(*f) + "'"].Add(*context.SeqdescObjRef(*desc), false);
674  previously_seen[kPreviouslySeenFieldsThis][f->GetLabel().GetStr()].Add(*context.SeqdescObjRef(*desc), false);
675  previously_seen[kPreviouslySeenFields][object_name][f->GetLabel().GetStr()].Add(*context.SeqdescObjRef(*desc), false);
676  }
677  }
678 
679  // add missing for all previously seen fields not on this object
680  for (auto& z : previously_seen[kPreviouslySeenFields][object_name].GetMap()) {
681  if (!previously_seen[kPreviouslySeenFieldsThis].Exist(z.first)) {
682  collector[field_prefix + z.first][" [n] " + object_name + "[s] [is] missing field " + field_prefix + z.first].Add(*context.SeqdescObjRef(*desc));
683  }
684  }
685 
686  // maintain object list for missing fields
687  //CRef<CDiscrepancyObject> this_disc_obj(context.NewSeqdescObj(d, context.GetCurrentBioseqLabel(), eKeepRef));
688  CRef<CDiscrepancyObject> this_disc_obj(context.SeqdescObjRef(*desc));
689  //previously_seen[kPreviouslySeenObjects][object_name].Add(*this_disc_obj, false);
690  previously_seen[kPreviouslySeenObjects][object_name].Add(*this_disc_obj);
// The per-object field list is reset for the next call.
691  previously_seen[kPreviouslySeenFieldsThis].clear();
692 }
693 
694 
// INCONSISTENT_DBLINK collector: for nucleotide bioseqs, feeds the fields of
// matching user-object descriptors to AddUserObjectFieldItems; when no
// descriptor matched, records the bioseq as missing the DBLink object.
// NOTE(review): listing line 704 is missing from this extract; it presumably
// holds the condition selecting DBLink user objects and opens the scope
// closed on listing line 707 -- confirm against the original file.
695 DISCREPANCY_CASE(INCONSISTENT_DBLINK, SEQUENCE, eDisc | eSubmitter | eSmart | eBig, "Inconsistent DBLink fields")
696 {
697  const CBioseq& bioseq = context.CurrentBioseq();
698  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
699  bool found = false;
700  auto rep_seq = context.BioseqObjRef();
701  for (auto& desc : context.GetAllSeqdesc()) {
702  if (desc.IsUser()) {
703  const CUser_object& user = desc.GetUser();
705  found = true;
706  AddUserObjectFieldItems(&desc, *rep_seq, m_Objs[kDBLinkCollect], m_Objs[kDBLinkObjectList], context, "DBLink object");
707  }
708  }
709  }
// Nothing matched: report the bioseq itself, and let the helper mark every
// previously seen field as missing on it (null descriptor).
710  if (!found) {
711  m_Objs[kMissingDBLink].Add(*rep_seq);
712  AddUserObjectFieldItems(nullptr, *rep_seq, m_Objs[kDBLinkCollect], m_Objs[kDBLinkObjectList], context, "DBLink object");
713  }
714  }
715 }
716 
717 
718 void AnalyzeField(CReportNode& node, bool& all_present, bool& all_same)
719 {
720  all_present = true;
721  all_same = true;
722  size_t num_values = 0;
723  string value;
724  bool first = true;
725  for (auto& s : node.GetMap()) {
726  if (NStr::Find(s.first, " missing field ") != NPOS) {
727  all_present = false;
728  } else {
729  SIZE_TYPE pos = NStr::Find(s.first, " value '");
730  if (pos != NPOS) {
731  if (first) {
732  value = s.first.substr(pos);
733  num_values++;
734  first = false;
735  } else if (!NStr::Equal(s.first.substr(pos), value)) {
736  num_values++;
737  }
738  }
739  }
740  if (num_values > 1) {
741  all_same = false;
742  if (!all_present) {
743  // have all the info we need
744  break;
745  }
746  }
747  }
748 }
749 
750 
751 void AnalyzeFieldReport(CReportNode& node, bool& all_present, bool& all_same)
752 {
753  all_present = true;
754  all_same = true;
755  for (auto& s : node.GetMap()) {
756  bool this_present = true;
757  bool this_same = true;
758  AnalyzeField(*s.second, this_present, this_same);
759  all_present &= this_present;
760  all_same &= this_same;
761  if (!all_present && !all_same) {
762  break;
763  }
764  }
765 }
766 
767 
/// Builds the parenthesised suffix used in report titles, e.g.
/// "(all present, all same)" or "(some missing, inconsistent)".
string GetSummaryLabel(bool all_present, bool all_same)
{
    const char* presence = all_present ? "all present" : "some missing";
    const char* agreement = all_same ? "all same" : "inconsistent";

    string summary("(");
    summary.append(presence);
    summary.append(", ");
    summary.append(agreement);
    summary.append(")");
    return summary;
}
785 
786 
787 void CopyNode(CReportNode& new_home, CReportNode& original)
788 {
789  for (auto& s : original.GetMap()){
790  for (auto q : s.second->GetObjects()) {
791  new_home[s.first].Add(*q);
792  }
793  }
794  for (auto& q : original.GetObjects()) {
795  new_home.Add(*q);
796  }
797 }
798 
799 
800 string AdjustDBLinkFieldName(const string& orig_field_name)
801 {
802  if (NStr::Equal(orig_field_name, "BioSample")) {
803  return " " + orig_field_name;
804  } else if (NStr::Equal(orig_field_name, "ProbeDB")) {
805  return " " + orig_field_name;
806  } else if (NStr::Equal(orig_field_name, "Sequence Read Archive")) {
807  return " " + orig_field_name;
808  } else if (NStr::Equal(orig_field_name, "BioProject")) {
809  return " " + orig_field_name;
810  } else if (NStr::Equal(orig_field_name, "Assembly")) {
811  return " " + orig_field_name;
812  } else {
813  return orig_field_name;
814  }
815 }
816 
817 
818 DISCREPANCY_SUMMARIZE(INCONSISTENT_DBLINK)
819 {
820  m_Objs.GetMap().erase(kDBLinkObjectList);
821  m_Objs.GetMap().erase(kDBLinkFieldCountTop);
822  if (m_Objs.empty()) {
823  return;
824  }
825 
826  // add top-level category, rename field values
827  bool all_present = true;
828  bool all_same = true;
829  AnalyzeFieldReport(m_Objs[kDBLinkCollect], all_present, all_same);
830  if (all_present && all_same) {
831  m_Objs.clear();
832  return;
833  }
834  string top_label = "DBLink Report " + GetSummaryLabel(all_present, all_same);
835 
836  CReportNode::TNodeMap::iterator it = m_Objs.GetMap().begin();
837  while (it != m_Objs.GetMap().end()) {
838  if (!NStr::Equal(it->first, top_label)
839  && !NStr::Equal(it->first, kDBLinkCollect)) {
840  CopyNode(m_Objs[top_label][" " + it->first], *it->second);
841  it = m_Objs.GetMap().erase(it);
842  } else {
843  ++it;
844  }
845  }
846 
847  for (auto& it2 : m_Objs[kDBLinkCollect].GetMap()) {
848  bool this_present = true;
849  bool this_same = true;
850  AnalyzeField(*it2.second, this_present, this_same);
851  string new_label = AdjustDBLinkFieldName(it2.first) + " " + GetSummaryLabel(this_present, this_same);
852  for (auto& s : it2.second->GetMap()){
853  for (auto& q : s.second->GetObjects()) {
854  m_Objs[top_label][new_label][s.first].Add(*q);
855  }
856  }
857  }
858  m_Objs.GetMap().erase(kDBLinkCollect);
859 
860  xSummarize();
861 }
862 
863 
864 // INCONSISTENT_STRUCTURED_COMMENTS
// Names of the bookkeeping nodes kept in m_Objs by the
// INCONSISTENT_STRUCTURED_COMMENTS test, and the prefix prepended to field
// names in its report.  All nodes except kStructuredCommentReport are erased
// at the start of the summarize step; kStructuredCommentReport is consumed
// and erased at its end.
865 const string kStructuredCommentsSeqs = "sequences";
866 const string kStructuredCommentObservedPrefixes = "observed prefixes";
867 const string kStructuredCommentObservedPrefixesThis = "observed prefixes this";
868 const string kStructuredCommentReport = "collection";
869 const string kStructuredCommentPrevious = "previous";
870 const string kStructuredCommentFieldPrefix = "structured comment field ";
871 
// INCONSISTENT_STRUCTURED_COMMENTS collector: for nucleotide bioseqs, records
// the fields of user-object descriptors per structured-comment prefix, and
// tracks which prefixes are present on this sequence versus earlier ones.
// NOTE(review): listing lines 880 and 907 are missing from this extract.
// Line 880 presumably holds the condition selecting structured-comment user
// objects and opens the scope closed on listing line 887; the content of
// line 907 cannot be inferred from here.
872 DISCREPANCY_CASE(INCONSISTENT_STRUCTURED_COMMENTS, SEQUENCE, eDisc | eSubmitter | eSmart | eBig, "Inconsistent structured comments")
873 {
874  const CBioseq& bioseq = context.CurrentBioseq();
875  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
876  auto rep_seq = context.BioseqObjRef();
877  for (auto& desc : context.GetAllSeqdesc()) {
878  if (desc.IsUser()) {
879  const CUser_object& user = desc.GetUser();
// Comments without a prefix are grouped under "unnamed".
881  string prefix = CComment_rule::GetStructuredCommentPrefix(user);
882  if (NStr::IsBlank(prefix)) {
883  prefix = "unnamed";
884  }
885  m_Objs[kStructuredCommentObservedPrefixesThis][prefix].Add(*context.SeqdescObjRef(desc));
886  AddUserObjectFieldItems(&desc, *rep_seq, m_Objs[kStructuredCommentReport], m_Objs[kStructuredCommentPrevious], context, prefix + " structured comment", kStructuredCommentFieldPrefix);
887  }
888  }
889  }
890  //report prefixes seen previously, not found on this sequence
891  for (auto& it : m_Objs[kStructuredCommentObservedPrefixes].GetMap()) {
892  if (!m_Objs[kStructuredCommentObservedPrefixesThis].Exist(it.first)) {
893  m_Objs["[n] Bioseq[s] [is] missing " + it.first + " structured comment"].Add(*rep_seq);
894  AddUserObjectFieldItems(nullptr, *rep_seq, m_Objs[kStructuredCommentReport], m_Objs[kStructuredCommentPrevious], context, it.first + " structured comment", kStructuredCommentFieldPrefix);
895  }
896  }
897  // report prefixes found on this sequence but not on previous sequences
898  for (auto& it : m_Objs[kStructuredCommentObservedPrefixesThis].GetMap()) {
899  if (!m_Objs[kStructuredCommentObservedPrefixes].Exist(it.first)) {
// Every sequence recorded earlier is retroactively marked as missing it.
900  for (auto ro : m_Objs[kStructuredCommentsSeqs].GetObjects()) {
901  m_Objs["[n] Bioseq[s] [is] missing " + it.first + " structured comment"].Add(*ro);
902  AddUserObjectFieldItems(nullptr, *ro, m_Objs[kStructuredCommentReport], m_Objs[kStructuredCommentPrevious], context, it.first + " structured comment", kStructuredCommentFieldPrefix);
903  }
904  }
905  m_Objs[kStructuredCommentObservedPrefixes][it.first].Add(*context.BioseqObjRef());
906  }
// Remember this sequence for comparisons made on later sequences.
908  m_Objs[kStructuredCommentsSeqs].Add(*context.BioseqObjRef());
909  }
910 }
911 
912 
913 DISCREPANCY_SUMMARIZE(INCONSISTENT_STRUCTURED_COMMENTS)
914 {
915  m_Objs.GetMap().erase(kStructuredCommentObservedPrefixesThis);
916  m_Objs.GetMap().erase(kStructuredCommentsSeqs);
917  m_Objs.GetMap().erase(kStructuredCommentObservedPrefixes);
918  m_Objs.GetMap().erase(kStructuredCommentPrevious);
919 
920  m_Objs[kStructuredCommentReport].GetMap().erase(kStructuredCommentFieldPrefix + "StructuredCommentPrefix");
921  m_Objs[kStructuredCommentReport].GetMap().erase(kStructuredCommentFieldPrefix + "StructuredCommentSuffix");
922 
923  if (m_Objs.empty()) {
924  return;
925  }
926 
927  // add top-level category, rename field values
928  bool all_present = true;
929  bool all_same = true;
930  AnalyzeFieldReport(m_Objs[kStructuredCommentReport], all_present, all_same);
931  if (all_present && all_same) {
932  return;
933  }
934 
935  string top_label = "Structured Comment Report " + GetSummaryLabel(all_present, all_same);
936 
937  CReportNode::TNodeMap::iterator it = m_Objs.GetMap().begin();
938  while (it != m_Objs.GetMap().end()) {
939  if (!NStr::Equal(it->first, top_label)
940  && !NStr::Equal(it->first, kStructuredCommentReport)) {
941  CopyNode(m_Objs[top_label][" " + it->first], *it->second);
942  it = m_Objs.GetMap().erase(it);
943  } else {
944  ++it;
945  }
946  }
947 
948  for (auto& it2 : m_Objs[kStructuredCommentReport].GetMap()) {
949  bool this_present = true;
950  bool this_same = true;
951  AnalyzeField(*it2.second, this_present, this_same);
952  string new_label = it2.first + " " + GetSummaryLabel(this_present, this_same);
953  for (auto& s : it2.second->GetMap()) {
954  string sub_label = s.first;
955  if (this_present && this_same) {
956  NStr::ReplaceInPlace(sub_label, "[n]", "All");
957  }
958  for (auto& q : s.second->GetObjects()) {
959  m_Objs[top_label][new_label][sub_label].Add(*q);
960  }
961  }
962  }
963  m_Objs.GetMap().erase(kStructuredCommentReport);
964 
965  xSummarize();
966 }
967 
968 
969 // MISSING_STRUCTURED_COMMENT
970 
// MISSING_STRUCTURED_COMMENT: reports nucleotide bioseqs for which the
// descriptor scan below does not return early.
// NOTE(review): listing line 978 is missing from this extract; it presumably
// holds the condition recognising a structured-comment user object and opens
// the scope closed on listing line 980 -- confirm against the original file.
971 DISCREPANCY_CASE(MISSING_STRUCTURED_COMMENT, SEQUENCE, eDisc | eTSA, "Structured comment not included")
972 {
973  const CBioseq& bioseq = context.CurrentBioseq();
974  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
975  for (auto& desc : context.GetAllSeqdesc()) {
976  if (desc.IsUser()) {
977  const CUser_object& user = desc.GetUser();
979  return;
980  }
981  }
982  }
// Reached only when the scan above did not return.
983  m_Objs["[n] sequence[s] [does] not include structured comments."].Add(*context.BioseqObjRef());
984  }
985 }
986 
987 
988 // MISSING_PROJECT
989 
// MISSING_PROJECT: for a Bioseq with an Inst, scans every descriptor returned
// by context.GetAllSeqdesc(). The visit ends without a report when a
// User-object either carries a field labeled "BioProject" or has the type
// string "GenomeProjectsDB"; otherwise the Bioseq is added to the
// "does not include project" report node.
// NOTE(review): listing line 997 (between 996 and 998) is absent from this
// extract -- presumably the `if` on the user-object type that pairs with the
// `else if` on line 1006; restore it from the upstream file.
990 DISCREPANCY_CASE(MISSING_PROJECT, SEQUENCE, eDisc | eTSA, "Project not included")
991 {
992  const CBioseq& bioseq = context.CurrentBioseq();
993  if (bioseq.CanGetInst()) {
994  for (auto& desc : context.GetAllSeqdesc()) {
995  if (desc.IsUser()) {
996  const CUser_object& user = desc.GetUser();
998  if (user.IsSetData()) {
999  for (auto& it : user.GetData()) {
// A string-labeled "BioProject" field counts as project information.
1000  if (it->IsSetLabel() && it->GetLabel().IsStr() && NStr::Equal(it->GetLabel().GetStr(), "BioProject")) {
1001  return;
1002  }
1003  }
1004  }
1005  }
// A "GenomeProjectsDB" user object also counts as project information.
1006  else if (user.IsSetType() && user.GetType().IsStr() && NStr::Equal(user.GetType().GetStr(), "GenomeProjectsDB")) {
1007  return;
1008  }
1009  }
1010  }
1011  m_Objs["[n] sequence[s] [does] not include project."].Add(*context.BioseqObjRef());
1012  }
1013 }
1014 
1015 
1016 // COUNT_UNVERIFIED
1017 
// COUNT_UNVERIFIED: for a Bioseq with an Inst, scans the descriptors returned
// by context.GetAllSeqdesc() and records the Bioseq under the "unverified"
// report node, stopping at the first descriptor that qualifies. Add() is
// called with `false` as its second argument.
// NOTE(review): listing line 1025 (between 1024 and 1026) is absent from this
// extract -- presumably the condition on `user` that identifies an
// "unverified" user object; restore it from the upstream file.
1018 DISCREPANCY_CASE(COUNT_UNVERIFIED, SEQUENCE, eOncaller, "Count number of unverified sequences")
1019 {
1020  const CBioseq& bioseq = context.CurrentBioseq();
1021  if (bioseq.CanGetInst()) {
1022  for (auto& desc : context.GetAllSeqdesc()) {
1023  if (desc.IsUser()) {
1024  const CUser_object& user = desc.GetUser();
1026  m_Objs["[n] sequence[s] [is] unverified"].Add(*context.BioseqObjRef(), false);
1027  return;
1028  }
1029  }
1030  }
1031  }
1032 }
1033 
1034 
1035 // DEFLINE_PRESENT
1036 
1037 const string kDeflineExists = "[n] Bioseq[s] [has] definition line";
1038 
1039 DISCREPANCY_CASE(DEFLINE_PRESENT, SEQUENCE, eDisc, "Test defline existence")
1040 {
1041  const CBioseq& bioseq = context.CurrentBioseq();
1042  if (bioseq.CanGetInst() && !bioseq.GetInst().IsAa() && context.GetTitle()) {
1043  m_Objs[kDeflineExists].Add(*context.BioseqObjRef());
1044  }
1045 }
1046 
1047 
1048 // UNUSUAL_NT
1049 
1050 DISCREPANCY_CASE(UNUSUAL_NT, SEQUENCE, eDisc | eSubmitter | eSmart, "Sequence contains unusual nucleotides")
1051 {
1052  const CBioseq& bioseq = context.CurrentBioseq();
1053  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
1054  const CSeqSummary& sum = context.GetSeqSummary();
1055  if (sum.Other) {
1056  m_Objs["[n] sequence[s] contain[S] nucleotides that are not ATCG or N"].Add(*context.BioseqObjRef());
1057  }
1058  }
1059 }
1060 
1061 
1062 // TAXNAME_NOT_IN_DEFLINE
1063 
1064 const string kNoTaxnameInDefline = "[n] defline[s] [does] not contain the complete taxname";
1065 
1066 DISCREPANCY_CASE(TAXNAME_NOT_IN_DEFLINE, SEQUENCE, eDisc | eOncaller, "Complete taxname should be present in definition line")
1067 {
1068  const CBioseq& bioseq = context.CurrentBioseq();
1069  if (bioseq.CanGetInst() && !bioseq.GetInst().IsAa()) {
1070  const CSeqdesc* source = context.GetBiosource();
1071  const CSeqdesc* title = context.GetTitle();
1072  if (source && source->IsSource() && source->GetSource().IsSetOrg() && source->GetSource().GetOrg().IsSetTaxname() && title) {
1073  string taxname = source->GetSource().GetOrg().GetTaxname();
1074  if (NStr::EqualNocase(taxname, "Human immunodeficiency virus 1")) {
1075  taxname = "HIV-1";
1076  }
1077  else if (NStr::EqualNocase(taxname, "Human immunodeficiency virus 2")) {
1078  taxname = "HIV-2";
1079  }
1080  bool no_taxname_in_defline = false;
1081  SIZE_TYPE taxname_pos = NStr::FindNoCase(title->GetTitle(), taxname);
1082  if (taxname_pos == NPOS) {
1083  no_taxname_in_defline = true;
1084  }
1085  else {
1086  //capitalization must match for all but the first letter
1087  no_taxname_in_defline = NStr::CompareCase(title->GetTitle().c_str() + taxname_pos, 1, taxname.size() - 1, taxname.c_str() + 1) != 0;
1088  if (taxname_pos > 0 && !isspace(title->GetTitle()[taxname_pos - 1]) && !ispunct(title->GetTitle()[taxname_pos - 1])) {
1089  no_taxname_in_defline = true;
1090  }
1091  }
1092  if (no_taxname_in_defline) {
1093  m_Objs[kNoTaxnameInDefline].Add(*context.SeqdescObjRef(*title));
1094  }
1095  }
1096  }
1097 }
1098 
1099 
1100 // HAS_PROJECT_ID
1101 
1102 static string GetProjectID(const CUser_object& user)
1103 {
1104  string res;
1105  if (user.IsSetData()) {
1106  for (auto field: user.GetData()) {
1107  if (field->IsSetData() && field->GetData().IsInt() && field->IsSetLabel() && field->GetLabel().IsStr() && field->GetLabel().GetStr() == "ProjectID") {
1108  return NStr::IntToString(field->GetData().GetInt());
1109  }
1110  }
1111  }
1112  return res;
1113 }
1114 
1115 
1116 DISCREPANCY_CASE(HAS_PROJECT_ID, SEQUENCE, eOncaller | eSmart, "Sequences with project IDs (looks for genome project IDs)")
1117 {
1118  const CBioseq& bioseq = context.CurrentBioseq();
1119  if (bioseq.CanGetInst()) {
1120  for (auto& desc : context.GetAllSeqdesc()) {
1121  if (desc.IsUser()) {
1122  const CUser_object& user = desc.GetUser();
1123  if (user.IsSetType() && user.GetType().IsStr() && user.GetType().GetStr() == "GenomeProjectsDB") {
1124  string proj_id = GetProjectID(user);
1125  if (!proj_id.empty()) {
1126  m_Objs[proj_id][bioseq.IsNa() ? "N" : "A"].Add(*context.BioseqObjRef());
1127  }
1128  }
1129  }
1130  }
1131  }
1132 }
1133 
1134 
1135 DISCREPANCY_SUMMARIZE(HAS_PROJECT_ID)
1136 {
1137  if (m_Objs.empty()) {
1138  return;
1139  }
1140  CReportNode res;
1141  string all = "[n] sequence[s] [has] project IDs ";
1142  string prots = "[n] protein sequence[s] [has] project IDs ";
1143  string nucs = "[n] nucleotide sequence[s] [has] project IDs ";
1144  auto& projects = m_Objs.GetMap();
1145  all += projects.size() > 1 ? "(some different)" : "(all same)";
1146  size_t count_prots = 0;
1147  size_t count_nucs = 0;
1148  for (auto it: projects) {
1149  auto& M = it.second->GetMap();
1150  if (M.find("A") != M.end()) {
1151  count_prots++;
1152  }
1153  if (M.find("N") != M.end()) {
1154  count_nucs++;
1155  }
1156  }
1157  prots += count_prots > 1 ? "(some different)" : "(all same)";
1158  nucs += count_nucs > 1 ? "(some different)" : "(all same)";
1159  for (auto it : projects) {
1160  auto& M = it.second->GetMap();
1161  if (M.find("A") != M.end()) {
1162  for (auto obj : M["A"]->GetObjects()) {
1163  res[all][prots].Add(*obj);
1164  }
1165  }
1166  if (M.find("N") != M.end()) {
1167  for (auto obj : M["N"]->GetObjects()) {
1168  res[all][nucs].Add(*obj);
1169  }
1170  }
1171  }
1172 
1173  m_ReportItems = res.Export(*this)->GetSubitems();
1174 }
1175 
1176 
1177 // MULTIPLE_CDS_ON_MRNA
1178 
1179 DISCREPANCY_CASE(MULTIPLE_CDS_ON_MRNA, SEQUENCE, eOncaller | eSubmitter | eSmart, "Multiple CDS on mRNA")
1180 {
1181  const CSeqdesc* molinfo = context.GetMolinfo();
1182  if (molinfo && molinfo->GetMolinfo().IsSetBiomol() && molinfo->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_mRNA) {
1183  auto& cds = context.FeatCDS();
1184  if (cds.size() < 2) {
1185  return;
1186  }
1187  size_t count_pseudo = 0;
1188  size_t count_disrupt = 0;
1189  for (auto feat : cds) {
1190  if (feat->IsSetComment() && NStr::Find(feat->GetComment(), "coding region disrupted by sequencing gap") != NPOS) {
1191  count_disrupt++;
1192  }
1193  if (context.IsPseudo(*feat)) {
1194  count_pseudo++;
1195  }
1196  }
1197  if (count_disrupt != cds.size() && count_pseudo != cds.size()) {
1198  m_Objs["[n] mRNA bioseq[s] [has] multiple CDS features"].Add(*context.BioseqObjRef());
1199  }
1200  }
1201 }
1202 
1203 
1204 // MRNA_SEQUENCE_MINUS_STRAND_FEATURES
1205 
// Report label used for mRNA sequences with features on the complement strand.
1206 static const string kMrnaSequenceMinusStrandFeatures = "[n] mRNA sequences have features on the complement strand.";
1207 
// MRNA_SEQUENCE_MINUS_STRAND_FEATURES: on a Bioseq whose MolInfo biomol is
// mRNA, counts the features returned by context.FeatCDS() by strand. A
// feature counts as "plus" when its location strand is not minus or its
// subtype is primer_bind; everything else counts as "minus". Nothing is
// reported unless at least one feature counted as minus.
// NOTE(review): listing line 1225 (the body of the `!count_plus` branch) is
// absent from this extract -- presumably it adds the Bioseq to the same report
// node in a variant that the autofix can act on; restore it from upstream.
1208 DISCREPANCY_CASE(MRNA_SEQUENCE_MINUS_STRAND_FEATURES, SEQUENCE, eOncaller, "mRNA sequences have CDS/gene on the complement strand")
1209 {
1210  const CSeqdesc* molinfo = context.GetMolinfo();
1211  if (molinfo && molinfo->GetMolinfo().IsSetBiomol() && molinfo->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_mRNA) {
1212  auto& cds = context.FeatCDS();
1213  size_t count_plus = 0;
1214  size_t count_minus = 0;
1215  for (auto& feat : cds) {
1216  if (feat->GetLocation().GetStrand() != eNa_strand_minus || feat->GetData().GetSubtype() == CSeqFeatData::eSubtype_primer_bind) {
1217  count_plus++;
1218  }
1219  else {
1220  count_minus++;
1221  }
1222  }
1223  if (count_minus) {
// All counted features are on the minus strand.
1224  if (!count_plus) {
1226  }
// Mixed strands: reported through the plain BioseqObjRef().
1227  else {
1228  m_Objs[kMrnaSequenceMinusStrandFeatures].Add(*context.BioseqObjRef());
1229  }
1230  }
1231  }
1232 }
1233 
1234 
1235 DISCREPANCY_AUTOFIX(MRNA_SEQUENCE_MINUS_STRAND_FEATURES)
1236 {
1237  unsigned int n = 0;
1238  const CBioseq* seq = dynamic_cast<const CBioseq*>(context.FindObject(*obj));
1239  CBioseq_EditHandle bioseq(context.GetBioseqHandle(*seq));
1240  vector<CSeq_feat*> features;
1241  CFeat_CI feat_ci(bioseq, CSeqFeatData::e_Cdregion);
1242  for (; feat_ci; ++feat_ci) {
1243  features.push_back(const_cast<CSeq_feat*>(&*feat_ci->GetSeq_feat()));
1244  }
1245 
1246  CRef<objects::CSeq_inst> new_inst(new objects::CSeq_inst());
1247  new_inst->Assign(bioseq.GetInst());
1248  ReverseComplement(*new_inst, &context.GetScope());
1249  bioseq.SetInst(*new_inst);
1250 
1251  for (auto& feat : features) {
1252  edit::ReverseComplementFeature(*feat, context.GetScope());
1253  ++n;
1254  }
1255  obj->SetFixed();
1256  return CRef<CAutofixReport>(n ? new CAutofixReport("MRNA_SEQUENCE_MINUS_STRAND_FEATURES: [n] sequence[s] [is] converted to reverse complement[s]", n) : nullptr);
1257 }
1258 
1259 
1260 // LOW_QUALITY_REGION
1261 
1262 DISCREPANCY_CASE(LOW_QUALITY_REGION, SEQUENCE, eDisc | eSubmitter | eSmart, "Sequence contains regions of low quality")
1263 {
1264  const size_t MAX_N_IN_SEQ = 7; // 25% of the sequence
1265  const CBioseq& bioseq = context.CurrentBioseq();
1266  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
1267  const CSeqSummary& sum = context.GetSeqSummary();
1268  if (sum.MinQ > MAX_N_IN_SEQ) {
1269  m_Objs["[n] sequence[s] contain[S] low quality region"].Add(*context.BioseqObjRef());
1270  }
1271  }
1272 }
1273 
1274 
1275 // DEFLINE_ON_SET
1276 
1277 DISCREPANCY_CASE(DEFLINE_ON_SET, SEQ_SET, eOncaller, "Titles on sets")
1278 {
1279  const CBioseq_set& set = context.CurrentBioseq_set();
1280  if (set.IsSetDescr()) {
1281  for (const auto& descr : set.GetDescr().Get()) {
1282  if (descr->IsTitle()) {
1283  m_Objs["[n] title[s] on sets were found"].Add(*context.SeqdescObjRef(*descr));
1284  }
1285  }
1286  }
1287 }
1288 
1289 
1290 DISCREPANCY_SUMMARIZE(DEFLINE_ON_SET)
1291 {
1292  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1293 }
1294 
1295 
1296 // MITOCHONDRION_REQUIRED
1297 
1298 DISCREPANCY_CASE(MITOCHONDRION_REQUIRED, SEQUENCE, eDisc | eOncaller, "If D-loop or control region misc_feat is present, source must be mitochondrial")
1299 {
1300  const CSeqdesc* biosrc = context.GetBiosource();
1301  if (!biosrc || biosrc->GetSource().GetGenome() != CBioSource::eGenome_mitochondrion) {
1302  auto& all = context.FeatAll();
1303  bool has_D_loop = false;
1304  bool has_misc_feat_with_control_region = false;
1305  for (auto& feat : all) {
1306  if (feat->IsSetData()) {
1307  if (feat->GetData().GetSubtype() == CSeqFeatData::eSubtype_D_loop) {
1308  has_D_loop = true;
1309  break;
1310  }
1311  else if (feat->GetData().GetSubtype() == CSeqFeatData::eSubtype_misc_feature) {
1312  if (feat->IsSetComment() && NStr::FindNoCase(feat->GetComment(), "control region") != NPOS) {
1313  has_misc_feat_with_control_region = true;
1314  break;
1315  }
1316  }
1317  }
1318  }
1319  if (has_D_loop || has_misc_feat_with_control_region) {
1320  m_Objs["[n] bioseq[s] [has] D-loop or control region misc_feature, but [is] do not have mitochondrial source"].Add(*context.BioseqObjRef(CDiscrepancyContext::eFixSet));
1321  }
1322  }
1323 }
1324 
1325 
1326 DISCREPANCY_SUMMARIZE(MITOCHONDRION_REQUIRED)
1327 {
1328  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1329 }
1330 
1331 
// Disabled (#if 0) helper: looks up the first source descriptor of `bioseq`
// in `scope` and returns true when one exists, false otherwise.
// NOTE(review): listing line 1339 (between 1338 and 1340) is absent from this
// extract -- presumably the statement that modifies `edit_biosrc`; the code is
// compiled out, so this only matters if it is ever re-enabled.
1332 #if 0
1333 static bool FixGenome(const CBioseq& bioseq, CScope& scope)
1334 {
1335  CBioseq_Handle seq_h = scope.GetBioseqHandle(bioseq);
1336  CSeqdesc_CI biosrc(seq_h, CSeqdesc::e_Source);
1337  if (biosrc) {
// const_cast: the descriptor iterator only hands out const access.
1338  CSeqdesc& edit_biosrc = const_cast<CSeqdesc&>(*biosrc);
1340  return true;
1341  }
1342 
1343  return false;
1344 }
1345 #endif
1346 
1347 
// Autofix for MITOCHONDRION_REQUIRED: resolves the reported object to a
// Bioseq, looks up its first source descriptor and, when one exists, marks the
// object fixed and returns a one-item report; returns a null report otherwise.
// NOTE(review): listing line 1355 (between 1354 and 1356) is absent from this
// extract -- presumably the statement that sets the genome of `edit_biosrc`
// to mitochondrion, as the report text says; restore it from upstream.
1348 DISCREPANCY_AUTOFIX(MITOCHONDRION_REQUIRED)
1349 {
1350  const CBioseq* seq = dynamic_cast<const CBioseq*>(context.FindObject(*obj));
1351  CBioseq_EditHandle seq_h = context.GetBioseqHandle(*seq);
1352  CSeqdesc_CI biosrc(seq_h, CSeqdesc::e_Source);
1353  if (biosrc) {
// const_cast: the descriptor iterator only hands out const access.
1354  CSeqdesc& edit_biosrc = const_cast<CSeqdesc&>(*biosrc);
1356  obj->SetFixed();
1357  return CRef<CAutofixReport>(new CAutofixReport("MITOCHONDRION_REQUIRED: Genome was set to mitochondrion for [n] bioseq[s]", 1));
1358  }
1359  return CRef<CAutofixReport>();
1360 }
1361 
1362 
1363 
1364 // SEQ_SHORTER_THAN_50bp
1365 
1366 static bool IsMolProd(int biomol) { return biomol == CMolInfo::eBiomol_mRNA || biomol == CMolInfo::eBiomol_ncRNA || biomol == CMolInfo::eBiomol_rRNA || biomol == CMolInfo::eBiomol_pre_RNA || biomol == CMolInfo::eBiomol_tRNA; }
1367 
1368 DISCREPANCY_CASE(SEQ_SHORTER_THAN_50bp, SEQUENCE, eDisc | eSubmitter | eSmart | eBig, "Find Short Sequences")
1369 {
1370  const CBioseq& bioseq = context.CurrentBioseq();
1371  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa() && bioseq.IsSetLength() && bioseq.GetLength() < 50) {
1372  if (context.InGenProdSet() && bioseq.IsSetInst() && bioseq.GetInst().IsSetMol() && bioseq.GetInst().GetMol() == objects::CSeq_inst::eMol_rna) {
1373  const CSeqdesc* molinfo = context.GetMolinfo();
1374  if (molinfo && molinfo->GetMolinfo().IsSetBiomol() && IsMolProd(molinfo->GetMolinfo().GetBiomol())) {
1375  return;
1376  }
1377  }
1378  m_Objs["[n] sequence[s] [is] shorter than 50 nt"].Add(*context.BioseqObjRef());
1379  }
1380 }
1381 
1382 
1383 // SEQ_SHORTER_THAN_200bp
1384 
// SEQ_SHORTER_THAN_200bp: reports nucleotide Bioseqs shorter than 200 nt,
// except RNA Bioseqs inside a gen-prod set whose MolInfo biomol satisfies
// IsMolProd(). The object reference is created with a `fix` argument.
// NOTE(review): listing lines 1395 and 1399 are absent from this extract --
// presumably 1395 declares `fix` and 1399 changes it when the Bioseq carries a
// feature-table annotation; restore both from the upstream file.
1385 DISCREPANCY_CASE(SEQ_SHORTER_THAN_200bp, SEQUENCE, eDisc | eSubmitter | eSmart | eBig | eTSA, "Short Contig")
1386 {
1387  const CBioseq& bioseq = context.CurrentBioseq();
1388  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa() && bioseq.IsSetLength() && bioseq.GetLength() < 200) {
1389  if (context.InGenProdSet() && bioseq.IsSetInst() && bioseq.GetInst().IsSetMol() && bioseq.GetInst().GetMol() == objects::CSeq_inst::eMol_rna) {
1390  const CSeqdesc* molinfo = context.GetMolinfo();
1391  if (molinfo && molinfo->GetMolinfo().IsSetBiomol() && IsMolProd(molinfo->GetMolinfo().GetBiomol())) {
1392  return;
1393  }
1394  }
// Inspect the annotations attached directly to the Bioseq.
1396  if (bioseq.IsSetAnnot()) {
1397  for (auto& annot_it : bioseq.GetAnnot()) {
1398  if (annot_it->IsFtable()) {
1400  }
1401  }
1402  }
1403  m_Objs["[n] contig[s] [is] shorter than 200 nt"].Add(*context.BioseqObjRef(fix));
1404  }
1405 }
1406 
1407 
1408 DISCREPANCY_SUMMARIZE(SEQ_SHORTER_THAN_200bp)
1409 {
1410  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1411 }
1412 
1413 
1414 DISCREPANCY_AUTOFIX(SEQ_SHORTER_THAN_200bp)
1415 {
1416  const CBioseq* seq = dynamic_cast<const CBioseq*>(context.FindObject(*obj));
1417  CBioseq_EditHandle bioseq_edit(context.GetBioseqHandle(*seq));
1418  bioseq_edit.Remove();
1419  obj->SetFixed();
1420  return CRef<CAutofixReport>(new CAutofixReport("SEQ_SHORTER_THAN_200bp: [n] short bioseq[s] [is] removed", 1));
1421 }
1422 
1423 
1424 // RNA_PROVIRAL
1425 
1426 DISCREPANCY_CASE(RNA_PROVIRAL, SEQUENCE, eOncaller, "RNA bioseqs are proviral")
1427 {
1428  const CBioseq& bioseq = context.CurrentBioseq();
1429  if (bioseq.CanGetInst() && bioseq.GetInst().IsSetMol() && bioseq.GetInst().GetMol() == CSeq_inst::eMol_rna) {
1430  const CSeqdesc* biosrc = context.GetBiosource();
1431  if (biosrc && biosrc->GetSource().IsSetOrg() && biosrc->GetSource().IsSetGenome() && biosrc->GetSource().GetGenome() == CBioSource::eGenome_proviral) {
1432  m_Objs["[n] RNA bioseq[s] [is] proviral"].Add(*context.BioseqObjRef());
1433  }
1434  }
1435 }
1436 
1437 
// Exports the collected nodes (Export's second argument is false) and keeps
// the sub-items of the result as the report items.
// NOTE(review): listing line 1438, the header that owns this body, is absent
// from this extract -- presumably DISCREPANCY_SUMMARIZE(RNA_PROVIRAL), given
// the preceding test case; restore it from the upstream file.
1439 {
1440  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1441 }
1442 
1443 
1444 // SMALL_GENOME_SET_PROBLEM
1445 
1446 typedef bool (CBioSource::*FnIsSet)() const;
1447 typedef const string& (CBioSource::*FnGet)() const;
1448 
1449 static bool CompareOrGetString(const CBioSource& bio_src, FnIsSet is_set_fn, FnGet get_fn, string& val)
1450 {
1451  bool ret = true;
1452  if ((bio_src.*is_set_fn)()) {
1453  if (val.empty()) {
1454  val = (bio_src.*get_fn)();
1455  }
1456  else if (val != (bio_src.*get_fn)()) {
1457  ret = false;
1458  }
1459  }
1460  return ret;
1461 }
1462 
1463 
1464 static bool CompareOrgModValue(const CBioSource& bio_src, COrgMod::TSubtype subtype, string& val)
1465 {
1466  bool ret = true;
1467  if (bio_src.IsSetOrgMod()) {
1468  for (auto& mod : bio_src.GetOrgname().GetMod()) {
1469  if (mod->IsSetSubtype() && mod->GetSubtype() == subtype && mod->IsSetSubname()) {
1470  if (val.empty()) {
1471  val = mod->GetSubname();
1472  }
1473  else {
1474  if (mod->GetSubname() != val) {
1475  ret = false;
1476  }
1477  }
1478  break;
1479  }
1480  }
1481  }
1482  return ret;
1483 }
1484 
1485 
1486 static bool IsSegmentSubtype(const CBioSource& bio_src)
1487 {
1488  bool ret = false;
1489  if (bio_src.IsSetSubtype()) {
1490  for (const auto& subtype : bio_src.GetSubtype()) {
1491  if (subtype->IsSetSubtype() && subtype->GetSubtype() == CSubSource::eSubtype_segment) {
1492  ret = true;
1493  break;
1494  }
1495  }
1496  }
1497  return ret;
1498 }
1499 
1500 
1501 DISCREPANCY_CASE(SMALL_GENOME_SET_PROBLEM, SEQ_SET, eOncaller, "Problems with small genome sets")
1502 {
1503  const CBioseq_set& set = context.CurrentBioseq_set();
1504  if (set.IsSetClass() && set.GetClass() == CBioseq_set::eClass_small_genome_set) {
1505  string taxname, isolate, strain;
1506  bool all_taxname_same = true, all_isolate_same = true, all_strain_same = true;
1507  for (auto& descr_bio_src : context.GetSetBiosources()) {
1508  const CBioSource& bio_src = descr_bio_src->GetSource();
1509  if (context.HasLineage(bio_src, "", "Viruses")) {
1510  if (!IsSegmentSubtype(bio_src)) {
1511  m_Objs["[n] biosource[s] should have segment qualifier but [does] not"].Add(*context.SeqdescObjRef(*descr_bio_src));
1512  }
1513  }
1514  if (all_taxname_same) {
1515  all_taxname_same = CompareOrGetString(bio_src, &CBioSource::IsSetTaxname, &CBioSource::GetTaxname, taxname);
1516  }
1517  if (all_isolate_same) {
1518  all_isolate_same = CompareOrgModValue(bio_src, COrgMod::eSubtype_isolate, isolate);
1519  }
1520  if (all_strain_same) {
1521  all_strain_same = CompareOrgModValue(bio_src, COrgMod::eSubtype_strain, strain);
1522  }
1523  }
1524  if (!all_taxname_same) {
1525  m_Objs["Not all biosources have same taxname"];
1526  }
1527  if (!all_isolate_same) {
1528  m_Objs["Not all biosources have same isolate"];
1529  }
1530  if (!all_strain_same) {
1531  m_Objs["Not all biosources have same strain"];
1532  }
1533  }
1534 }
1535 
1536 
1537 DISCREPANCY_SUMMARIZE(SMALL_GENOME_SET_PROBLEM)
1538 {
1539  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1540 }
1541 
1542 
1543 // UNWANTED_SET_WRAPPER
1544 
// Returns true when `feat` has a qualifier named "satellite" (compared
// case-insensitively) whose value starts with "microsatellite"
// (case-insensitively); returns false otherwise.
// NOTE(review): listing line 1547 is absent from this extract -- presumably an
// outer `if` on the feature (its brace is closed on line 1555); restore it
// from the upstream file.
1545 static bool IsMicroSatellite(const CSeq_feat& feat)
1546 {
1548  if (feat.IsSetQual()) {
1549  for (auto& qual : feat.GetQual()) {
1550  if (qual->IsSetQual() && qual->IsSetVal() && NStr::EqualNocase("satellite", qual->GetQual()) && NStr::StartsWith(qual->GetVal(), "microsatellite", NStr::eNocase)) {
1551  return true;
1552  }
1553  }
1554  }
1555  }
1556  return false;
1557 }
1558 
1559 
// UNWANTED_SET_WRAPPER: scans the source subtypes for "rearranged" and the
// features for microsatellites, propagating context flags along the way; when
// the current node is not a Bioseq and is an eco/mut/phy/pop set, it reads
// the flags back and may add the set to the "unwanted set wrapper" node.
// NOTE(review): listing lines 1566, 1576 and 1585 are absent from this
// extract -- presumably 1566 and 1576 are PropagateFlags() calls for the
// "rearranged" and "non-satellite feature" cases, and 1585 is the `if` on
// `flags` that guards the report; restore them from the upstream file.
1560 DISCREPANCY_CASE(UNWANTED_SET_WRAPPER, FEAT, eOncaller, "Set wrapper on microsatellites or rearranged genes")
1561 {
1562  const CSeqdesc* biosrc = context.GetBiosource();
1563  if (biosrc && biosrc->GetSource().IsSetSubtype()) {
1564  for (auto& subtype : biosrc->GetSource().GetSubtype()) {
1565  if (subtype->IsSetSubtype() && subtype->GetSubtype() == CSubSource::eSubtype_rearranged) {
1567  break;
1568  }
1569  }
1570  }
1571  for (auto& feat : context.GetFeat()) {
1572  if (IsMicroSatellite(feat)) {
1573  context.PropagateFlags(CDiscrepancyContext::eHasSatFeat);
1574  }
1575  else {
1577  }
1578  }
// Only set-level nodes are candidates for the report.
1579  if (!context.IsBioseq()) {
1580  const CBioseq_set& set = context.CurrentBioseq_set();
1581  if (set.IsSetClass()) {
1582  CBioseq_set::EClass bio_set_class = set.GetClass();
1583  if (bio_set_class == CBioseq_set::eClass_eco_set || bio_set_class == CBioseq_set::eClass_mut_set || bio_set_class == CBioseq_set::eClass_phy_set || bio_set_class == CBioseq_set::eClass_pop_set) {
1584  unsigned char flags = context.ReadFlags();
1586  m_Objs["[n] unwanted set wrapper[s]"].Add(*context.BioseqSetObjRef());
1587  }
1588  }
1589  }
1590  }
1591 }
1593 
1594 // FLATFILE_FIND
// One entry of the spelling table: the misspelled text to look for and its
// replacement; the table uses a null m_correct for entries with no replacement.
// NOTE(review): listing lines 1595 and 1599 are absent from this extract --
// presumably 1595 is `struct SpellFixData` (the type name used by
// FixTextInObject) and 1599 a third, boolean member matching the third
// initializer of every table row; restore them from the upstream file.
1596 {
1597  const char* m_misspell;
1598  const char* m_correct;
1600 };
1601 
// Rows of the spelling table: { misspelled text, replacement, bool flag }.
// The last five rows have no replacement (nullptr) and are reported as
// non-fixable by FLATFILE_FIND.
// NOTE(review): listing line 1602, the declaration that opens this
// initializer list, is absent from this extract -- presumably the definition
// of the `kSpellFixes` array referenced just below; the meaning of the boolean
// column cannot be told from here. Restore the line from the upstream file.
1603  { "Agricultutral", "agricultural", false },
1604  { "Bacilllus", "Bacillus", false },
1605  { "Enviromental", "Environmental", false },
1606  { "Insitiute", "institute", false },
1607  { "Instutite", "institute", false },
1608  { "Instutute", "Institute", false },
1609  { "P.R.Chian", "P.R. China", false },
1610  { "PRChian", "PR China", false },
1611  { "Scieces", "Sciences", false },
1612  { "agricultral", "agricultural", false },
1613  { "agriculturral", "agricultural", false },
1614  { "biotechnlogy", "biotechnology", false },
1615  { "Biotechnlogy", "Biotechnology", false },
1616  { "biotechnolgy", "biotechnology", false },
1617  { "biotechology", "biotechnology", false },
1618  { "caputre", "capture", true },
1619  { "casette", "cassette", true },
1620  { "catalize", "catalyze", false },
1621  { "charaterization", "characterization", false },
1622  { "clonging", "cloning", false },
1623  { "consevered", "conserved", false },
1624  { "cotaining", "containing", false },
1625  { "cytochome", "cytochrome", true },
1626  { "diveristy", "diversity", true },
1627  { "enivronment", "environment", false },
1628  { "enviroment", "environment", false },
1629  { "genone", "genome", true },
1630  { "homologue", "homolog", true },
1631  { "hypotethical", "hypothetical", false },
1632  { "hypotetical", "hypothetical", false },
1633  { "hypothetcial", "hypothetical", false },
1634  { "hypothteical", "hypothetical", false },
1635  { "indepedent", "independent", false },
1636  { "insititute", "institute", false },
1637  { "insitute", "institute", false },
1638  { "institue", "institute", false },
1639  { "instute", "institute", false },
1640  { "muesum", "museum", true },
1641  { "musuem", "museum", true },
1642  { "nuclear shutting", "nuclear shuttling", true },
1643  { "phylogentic", "phylogenetic", false },
1644  { "protien", "protein", false },
1645  { "puatative", "putative", false },
1646  { "putaitve", "putative", false },
1647  { "putaive", "putative", false },
1648  { "putataive", "putative", false },
1649  { "putatitve", "putative", false },
1650  { "putatuve", "putative", false },
1651  { "putatvie", "putative", false },
1652  { "pylogeny", "phylogeny", false },
1653  { "resaerch", "research", false },
1654  { "reseach", "research", false },
1655  { "reserach", "research", true },
1656  { "reserch", "research", false },
1657  { "ribosoml", "ribosomal", false },
1658  { "ribossomal", "ribosomal", false },
1659  { "scencies", "sciences", false },
1660  { "scinece", "science", false },
1661  { "simmilar", "similar", false },
1662  { "structual", "structural", false },
1663  { "subitilus", "subtilis", false },
1664  { "sulfer", "sulfur", false },
1665  { "technlogy", "technology", false },
1666  { "technolgy", "technology", false },
1667  { "Technlogy", "Technology", false },
1668  { "Veterinry", "Veterinary", false },
1669  { "Argricultural", "Agricultural", false },
1670  { "transcirbed", "transcribed", false },
1671  { "transcirption", "transcription", true },
1672  { "uiniversity", "university", false },
1673  { "uinversity", "university", false },
1674  { "univercity", "university", false },
1675  { "univerisity", "university", false },
1676  { "univeristy", "university", false },
1677  { "univesity", "university", false },
1678  { "unversity", "university", true },
1679  { "uviversity", "university", false },
1680  { "anaemia", nullptr, false },
1681  { "haem", nullptr, false },
1682  { "haemagglutination", nullptr, false },
1683  { "heam", nullptr, false },
1684  { "mithocon", nullptr, false },
1685 };
1686 
1687 static const size_t kSpellFixesSize = ArraySize(kSpellFixes);
1688 static const string kFixable = "Fixable";
1689 static const string kNonFixable = "Non-fixable";
1690 
1691 
1692 static void FindFlatfileText(const char* str, bool *result)
1693 {
1694 #include "FLATFILE_FIND.inc"
1695  static constexpr TLocalFSM s_FSM{s_compact, s_hits_init_1, s_hits_init_2, s_states, nullptr};
1696 
1697  CMultipatternSearch::Search(str, s_FSM, [result](size_t n){ result[n] = true; });
1698 }
1699 
1700 
1701 /// Checking that FLATFILE_FIND.inc is in sync with kSpellFixes
1702 /// If the array is changed, need to regenerate FLATFILE_FIND.inc:
1703 /// multipattern.exe -i FLATFILE_FIND.txt > FLATFILE_FIND.inc
// For every table entry, clears the hit array, searches the entry's own
// misspelled text and checks that the search flags that same index.
// NOTE(review): listing lines 1704 and 1712-1713 are absent from this extract
// -- presumably the header that owns this body, and the statements that
// append to `error` and raise it when an entry is not found; restore them from
// the upstream file.
1705 {
1706  bool Found[kSpellFixesSize];
1707  string error = "String not found: ";
1708  for (size_t i = 0; i < kSpellFixesSize; i++) {
1709  fill(Found, Found + kSpellFixesSize, 0);
1710  FindFlatfileText(kSpellFixes[i].m_misspell, Found);
1711  if (!Found[i]) {
1714  }
1715  }
1716 }
1717 
1718 
1719 DISCREPANCY_CASE1(FLATFILE_FIND, SEQUENCE, eOncaller, "Flatfile representation of object contains suspect text",
1720  "FLATFILE_FIND_ONCALLER",
1721  "FLATFILE_FIND_ONCALLER_UNFIXABLE",
1722  "FLATFILE_FIND_ONCALLER_FIXABLE"
1723  )
1724 {
1725  bool Found[kSpellFixesSize];
1726  for (auto& desc : context.GetAllSeqdesc()) {
1727  fill(Found, Found + kSpellFixesSize, 0);
1728  for (CStdTypeConstIterator<string> it(desc); it; ++it) {
1729  FindFlatfileText(it->c_str(), Found);
1730  }
1731  for (size_t i = 0; i < kSpellFixesSize; i++) {
1732  if (Found[i]) {
1733  string subitem = string("[n] object[s] contain[S] ") + kSpellFixes[i].m_misspell;
1734  bool autofix = kSpellFixes[i].m_correct != nullptr;
1735  const string& fixable = (autofix ? kFixable : kNonFixable);
1736  m_Objs[fixable][subitem].Add(*context.SeqdescObjRef(desc, &desc));
1737  }
1738  }
1739  }
1740  for (auto& feat: context.FeatAll()) {
1741  fill(Found, Found + kSpellFixesSize, 0);
1742  for (CStdTypeConstIterator<string> it(*feat); it; ++it) {
1743  FindFlatfileText(it->c_str(), Found);
1744  }
1745  for (size_t i = 0; i < kSpellFixesSize; i++) {
1746  if (Found[i]) {
1747  string subitem = string("[n] object[s] contain[S] ") + kSpellFixes[i].m_misspell;
1748  bool autofix = kSpellFixes[i].m_correct != nullptr;
1749  const string& fixable = (autofix ? kFixable : kNonFixable);
1750  m_Objs[fixable][subitem].Add(*context.SeqFeatObjRef(*feat, feat));
1751  }
1752  }
1753  }
1754 }
1755 
1756 
// Exports the collected nodes (Export's second argument is false) and keeps
// the sub-items of the result as the report items.
// NOTE(review): listing line 1757, the header that owns this body, is absent
// from this extract -- presumably DISCREPANCY_SUMMARIZE(FLATFILE_FIND), given
// the preceding test case; restore it from the upstream file.
1758 {
1759  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1760 }
1761 
1762 
1763 static bool FixTextInObject(CSerialObject* obj, size_t misspell_idx)
1764 {
1765  bool ret = false;
1766  const SpellFixData& fix_data = kSpellFixes[misspell_idx];
1767  for (CStdTypeIterator<string> it(*obj); it; ++it) {
1768  if (NStr::Find(*it, fix_data.m_misspell) != NPOS) {
1769  NStr::ReplaceInPlace(*it, fix_data.m_misspell, fix_data.m_correct, 0, -1);
1770  ret = true;
1771  }
1772  }
1773  return ret;
1774 }
1775 
1776 
1777 DISCREPANCY_AUTOFIX(FLATFILE_FIND)
1778 {
1779  unsigned int n = 0;
1780  bool Found[kSpellFixesSize];
1781  const CSeq_feat* feat = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
1782  const CSeqdesc* desc = dynamic_cast<const CSeqdesc*>(context.FindObject(*obj));
1783  fill(Found, Found + kSpellFixesSize, 0);
1784  if (feat) {
1785  for (CStdTypeConstIterator<string> it(*feat); it; ++it) {
1786  FindFlatfileText(it->c_str(), Found);
1787  }
1788  for (size_t i = 0; i < kSpellFixesSize; i++) {
1789  if (Found[i]) {
1790  if (FixTextInObject(const_cast<CSeq_feat*>(feat), i)) {
1791  ++n;
1792  }
1793  }
1794  }
1795  }
1796  if (desc) {
1797  for (CStdTypeConstIterator<string> it(*desc); it; ++it) {
1798  FindFlatfileText(it->c_str(), Found);
1799  }
1800  for (size_t i = 0; i < kSpellFixesSize; i++) {
1801  if (Found[i]) {
1802  if (FixTextInObject(const_cast<CSeqdesc*>(desc), i)) {
1803  ++n;
1804  }
1805  }
1806  }
1807  }
1808  obj->SetFixed();
1809  return CRef<CAutofixReport>(new CAutofixReport("FLATFILE_FIND: [n] suspect text[s] [is] fixed", n));
1810 }
1811 
1812 
1813 // ALL_SEQS_SHORTER_THAN_20kb
1814 
1815 static const size_t MIN_SEQUENCE_LEN = 20000;
1816 
1817 
1818 DISCREPANCY_CASE(ALL_SEQS_SHORTER_THAN_20kb, SEQUENCE, eDisc | eSubmitter | eSmart | eBig, "Short sequences test")
1819 {
1820  if (context.CurrentBioseqSummary().Len > MIN_SEQUENCE_LEN) {
1821  m_Objs[kEmptyStr];
1822  }
1823 }
1824 
1825 
1826 DISCREPANCY_SUMMARIZE(ALL_SEQS_SHORTER_THAN_20kb)
1827 {
1828  if (m_Objs.GetMap().find(kEmptyStr) == m_Objs.GetMap().end()) {
1829  // no sequences longer than 20000 nt
1830  m_Objs["No sequences longer than 20,000 nt found"];
1831  }
1832  else {
1833  m_Objs.GetMap().erase(kEmptyStr);
1834  }
1835  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1836 }
1837 
1838 
1839 
// ALL_SEQS_CIRCULAR: keeps three counters in m_Objs while visiting nucleotide
// Bioseqs:
//   "N" - incremented for a Bioseq that is not circular; once it is non-zero
//         later Bioseqs are skipped;
//   "C" - incremented for a circular Bioseq whose source genome is neither
//         plasmid nor chromosome;
//   "F" - incremented at most once, when such a circular Bioseq either passes
//         the accession check below or has MolInfo tech wgs, tsa or targeted.
// The summary step uses "F" to choose the severity.
// NOTE(review): listing lines 1858-1859 are absent from this extract --
// presumably a further condition on the text Seq-id that guards the "F"
// increment and opens the brace closed on line 1862; restore from upstream.
1840 DISCREPANCY_CASE(ALL_SEQS_CIRCULAR, SEQUENCE, eDisc | eSubmitter | eSmart, "All sequences circular")
1841 {
1842  const CBioseq& bioseq = context.CurrentBioseq();
1843  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
// A non-circular sequence was already seen: nothing more to learn.
1844  if (m_Objs["N"].GetCount()) {
1845  return;
1846  }
1847  if (bioseq.GetInst().CanGetTopology() && bioseq.GetInst().GetTopology() == CSeq_inst::eTopology_circular) {
1848  const CSeqdesc* biosrc = context.GetBiosource();
// Plasmids and chromosomes are ignored by this test.
1849  if (biosrc && biosrc->GetSource().IsSetGenome() && (biosrc->GetSource().GetGenome() == CBioSource::eGenome_plasmid || biosrc->GetSource().GetGenome() == CBioSource::eGenome_chromosome)) {
1850  return;
1851  }
1852  m_Objs["C"].Incr();
1853  if (!m_Objs["F"].GetCount()) {
1854  if (bioseq.IsSetId()) {
1855  for (auto id : bioseq.GetId()) {
1856  const CTextseq_id* txt = id->GetTextseq_Id();
1857  if (txt && txt->IsSetAccession()) {
1860  m_Objs["F"].Incr();
1861  return;
1862  }
1863  }
1864  }
1865  }
1866  if (bioseq.IsSetDescr() && bioseq.GetDescr().IsSet()) {
1867  for (const auto& descr : bioseq.GetDescr().Get()) {
1868  if (descr->IsMolinfo() && descr->GetMolinfo().CanGetTech()) {
1869  if (descr->GetMolinfo().GetTech() == CMolInfo::eTech_wgs || descr->GetMolinfo().GetTech() == CMolInfo::eTech_tsa || descr->GetMolinfo().GetTech() == CMolInfo::eTech_targeted) {
1870  m_Objs["F"].Incr();
1871  return;
1872  }
1873  }
1874  }
1875  }
1876  }
1877  }
1878  else {
1879  m_Objs["N"].Incr();
1880  }
1881  }
1882 }
1883 
1884 
1885 DISCREPANCY_SUMMARIZE(ALL_SEQS_CIRCULAR)
1886 {
1887  CReportNode rep;
1888  if (m_Objs["C"].GetCount() && !m_Objs["N"].GetCount()) {
1889  rep["All ([n]) sequences are circular"].Severity(m_Objs["F"].GetCount() ? CReportItem::eSeverity_error : CReportItem::eSeverity_warning).SetCount(m_Objs["C"].GetCount());
1890  m_ReportItems = rep.Export(*this, false)->GetSubitems();
1891  }
1892 }
1893 
1894 
1895 // SUSPICIOUS_SEQUENCE_ID
1896 
1897 static constexpr auto suspicious_id_re = ctll::fixed_string{ "chromosome|plasmid|mito|chloroplast|apicoplast|plastid|^chr|^lg|\\bnw_|\\bnz_|\\bnm_|\\bnc_|\\bac_|cp\\d\\d\\d\\d\\d\\d|^x$|^y$|^z$|^w$|^mt$|^pltd$|^chl$" };
1898 
1899 static bool SuspiciousId(const string& s)
1900 {
1901 #if 0
1902  static CRegexp regexp("chromosome|plasmid|mito|chloroplast|apicoplast|plastid|^chr|^lg|\\bNW_|\\bNZ_|\\bNM_|\\bNC_|\\bAC_|CP\\d\\d\\d\\d\\d\\d|^X$|^Y$|^Z$|^W$|^MT$|^PLTD$|^CHL$", CRegexp::fCompile_ignore_case);
1903  return regexp.IsMatch(s);
1904 #else
1905  string id = s;
1906  NStr::ToLower(id);
1907  return ctre::search<suspicious_id_re>(id);
1908 #endif
1909 }
1910 
1911 DISCREPANCY_CASE(SUSPICIOUS_SEQUENCE_ID, SEQUENCE, eOncaller | eSubmitter | eSmart | eBig, "Suspicious sequence identifiers")
1912 {
1913  const CBioseq& bioseq = context.CurrentBioseq();
1914  if (bioseq.CanGetId()) {
1915  bool report = false;
1916  for (const auto& id : bioseq.GetId()) {
1917  if (id->IsLocal()) {
1918  if (id->GetLocal().IsStr() && SuspiciousId(id->GetLocal().GetStr())) {
1919  report = true;
1920  break;
1921  }
1922  }
1923  else if (id->IsGeneral()) {
1924  if (id->GetGeneral().IsSetDb() && SuspiciousId(id->GetGeneral().GetDb())) {
1925  report = true;
1926  break;
1927  }
1928  if (id->GetGeneral().IsSetTag() && id->GetGeneral().GetTag().IsStr() && SuspiciousId(id->GetGeneral().GetTag().GetStr())) {
1929  report = true;
1930  break;
1931  }
1932  }
1933  }
1934  if (report) {
1935  m_Objs["[n] sequence[s] [has] suspicious identifiers"].Add(*context.BioseqSetObjRef());
1936  }
1937  }
1938 }
1939 
1940 
1941 DISCREPANCY_SUMMARIZE(SUSPICIOUS_SEQUENCE_ID)
1942 {
1943  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1944 }
1945 
1946 
1947 // CHROMOSOME_PRESENT
1948 
1950 
// Decides whether a source location (genome value) and a SubSource qualifier
// are acceptable together for the CHROMOSOME_PRESENT test; `false` is what
// makes the caller report. A plasmid-name qualifier and a plasmid location
// are always accepted, as is any location or qualifier that reaches a
// `default` label.
// NOTE(review): several listing lines are absent from this extract:
//   1949/1951 - presumably a declaration and the function header (the caller
//               uses the name s_areCompatible(Location, Qualifier));
//   1959      - presumably the `case` label(s) that lead to `return false`;
//   1962      - presumably a further `case` label sharing the nested switch;
//   1965-1966 - presumably the qualifier `case` labels that lead to
//               `return false`.
// Restore them from the upstream file; the exact labels cannot be told from here.
1952 {
1953  if (Qualifier == CSubSource::eSubtype_plasmid_name) {
1954  return true; // always OK by this test; might be handled elsewhere
1955  }
1956 
1957  switch (Location)
1958  {
1960  return false;
1961  case CBioSource::eGenome_unknown: // not present
1963  switch (Qualifier)
1964  {
1967  return false;
1968 // case eSubtype_unknown: // not present
1969  default:
1970  return true;
1971  }
1972  case CBioSource::eGenome_plasmid: // always OK by this test; might be handled elsewhere
1973  default:
1974  return true;
1975  }
1976 
1977 }
1978 
// CHROMOSOME_PRESENT: for every entry of the current Bioseq-set that has
// descriptors, looks at each source descriptor. Plasmid sources are skipped.
// For the others, every SubSource qualifier (or eSubtype_unknown when the
// source has none) is checked against the location with s_areCompatible(); an
// incompatible pair creates the "one or more chromosomes are present" node.
// NOTE(review): listing lines 1994 and 2005 are absent from this extract --
// presumably the declarations (with default values) of `Location` and
// `Qualifier`; restore them from the upstream file.
1979 DISCREPANCY_CASE(CHROMOSOME_PRESENT, SEQ_SET, eSubmitter | eSmart, "Chromosome present")
1980 {
1981  const CBioseq_set& set = context.CurrentBioseq_set();
1982  if (set.IsSetSeq_set()) {
1983  for (const auto& se : set.GetSeq_set()) {
1984  if (!se->IsSetDescr()) {
1985  continue;
1986  }
1987 
1988  for (const auto& descr : se->GetDescr().Get()) {
1989  if (!descr->IsSource()) {
1990  continue;
1991  }
1992  const CBioSource& bio_src = descr->GetSource();
1993 
1995  if (bio_src.IsSetGenome()) {
1996  Location = static_cast<CBioSource::EGenome>(bio_src.GetGenome());
1997  }
1998  // shortcut
1999  if (Location == CBioSource::eGenome_plasmid) {
2000  continue; // always OK by this test; might be handled elsewhere
2001  }
2002 
2003  if (bio_src.IsSetSubtype()) {
2004  for (const auto& subtype : bio_src.GetSubtype()) {
2006  if (subtype->IsSetSubtype()) {
2007  Qualifier = static_cast<CSubSource::ESubtype>(subtype->GetSubtype());
2008  }
2009  if (!s_areCompatible(Location, Qualifier)) {
// Marker node: touching the key creates the report entry.
2010  m_Objs["one or more chromosomes are present"];
2011  }
2012  }
2013  } else {
2014  if (!s_areCompatible(Location, eSubtype_unknown)) {
2015  m_Objs["one or more chromosomes are present"];
2016  }
2017  }
2018  }
2019  }
2020  }
2021 }
2022 
2023 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
const string & GetTaxname(void) const
Definition: BioSource.cpp:340
bool IsSetOrgMod(void) const
Definition: BioSource.cpp:415
const COrgName & GetOrgname(void) const
Definition: BioSource.cpp:410
bool IsSetTaxname(void) const
Definition: BioSource.cpp:335
CBioseq_EditHandle –.
CBioseq_Handle –.
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
bool IsSetLength(void) const
Definition: Bioseq.cpp:355
bool IsNa(void) const
Definition: Bioseq.cpp:345
bool IsAa(void) const
Definition: Bioseq.cpp:350
static string GetStructuredCommentPrefix(const CUser_object &user, bool normalize=true)
static bool IsGui()
CFeat_CI –.
Definition: feat_ci.hpp:64
void Search(const char *input, VoidCall1 found_callback) const
CRef –.
Definition: ncbiobj.hpp:618
CRegexp –.
Definition: regexp.hpp:74
virtual vector< CRef< CReportItem > > GetSubitems() const =0
static void Add(TReportObjectList &list, TReportObjectSet &hash, CReportObj &obj, bool unique=true)
TReportObjectList & GetObjects()
CReportNode & Severity(CReportItem::ESeverity s)
void SetCount(size_t n)
TNodeMap & GetMap()
CRef< CReportItem > Export(CDiscrepancyCore &test, bool unique=true) const
static bool Exist(TReportObjectSet &hash, CReportObj &obj)
CScope –.
Definition: scope.hpp:92
static EFeatureLocationAllowed AllowedFeatureLocation(ESubtype subtype)
@ eFeatureLocationAllowed_NucOnly
@ eFeatureLocationAllowed_ProtOnly
@ eFeatureLocationAllowed_Any
ESubtype GetSubtype(void) const
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
CSeq_feat_EditHandle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
static bool IsAa(EMol mol)
Definition: Seq_inst.hpp:99
static bool IsNa(EMol mol)
Definition: Seq_inst.hpp:90
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
Base class for all serializable objects.
Definition: serialbase.hpp:150
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
Template class for iteration on objects of class C.
Definition: iterator.hpp:673
@ eObjectType_StructuredComment
EObjectType GetObjectType() const
void erase(iterator pos)
Definition: map.hpp:167
size_type size() const
Definition: map.hpp:148
Definition: map.hpp:338
Definition: set.hpp:45
API (CDeflineGenerator) for computing sequences' titles ("definitions").
static uch flags
vector< CRef< CReportObj > > TReportObjectList
@ eFatal
@ eAll
@ eTSA
@ eBig
@ eDisc
@ eOncaller
@ eSubmitter
@ eSmart
#define DISCREPANCY_AUTOFIX(name)
#define DISCREPANCY_CASE1(name, type, group, descr,...)
#define DISCREPANCY_CASE0(name, sname, type, group, descr)
#define DISCREPANCY_CASE(name, type, group, descr)
#define DISCREPANCY_SUMMARIZE(name)
vector< CConstRef< CObject > > GetObjects(CSeq_entry_Handle seh, const string &field, CFieldNamePanel::EFieldType field_type, int subtype, const string &ncRNA_class, CConstRef< objects::CSeq_submit > submit, CRef< CEditingActionConstraint > constraint, vector< CSeq_entry_Handle > *descr_context=nullptr)
#define bool
Definition: bool.h:34
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static const char * str(char *buf, int n)
Definition: stats.c:84
static char tmp[3200]
Definition: utf8.c:42
static FILE * f
Definition: readconf.c:23
void ReverseComplement(const BidirectionalIterator &first, const BidirectionalIterator &last)
constexpr size_t ArraySize(const Element(&)[Size])
Definition: ncbimisc.hpp:1532
string
Definition: cgiapp.hpp:690
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
@ eUnknown
Definition: app_popup.hpp:72
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
@ eAcc_wgs
Definition: Seq_id.hpp:290
@ eAcc_division_mask
Definition: Seq_id.hpp:299
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void SetDescr(TDescr &v) const
void SetInst_Mol(TInst_Mol v) const
const CSeqFeatData & GetData(void) const
void Remove(void) const
Remove the feature from Seq-annot.
void SetInst(TInst &v) const
void Remove(ERemoveMode mode=eRemoveSeq_entry) const
bool IsSetData(void) const
const TInst & GetInst(void) const
const CSeq_feat & GetMappedFeature(void) const
Feature mapped to the master sequence.
CConstRef< CSeq_feat > GetSeq_feat(void) const
Get current seq-feat.
bool IsMatch(CTempString str, TMatch flags=fMatch_default)
Check existence substring which match a specified pattern.
Definition: regexp.cpp:253
@ fCompile_ignore_case
Definition: regexp.hpp:111
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
#define kEmptyStr
Definition: ncbistr.hpp:123
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2984
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5078
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2882
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5406
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5347
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5378
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3396
const char *const kEmptyCStr
Empty "C" string (points to a '\0').
Definition: ncbistr.cpp:68
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
static int CompareCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive compare of a substring with another string.
Definition: ncbistr.cpp:135
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
static const char label[]
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
Definition: BioSource_.hpp:539
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: BioSource_.hpp:527
void SetGenome(TGenome value)
Assign a value to Genome data member.
Definition: BioSource_.hpp:428
bool IsSetGenome(void) const
Check if a value has been assigned to Genome data member.
Definition: BioSource_.hpp:397
EGenome
biological context
Definition: BioSource_.hpp:97
const TStr & GetStr(void) const
Get the variant data.
bool IsSetData(void) const
the object itself Check if a value has been assigned to Data data member.
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool IsSetType(void) const
type of object within class Check if a value has been assigned to Type data member.
bool IsStrs(void) const
Check if variant Strs is selected.
const TStrs & GetStrs(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
bool IsStr(void) const
Check if variant Str is selected.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TData & GetData(void) const
Get the Data member data.
const TType & GetType(void) const
Get the Type member data.
vector< CStringUTF8 > TStrs
const TMod & GetMod(void) const
Get the Mod member data.
Definition: OrgName_.hpp:839
@ eSubtype_strain
Definition: OrgMod_.hpp:85
@ eSubtype_isolate
Definition: OrgMod_.hpp:100
bool IsSetData(void) const
the specific data Check if a value has been assigned to Data data member.
Definition: Seq_feat_.hpp:913
bool IsSetQual(void) const
qualifiers Check if a value has been assigned to Qual data member.
Definition: Seq_feat_.hpp:1135
const TQual & GetQual(void) const
Get the Qual member data.
Definition: Seq_feat_.hpp:1147
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
const TAccession & GetAccession(void) const
Get the Accession member data.
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eClass_pop_set
population study
@ eClass_phy_set
phylogenetic study
@ eClass_mut_set
set of mutations
@ eClass_eco_set
ecological sample study
@ eClass_small_genome_set
viral segments or mitochondrial minicircles
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
bool IsSetCompleteness(void) const
Check if a value has been assigned to Completeness data member.
Definition: MolInfo_.hpp:569
const TUser & GetUser(void) const
Get the variant data.
Definition: Seqdesc_.cpp:384
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
TTopology GetTopology(void) const
Get the Topology member data.
Definition: Seq_inst_.hpp:733
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
bool IsSetRepr(void) const
Check if a value has been assigned to Repr data member.
Definition: Seq_inst_.hpp:546
bool IsSetMol(void) const
Check if a value has been assigned to Mol data member.
Definition: Seq_inst_.hpp:593
bool CanGetTopology(void) const
Check if it is safe to call GetTopology method.
Definition: Seq_inst_.hpp:714
const TTitle & GetTitle(void) const
Get the variant data.
Definition: Seqdesc_.hpp:1032
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
bool IsSetBiomol(void) const
Check if a value has been assigned to Biomol data member.
Definition: MolInfo_.hpp:422
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:366
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
bool IsSetInst(void) const
the sequence data Check if a value has been assigned to Inst data member.
Definition: Bioseq_.hpp:324
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
TMol GetMol(void) const
Get the Mol member data.
Definition: Seq_inst_.hpp:612
bool IsSetLength(void) const
length of sequence in residues Check if a value has been assigned to Length data member.
Definition: Seq_inst_.hpp:640
TSource & SetSource(void)
Select the variant.
Definition: Seqdesc_.cpp:572
bool IsSetDescr(void) const
descriptors Check if a value has been assigned to Descr data member.
Definition: Bioseq_.hpp:303
bool IsSet(void) const
Check if a value has been assigned to data member.
Definition: Seq_descr_.hpp:154
TBiomol GetBiomol(void) const
Get the Biomol member data.
Definition: MolInfo_.hpp:447
void SetBiomol(TBiomol value)
Assign a value to Biomol data member.
Definition: MolInfo_.hpp:453
bool CanGetId(void) const
Check if it is safe to call GetId method.
Definition: Bioseq_.hpp:284
bool IsSetTech(void) const
Check if a value has been assigned to Tech data member.
Definition: MolInfo_.hpp:472
const TFtable & GetFtable(void) const
Get the variant data.
Definition: Seq_annot_.hpp:621
TCompleteness GetCompleteness(void) const
Get the Completeness member data.
Definition: MolInfo_.hpp:594
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_annot_.hpp:873
bool IsSetId(void) const
equivalent identifiers Check if a value has been assigned to Id data member.
Definition: Bioseq_.hpp:278
Tdata & Set(void)
Assign a value to data member.
Definition: Seq_descr_.hpp:172
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
const TMolinfo & GetMolinfo(void) const
Get the variant data.
Definition: Seqdesc_.cpp:588
TMolinfo & SetMolinfo(void)
Select the variant.
Definition: Seqdesc_.cpp:594
bool CanGetInst(void) const
Check if it is safe to call GetInst method.
Definition: Bioseq_.hpp:330
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eCompleteness_unknown
Definition: MolInfo_.hpp:155
@ eCompleteness_complete
complete biological entity
Definition: MolInfo_.hpp:156
@ eTech_targeted
targeted locus sets/studies
Definition: MolInfo_.hpp:147
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eBiomol_pre_RNA
precursor RNA of any sort really
Definition: MolInfo_.hpp:102
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
where boath are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is smart and slim</td> n<td> orig</td> n</tr> n<tr> n<td> last_modified</td> n<td> optional</td> n<td> Integer</td> n<td class=\"description\"> The blob last modification If provided then the exact match will be requested with n the Cassandra storage corresponding field value</td> n<td> Positive integer Not provided means that the most recent match will be selected</td> n<td></td> n</tr> n<tr> n<td> use_cache</td> n<td> optional</td> n<td> String</td> n<td class=\"description\"> The option controls if the Cassandra LMDB cache and or database should be used It n affects the seq id resolution step and the blob properties lookup step The following n options are BIOSEQ_INFO and BLOB_PROP at all
int i
yy_size_t n
void ReverseComplementFeature(CSeq_feat &feat, CScope &scope)
Definition: loc_edit.cpp:1068
static MDB_envinfo info
Definition: mdb_load.c:37
Simultaneous search of multiple RegEx patterns in the input string.
const struct ncbi::grid::netcache::search::fields::KEY key
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
const CharType(& source)[N]
Definition: pointer.h:1149
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int ispunct(Uchar c)
Definition: ncbictype.hpp:68
Int mod(Int i, Int j)
Definition: njn_integer.hpp:67
#define M
#define count
static const string kFixable
const string kStructuredCommentReport
const string & kPreviouslySeenFields
static const string kMrnaSequenceMinusStrandFeatures
USING_SCOPE(objects)
static bool IsSegmentSubtype(const CBioSource &bio_src)
static bool s_areCompatible(CBioSource::EGenome Location, CSubSource::ESubtype Qualifier)
void UnitTest_FLATFILE_FIND()
Checking that FLATFILE_FIND.inc is in sync with kSpellFixes If the array is changed,...
string AdjustDBLinkFieldName(const string &orig_field_name)
static const CSubSource::ESubtype eSubtype_unknown
const string kMissingDBLink
const string kStructuredCommentObservedPrefixes
const string kSomeIdenticalDeflines
const string &(CBioSource::* FnGet)() const
string GetFieldValueAsString(const CUser_field &field)
const string & kPreviouslySeenObjects
static const string kNonFixable
static bool SuspiciousId(const string &s)
static const string kInconsistentMolinfoTech
void AddUserObjectFieldItems(const CSeqdesc *desc, CReportObj &rep_seq, CReportNode &collector, CReportNode &previously_seen, CDiscrepancyContext &context, const string &object_name, const string &field_prefix=kEmptyStr)
const string kDBLinkObjectList
static const string kInconsistentMolinfoTechSummary
const string kStructuredCommentObservedPrefixesThis
static bool IsATGC(char ch)
string GetSummaryLabel(bool all_present, bool all_same)
static const size_t MIN_SEQUENCE_LEN
const string & kPreviouslySeenFieldsThis
static bool EndsWithSequence(const string &title)
static bool FixTextInObject(CSerialObject *obj, size_t misspell_idx)
void AnalyzeFieldReport(CReportNode &node, bool &all_present, bool &all_same)
const string kSequencesWithGaps
const string kIdenticalDeflines
const string kDeflineExists
static void FindFlatfileText(const char *str, bool *result)
static bool IsMolProd(int biomol)
void AnalyzeField(CReportNode &node, bool &all_present, bool &all_same)
static bool CompareOrGetString(const CBioSource &bio_src, FnIsSet is_set_fn, FnGet get_fn, string &val)
void CopyNode(CReportNode &new_home, CReportNode &original)
const string kStructuredCommentPrevious
const string kNoTaxnameInDefline
static SpellFixData kSpellFixes[]
static bool CompareOrgModValue(const CBioSource &bio_src, COrgMod::TSubtype subtype, string &val)
static constexpr auto suspicious_id_re
const string kStructuredCommentFieldPrefix
static const size_t kSpellFixesSize
const string kDBLinkFieldCountTop
static string GetProjectID(const CUser_object &user)
const string kUniqueDeflines
const string kAllUniqueDeflines
static bool IsMicroSatellite(const CSeq_feat &feat)
const string kDBLinkCollect
bool(CBioSource::* FnIsSet)() const
const string kStructuredCommentsSeqs
const char * m_misspell
const char * m_correct
else result
Definition: token2.c:20
C++ wrappers for the Perl-compatible regular expression (PCRE) library.
static CS_CONTEXT * context
Definition: will_convert.c:21
#define const
Definition: zconf.h:232
Modified on Fri Sep 20 14:57:09 2024 by modify_doxy.py rev. 669887