NCBI C++ ToolKit
discrepancy_case.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: discrepancy_case.cpp 100293 2023-07-17 20:59:36Z kans $
2  * =========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * =========================================================================
25  *
26  * Authors: Sema Kachalo
27  *
28  */
29 
30 #include <ncbi_pch.hpp>
31 #include "discrepancy_core.hpp"
32 #include "utils.hpp"
42 #include <objects/seq/MolInfo.hpp>
43 #include <objmgr/bioseq_ci.hpp>
44 #include <objmgr/feat_ci.hpp>
45 #include <objmgr/seq_vector.hpp>
46 #include <objmgr/util/feature.hpp>
47 #include <objmgr/bioseq_handle.hpp>
49 #include <objmgr/seqdesc_ci.hpp>
50 #include <sstream>
51 
55 
56 
57 // COUNT_NUCLEOTIDES
58 
59 DISCREPANCY_CASE(COUNT_NUCLEOTIDES, SEQUENCE, eOncaller | eSubmitter | eSmart | eBig, "Count nucleotide sequences")
60 {
61  const CBioseq& bioseq = context.CurrentBioseq();
62  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
63  auto& node = m_Objs["[n] nucleotide Bioseq[s] [is] present"].Info();
64  node.Incr();
65  node.GetCount();
66  node.Add(*context.BioseqObjRef());
67  }
68 }
69 
70 
71 DISCREPANCY_SUMMARIZE(COUNT_NUCLEOTIDES)
72 {
73  m_Objs["[n] nucleotide Bioseq[s] [is] present"]; // If no sequences found still report 0
74  xSummarize();
75 }
76 
77 
78 // COUNT_PROTEINS
79 
80 DISCREPANCY_CASE(COUNT_PROTEINS, SEQUENCE, eDisc, "Count Proteins")
81 {
82  const CBioseq& bioseq = context.CurrentBioseq();
83  if (bioseq.CanGetInst() && bioseq.GetInst().IsAa()) {
84  auto& node = m_Objs["[n] protein sequence[s] [is] present"].Info();
85  node.Incr();
86  node.GetCount();
87  node.Add(*context.BioseqObjRef());
88  }
89 }
90 
91 
92 static const CSeq_id* GetProteinId(const CBioseq& seq)
93 {
94  for (auto& id_it : seq.GetId()) {
95  const CSeq_id& seq_id = *id_it;
96  if (seq_id.IsGeneral() && !seq_id.GetGeneral().IsSkippable()) {
97  return &seq_id;
98  }
99  }
100  return nullptr;
101 }
102 
103 
104 // MISSING_PROTEIN_ID
105 DISCREPANCY_CASE(MISSING_PROTEIN_ID, SEQUENCE, eDisc | eSubmitter | eSmart | eFatal, "Missing Protein ID")
106 {
107  const CBioseq& bioseq = context.CurrentBioseq();
108  if (bioseq.CanGetInst() && bioseq.GetInst().IsAa() && !GetProteinId(bioseq)) {
109  m_Objs["[n] protein[s] [has] invalid ID[s]."].Add(*context.BioseqObjRef()).Fatal();
110  }
111 }
112 
113 
114 // INCONSISTENT_PROTEIN_ID
115 DISCREPANCY_CASE(INCONSISTENT_PROTEIN_ID, SEQUENCE, eDisc | eSubmitter | eSmart | eFatal, "Inconsistent Protein ID")
116 {
117  const CBioseq& bioseq = context.CurrentBioseq();
118  if (bioseq.CanGetInst() && bioseq.GetInst().IsAa()) {
119  const CSeq_id* protein_id = GetProteinId(bioseq);
120  if (protein_id) {
121  _ASSERT(protein_id->IsGeneral());
122  CTempString protein_id_prefix(GET_STRING_FLD_OR_BLANK(protein_id->GetGeneral(), Db));
123  if (protein_id_prefix.empty()) {
124  return;
125  }
126  string protein_id_prefix_lowercase = protein_id_prefix;
127  NStr::ToLower(protein_id_prefix_lowercase);
128  // find (or create if none before) the canonical form of the
129  // protein_id_prefix since case-insensitivity means it could have
130  // multiple forms. Here, the canonical form is the way it appears
131  // the first time.
132  CReportNode& canonical_forms_node = m_Objs["canonical forms"][protein_id_prefix_lowercase];
133  string canonical_protein_id_prefix;
134  if (canonical_forms_node.empty()) {
135  // haven't seen this protein_id_prefix_lowercase before so we have
136  // to set the canonical form.
137  canonical_protein_id_prefix = protein_id_prefix;
138  canonical_forms_node[protein_id_prefix];
139  }
140  else {
141  // use previously set canonical form;
142  canonical_protein_id_prefix = canonical_forms_node.GetMap().begin()->first;
143  }
144  _ASSERT(NStr::EqualNocase(protein_id_prefix, canonical_protein_id_prefix));
145  m_Objs[kEmptyStr]["[n] sequence[s] [has] protein ID prefix [(]" + canonical_protein_id_prefix].Fatal().Add(*context.BioseqObjRef());
146  }
147  }
148 }
149 
150 
151 DISCREPANCY_SUMMARIZE(INCONSISTENT_PROTEIN_ID)
152 {
153  // if _all_ are identical, don't report
154  CReportNode& reports_collected = m_Objs[kEmptyStr];
155  if( reports_collected.GetMap().size() <= 1 ) {
156  // if there are no sequences or all sequences have the same
157  // canonical protein id, then do not show any discrepancy
158  return;
159  }
160 
161  m_ReportItems = reports_collected.Export(*this)->GetSubitems();
162 }
163 
164 
165 // N_RUNS
166 
167 DISCREPANCY_CASE(N_RUNS, SEQUENCE, eDisc | eSubmitter | eSmart | eBig | eFatal, "More than 10 Ns in a row")
168 {
169  const CBioseq& bioseq = context.CurrentBioseq();
170  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
171  const CSeqSummary& sum = context.GetSeqSummary();
172  if (!sum.HasRef && sum.NRuns.size()) { // !context.SequenceHasFarPointers()
173  string details;
174  for (const auto& p: sum.NRuns) {
175  details += (details.empty() ? " " : ", ") + to_string(p.first) + "-" + to_string(p.second);
176  }
177  m_Objs["[n] sequence[s] [has] runs of 10 or more Ns"][sum.Label + " has runs of Ns at the following locations: " + details].Ext().Fatal().Add(*context.BioseqObjRef());
178  }
179  }
180 }
181 
182 
183 // PERCENT_N
184 
185 DISCREPANCY_CASE(PERCENT_N, SEQUENCE, eDisc | eSubmitter | eSmart | eBig, "More than 5 percent Ns")
186 {
187  const CBioseq& bioseq = context.CurrentBioseq();
188  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
189  const CSeqSummary& sum = context.GetSeqSummary();
190  if (!sum.HasRef && sum.N * 100. / sum.Len > 5) { // !context.SequenceHasFarPointers()
191  m_Objs["[n] sequence[s] [has] more than 5% Ns"].Add(*context.BioseqObjRef());
192  }
193  }
194 }
195 
196 
197 // INTERNAL_TRANSCRIBED_SPACER_RRNA
198 
199 static const char* kRRNASpacer[] = { "internal", "transcribed", "spacer" };
200 static const size_t kRRNASpacer_len = ArraySize(kRRNASpacer);
201 
202 DISCREPANCY_CASE(INTERNAL_TRANSCRIBED_SPACER_RRNA, FEAT, eOncaller, "Look for rRNAs that contain either 'internal', 'transcribed' or 'spacer'")
203 {
204  for (const CSeq_feat& feat : context.GetFeat()) {
205  if (feat.IsSetData() && feat.GetData().IsRna() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_rRNA) {
206  const string rna_name = feat.GetData().GetRna().GetRnaProductName();
207  for (size_t i = 0; i < kRRNASpacer_len; ++i) {
208  if (NStr::FindNoCase(rna_name, kRRNASpacer[i]) != NPOS) {
209  m_Objs["[n] rRNA feature products contain 'internal', 'transcribed', or 'spacer'"].Add(*context.SeqFeatObjRef(feat));
210  }
211  }
212  }
213  }
214 }
215 
216 
217 // OVERLAPPING_CDS
218 
219 static bool StrandsMatch(ENa_strand strand1, ENa_strand strand2)
220 {
221  return (strand1 == eNa_strand_minus && strand2 == eNa_strand_minus) || (strand1 != eNa_strand_minus && strand2 != eNa_strand_minus);
222 }
223 
224 
// Special-case words: two product names that each contain one of these are
// treated as similar by ProductNamesAreSimilar().
// NOTE(review): the listing skips line 226; the kNumSimilarProductWords constant
// used below is presumably declared there - confirm against the real file.
225 static const char* kSimilarProductWords[] = { "transposase", "integrase" };
227 
// Words that, per the comment in ProductNamesAreSimilar(), make product names
// count as not similar.
// NOTE(review): the listing skips line 229; kNumIgnoreSimilarProductWords is
// presumably declared there - confirm.
228 static const char* kIgnoreSimilarProductWords[] = { "hypothetical protein", "phage", "predicted protein" };
230 
231 
// Decides whether two CDS product names should be treated as similar.
//  1. If each name contains (case-insensitively) one of kSimilarProductWords,
//     the names are similar.
//  2. Otherwise the "ignore" word list is consulted and may yield false.
//  3. Otherwise the names are similar only when they compare equal ignoring case.
// NOTE(review): the listing skips line 256, which presumably holds the `if`
// that tests the names against kIgnoreSimilarProductWords[i]; the loop below is
// therefore incomplete as shown - confirm against the real file.
232 static bool ProductNamesAreSimilar(const string& product1, const string& product2)
233 {
234  bool str1_has_similarity_word = false, str2_has_similarity_word = false;
235 
236  size_t i;
237  // if both product names contain one of the special case similarity words, the product names are similar.
238 
239  for (i = 0; i < kNumSimilarProductWords; i++) {
240  if (NPOS != NStr::FindNoCase(product1, kSimilarProductWords[i])) {
241  str1_has_similarity_word = true;
242  }
243 
244  if (NPOS != NStr::FindNoCase(product2, kSimilarProductWords[i])) {
245  str2_has_similarity_word = true;
246  }
247  }
248  if (str1_has_similarity_word && str2_has_similarity_word) {
249  return true;
250  }
251 
252  // otherwise, if one of the product names contains one of special ignore similarity
253  // words, the product names are not similar.
254 
255  for (i = 0; i < kNumIgnoreSimilarProductWords; i++) {
257  return false;
258  }
259  }
260 
// CompareNocase() == 0 means the names are equal ignoring case.
261  return !NStr::CompareNocase(product1, product2);
262 }
263 
264 
265 static bool ShouldIgnore(const string& product)
266 {
267  if (NStr::Find(product, "transposon") != string::npos || NStr::Find(product, "transposase") != string::npos) {
268  return true;
269  }
270  CString_constraint constraint;
271  constraint.SetMatch_text("ABC");
272  constraint.SetCase_sensitive(true);
273  constraint.SetWhole_word(true);
274  return constraint.Match(product);
275 }
276 
277 
// Note text searched for by HasOverlapNote() and appended by SetOverlapNote().
278 static const string kOverlappingCDSNoteText = "overlaps another CDS with the same product name";
279 
280 
281 static bool HasOverlapNote(const CSeq_feat& feat)
282 {
283  return feat.IsSetComment() && NStr::Find(feat.GetComment(), kOverlappingCDSNoteText) != string::npos;
284 }
285 
286 
287 static bool SetOverlapNote(CSeq_feat& feat)
288 {
289  if (feat.IsSetComment() && NStr::Find(feat.GetComment(), kOverlappingCDSNoteText) != string::npos) {
290  return false;
291  }
292  AddComment(feat, (string)kOverlappingCDSNoteText);
293  return true;
294 }
295 
296 
// Report keys for OVERLAPPING_CDS. kOverlap0 is the top-level key; kOverlap1 and
// kOverlap2 are its sub-keys for features with / without the overlap note.
297 static const char* kOverlap0 = "[n] coding region[s] overlap[S] another coding region with a similar or identical name.";
298 static const char* kOverlap1 = "[n] coding region[s] overlap[S] another coding region with a similar or identical name, but [has] the appropriate note text.";
299 static const char* kOverlap2 = "[n] coding region[s] overlap[S] another coding region with a similar or identical name and [does] not have the appropriate note text.";
300 
// Body of the product-name lookup helper called as GetProdName(feat, products,
// context) by OVERLAPPING_CDS.
// NOTE(review): the listing skips line 301, which presumably holds the function
// signature - confirm against the real file.
// `products` caches one entry per feature: the name from
// context.GetProdForFeature(), or kEmptyStr when that name is empty or rejected
// by ShouldIgnore().
302 {
303  if (products.find(feat) == products.end()) {
304  string name = context.GetProdForFeature(*feat);
305  products[feat] = name.empty() || ShouldIgnore(name) ? kEmptyStr : name;
306  }
307  return products[feat];
308 }
309 
310 
// Compares every pair of coding regions on a nucleotide sequence and reports
// both members of a pair when they are on matching strands, overlap, and have
// similar product names. Features already carrying the overlap note go under
// kOverlap1 without a fix; the others go under kOverlap2 with a self fix.
// NOTE(review): the listing skips line 316, which presumably declares the
// `products` cache passed to GetProdName() - confirm against the real file.
311 DISCREPANCY_CASE(OVERLAPPING_CDS, SEQUENCE, eDisc, "Overlapping CDs")
312 {
313  const CBioseq& bioseq = context.CurrentBioseq();
314  if (bioseq.IsSetInst() && bioseq.GetInst().IsNa()) {
315  const auto& cds = context.FeatCDS();
317  for (size_t i = 0; i < cds.size(); i++) {
318  const CSeq_loc& loc_i = cds[i]->GetLocation();
319  ENa_strand strand_i = loc_i.GetStrand();
320  for (size_t j = i + 1; j < cds.size(); j++) {
321  const CSeq_loc& loc_j = cds[j]->GetLocation();
// Skip pairs on non-matching strands or without any overlap.
322  if (!StrandsMatch(strand_i, loc_j.GetStrand()) || context.Compare(loc_i, loc_j) == sequence::eNoOverlap) {
323  continue;
324  }
325  string prod_i = GetProdName(cds[i], products, context);
// cds[i] has no usable product name, so no partner can match it: leave the inner loop.
326  if (prod_i.empty()) {
327  break;
328  }
329  string prod_j = GetProdName(cds[j], products, context);
330  if (prod_j.empty() || !ProductNamesAreSimilar(prod_i, prod_j)) {
331  continue;
332  }
333  bool has_note = HasOverlapNote(*cds[i]);
334  m_Objs[kOverlap0][has_note ? kOverlap1 : kOverlap2].Add(*context.SeqFeatObjRef(*cds[i], has_note ? CDiscrepancyContext::eFixNone : CDiscrepancyContext::eFixSelf));
335  has_note = HasOverlapNote(*cds[j]);
336  m_Objs[kOverlap0][has_note ? kOverlap1 : kOverlap2].Add(*context.SeqFeatObjRef(*cds[j], has_note ? CDiscrepancyContext::eFixNone : CDiscrepancyContext::eFixSelf));
337  }
338  }
339  }
340 }
341 
342 
343 DISCREPANCY_SUMMARIZE(OVERLAPPING_CDS)
344 {
345  if (m_Objs.Exist(kOverlap0)) {
346  m_Objs[kOverlap0].Promote();
347  }
348  xSummarize();
349 }
350 
351 
352 DISCREPANCY_AUTOFIX(OVERLAPPING_CDS)
353 {
354  const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
355  CRef<CSeq_feat> new_feat(new CSeq_feat());
356  new_feat->Assign(*sf);
357  if (SetOverlapNote(*new_feat)) {
358  context.ReplaceSeq_feat(*obj, *sf, *new_feat);
359  obj->SetFixed();
360  return CRef<CAutofixReport>(new CAutofixReport("OVERLAPPING_CDS: Set note[s] for [n] coding region[s]", 1));
361  }
362  return CRef<CAutofixReport>();
363 }
364 
365 
366 DISCREPANCY_CASE(PARTIAL_CDS_COMPLETE_SEQUENCE, FEAT, eDisc | eOncaller | eSubmitter | eSmart, "Partial CDSs in Complete Sequences")
367 {
368  for (const CSeq_feat& feat : context.GetFeat()) {
369  if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_cdregion) {
370  // leave if this CDS is not at least in some way marked as partial
371  if (!GET_FIELD_OR_DEFAULT(feat, Partial, false) && !(feat.IsSetLocation() && feat.GetLocation().IsPartialStart(eExtreme_Biological)) && !(feat.IsSetLocation() && feat.GetLocation().IsPartialStop(eExtreme_Biological))) {
372  continue;
373  }
374  // leave if we're not on a complete sequence
375  auto mol_info = context.GetMolinfo();
376  if (!mol_info || !FIELD_EQUALS(mol_info->GetMolinfo(), Completeness, CMolInfo::eCompleteness_complete)) {
377  continue;
378  }
379  // record the issue
380  m_Objs["[n] partial CDS[s] in complete sequence[s]"].Add(*context.SeqFeatObjRef(feat));
381  }
382  }
383 }
384 
385 
386 
// Reports non-pseudo RNA features for which no product can be found in the
// RNA-ref extension, via GetRnaProductName(), or in a "product" qualifier.
// The first switch exempts some subtypes from needing a product; the second
// looks for a product in the RNA-ref extension.
// NOTE(review): the listing skips lines 393, 400, 403 and 438-439, which
// presumably hold `case` labels of the two switches - the switches below are
// incomplete as shown; confirm against the real file.
387 DISCREPANCY_CASE(RNA_NO_PRODUCT, FEAT, eDisc | eOncaller | eSubmitter | eSmart, "Find RNAs without Products")
388 {
389  for (const CSeq_feat& feat : context.GetFeat()) {
390  if (feat.GetData().IsRna() && !context.IsPseudo(feat)) {
391  // for the given RNA subtype, see whether a product is required
392  switch (feat.GetData().GetSubtype()) {
// Features whose comment starts with "contains " or "may contain" are skipped.
394  CTempString comment(feat.IsSetComment() ? feat.GetComment() : kEmptyStr);
395  if (NStr::StartsWith(comment, "contains ", NStr::eNocase) || NStr::StartsWith(comment, "may contain", NStr::eNocase)) {
396  continue;
397  }
398  break;
399  }
401  // don't require products for tmRNA
402  continue;
404  // if ncRNA has a class other than "other", don't need a product
405  const CRNA_ref & rna_ref = feat.GetData().GetRna();
406  if (!FIELD_IS_SET_AND_IS(rna_ref, Ext, Gen)) {
407  // no RNA-gen, so no class, so needs a product
408  break;
409  }
410  const CTempString gen_class(
411  GET_STRING_FLD_OR_BLANK(rna_ref.GetExt().GetGen(), Class));
412  if (!gen_class.empty() && !NStr::EqualNocase(gen_class, "other")) {
413  // the class is set to something other than "other", so no
414  // explicit product is needed.
415  continue;
416  }
417  break;
418  }
419  default:
420  // other kinds always need a product
421  break;
422  }
423  const CRNA_ref & rna_ref = feat.GetData().GetRna();
424  if (!rna_ref.IsSetExt()) {
425  // will try other ways farther below
426  }
427  else {
428  const CRNA_ref::TExt & rna_ext = rna_ref.GetExt();
429  switch (rna_ext.Which()) {
430  case CRNA_ref::TExt::e_Name: {
431  const string & ext_name = rna_ref.GetExt().GetName();
432  if (!ext_name.empty() && !NStr::EqualNocase(ext_name, "ncRNA") && !NStr::EqualNocase(ext_name, "tmRNA") && !NStr::EqualNocase(ext_name, "misc_RNA")) {
433  // ext.name can be considered a product
434  continue;
435  }
436  break;
437  }
440  if (!rna_ref.GetRnaProductName().empty()) {
441  // found a product
442  continue;
443  }
444  break;
445  default:
446  _TROUBLE;
447  break;
448  }
449  }
450  // try to get it from a "product" qual
451  if (!feat.GetNamedQual("product").empty()) {
452  // gb-qual can be considered a product
453  continue;
454  }
455  // could not find a product
456  m_Objs["[n] RNA feature[s] [has] no product and [is] not pseudo"].Add(*context.SeqFeatObjRef(feat), false); // not unique
457  }
458  }
459 }
460 
461 
462 
463 // CONTAINED_CDS
464 
465 static bool HasContainedNote(const CSeq_feat& feat)
466 {
467  return feat.IsSetComment() && NStr::EqualNocase(feat.GetComment(), "completely contained in another CDS");
468 }
469 
470 
// Report keys for CONTAINED_CDS. kContained is the top-level key; the other
// three are its sub-keys: feature already has the note, pair on matching
// strands, pair on opposite strands.
471 static const char* kContained = "[n] coding region[s] [is] completely contained in another coding region.";
472 static const char* kContainedNote = "[n] coding region[s] [is] completely contained in another coding region, but have note.";
473 static const char* kContainedSame = "[n] coding region[s] [is] completely contained in another coding region on the same strand.";
474 static const char* kContainedOpps = "[n] coding region[s] [is] completely contained in another coding region, but on the opposite strand.";
475 
476 
477 DISCREPANCY_CASE(CONTAINED_CDS, SEQUENCE, eDisc | eSubmitter | eSmart | eFatal, "Contained CDs")
478 {
479  const CBioseq& bioseq = context.CurrentBioseq();
480  if (bioseq.IsSetInst() && bioseq.GetInst().IsNa()) {
481  const CSeqdesc* biosrc = context.GetBiosource();
482  if (!context.IsEukaryotic(biosrc ? &biosrc->GetSource() : nullptr)) {
483  const auto& cds = context.FeatCDS();
484  for (size_t i = 0; i < cds.size(); i++) {
485  const CSeq_loc& loc_i = cds[i]->GetLocation();
486  ENa_strand strand_i = loc_i.GetStrand();
487  for (size_t j = i + 1; j < cds.size(); j++) {
488  const CSeq_loc& loc_j = cds[j]->GetLocation();
489  sequence::ECompare compare = context.Compare(loc_j, loc_i);
490  if (compare == sequence::eContains || compare == sequence::eSame || compare == sequence::eContained) {
491  const char* strand = StrandsMatch(strand_i, loc_j.GetStrand()) ? kContainedSame : kContainedOpps;
492  bool has_note = HasContainedNote(*cds[i]);
493  new CSimpleTypeObject<string>(context.GetProdForFeature(*cds[i]));
494  bool autofix = compare == sequence::eContained && !has_note;
495  m_Objs[kContained][has_note ? kContainedNote : strand].Fatal().Add(*context.SeqFeatObjRef(*cds[i], autofix ? cds[i] : nullptr, autofix ? new CSimpleTypeObject<string>(context.GetProdForFeature(*cds[i])) : nullptr));
496  has_note = HasContainedNote(*cds[j]);
497  autofix = compare == sequence::eContains && !has_note;
498  m_Objs[kContained][has_note ? kContainedNote : strand].Fatal().Add(*context.SeqFeatObjRef(*cds[j], autofix ? cds[j] : nullptr, autofix ? new CSimpleTypeObject<string>(context.GetProdForFeature(*cds[j])) : nullptr));
499  }
500  }
501  }
502  }
503  }
504 }
505 
506 
507 DISCREPANCY_SUMMARIZE(CONTAINED_CDS)
508 {
509  if (m_Objs.Exist(kContained) && m_Objs[kContained].GetMap().size() == 1) {
510  m_ReportItems = m_Objs[kContained].Export(*this)->GetSubitems();
511  }
512  else {
513  xSummarize();
514  }
515 }
516 
517 
518 DISCREPANCY_AUTOFIX(CONTAINED_CDS)
519 {
520  const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
521  CRef<CSeq_feat> new_feat(new CSeq_feat());
522  new_feat->Assign(*sf);
523  new_feat->SetData().SetImp().SetKey("misc_feature");
524  const CSimpleTypeObject<string>* stringobj = dynamic_cast<const CSimpleTypeObject<string>*>(context.GetMore(*obj));
525  if (stringobj && !stringobj->Value.empty()) {
526  AddComment(*new_feat, stringobj->Value);
527  }
528  context.ReplaceSeq_feat(*obj, *sf, *new_feat);
529  obj->SetFixed();
530  return CRef<CAutofixReport>(new CAutofixReport("CONTAINED_CDS: Converted [n] coding region[s] to misc_feat", 1));
531 }
532 
533 
534 DISCREPANCY_CASE(ZERO_BASECOUNT, SEQUENCE, eDisc | eOncaller | eSubmitter | eSmart | eBig, "Zero Base Counts")
535 {
536  static const char* kMsg = "[n] sequence[s] [has] a zero basecount for a nucleotide";
537  const CBioseq& bioseq = context.CurrentBioseq();
538  if (bioseq.CanGetInst() && bioseq.GetInst().IsNa()) {
539  const CSeqSummary& sum = context.GetSeqSummary();
540  if (!sum.HasRef) {
541  if (!sum.A) {
542  m_Objs[kMsg]["[n] sequence[s] [has] no As"].Ext().Add(*context.BioseqObjRef());
543  }
544  if (!sum.C) {
545  m_Objs[kMsg]["[n] sequence[s] [has] no Cs"].Ext().Add(*context.BioseqObjRef());
546  }
547  if (!sum.G) {
548  m_Objs[kMsg]["[n] sequence[s] [has] no Gs"].Ext().Add(*context.BioseqObjRef());
549  }
550  if (!sum.T) {
551  m_Objs[kMsg]["[n] sequence[s] [has] no Ts"].Ext().Add(*context.BioseqObjRef());
552  }
553  }
554  }
555 }
556 
557 
558 // NONWGS_SETS_PRESENT
559 
// Reports Bioseq-sets whose class is one of the eco/mut/phy/pop set types.
// NOTE(review): the listing skips lines 565-568, which presumably hold the
// `case` labels for those set classes - the switch below is incomplete as
// shown; confirm against the real file.
560 DISCREPANCY_CASE(NONWGS_SETS_PRESENT, SEQ_SET, eDisc, "Eco, mut, phy or pop sets present")
561 {
562  const CBioseq_set& set = context.CurrentBioseq_set();
563  if (set.IsSetClass()) {
564  switch (set.GetClass()) {
569  // non-WGS set found
570  m_Objs["[n] set[s] [is] of type eco, mut, phy or pop"].Add(*context.BioseqSetObjRef(true));
571  break;
572  default:
573  break;
574  }
575  }
576 }
577 
578 
// Autofix for NONWGS_SETS_PRESENT: obtains an edit handle for the reported set,
// marks the object fixed and returns a one-item report.
// NOTE(review): the listing skips line 584, which presumably performs the class
// change announced in the report text ("Set class to GenBank") - confirm.
// NOTE(review): `bss` is dereferenced without a null check after dynamic_cast.
579 DISCREPANCY_AUTOFIX(NONWGS_SETS_PRESENT)
580 {
581  const CBioseq_set* bss = dynamic_cast<const CBioseq_set*>(context.FindObject(*obj));
582  CBioseq_set_Handle set_h = context.GetBioseq_setHandle(*bss);
583  CBioseq_set_EditHandle set_eh(set_h);
585  obj->SetFixed();
586  return CRef<CAutofixReport>(new CAutofixReport("NONWGS_SETS_PRESENT: Set class to GenBank for [n] set[s]", 1));
587 }
588 
589 
590 //NO_ANNOTATION
591 
592 DISCREPANCY_CASE(NO_ANNOTATION, SEQUENCE, eDisc | eOncaller | eSubmitter | eSmart | eBig, "No annotation")
593 {
594  auto all_feat = context.GetAllFeat();
595  if (all_feat.begin() == all_feat.end()) {
596  m_Objs["[n] bioseq[s] [has] no features"].Add(*context.BioseqObjRef());
597  }
598 }
599 
600 
601 DISCREPANCY_CASE(LONG_NO_ANNOTATION, SEQUENCE, eDisc | eOncaller | eSubmitter | eSmart | eBig, "No annotation for LONG sequence")
602 {
603  const int kSeqLength = 5000;
604  const CBioseq& bioseq = context.CurrentBioseq();
605  if (bioseq.IsNa() && bioseq.IsSetLength() && bioseq.GetLength() > kSeqLength) {
606  auto all_feat = context.GetAllFeat();
607  if (all_feat.begin() == all_feat.end()) {
608  m_Objs["[n] bioseq[s] [is] longer than 5000nt and [has] no features"].Add(*context.BioseqObjRef());
609  }
610  }
611 }
612 
613 
614 DISCREPANCY_CASE(VERY_LONG_NO_ANNOTATION, SEQUENCE, eDisc | eOncaller | eSubmitter | eSmart | eBig, "No annotation for LONG sequence")
615 {
616  const int kSeqLength = 50000;
617  const CBioseq& bioseq = context.CurrentBioseq();
618  if (bioseq.IsNa() && bioseq.IsSetLength() && bioseq.GetLength() > kSeqLength) {
619  auto all_feat = context.GetAllFeat();
620  if (all_feat.begin() == all_feat.end()) {
621  m_Objs["[n] bioseq[s] [is] longer than 50000nt and [has] no features"].Add(*context.BioseqObjRef());
622  }
623  }
624 }
625 
626 
627 // POSSIBLE_LINKER
628 
629 DISCREPANCY_CASE(POSSIBLE_LINKER, SEQUENCE, eOncaller, "Detect linker sequence after poly-A tail")
630 {
631  const CBioseq& bioseq = context.CurrentBioseq();
632  if (bioseq.CanGetInst() && !bioseq.GetInst().IsAa() && bioseq.GetInst().GetLength() >= 30) {
633  const CSeqdesc* molinfo = context.GetMolinfo();
634  if (molinfo && molinfo->GetMolinfo().IsSetBiomol() && molinfo->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_mRNA) {
635  CSeqVector seq_vec(bioseq, &context.GetScope(), CBioseq_Handle::eCoding_Iupac, eNa_strand_plus);
636  static const size_t TAIL = 30;
637  string seq_data(kEmptyStr);
638  seq_vec.GetSeqData(bioseq.GetInst().GetLength() - TAIL, bioseq.GetInst().GetLength(), seq_data);
639  size_t tail_len = 0;
640  size_t cut = 0;
641  for (size_t i = 0; i < seq_data.length(); i++) {
642  if (seq_data[i] == 'A' || seq_data[i] == 'a') {
643  tail_len++;
644  }
645  else {
646  if (tail_len > 20) {
647  cut = i;
648  }
649  tail_len = 0;
650  }
651  }
652  if (cut) {
653  cut = TAIL - cut;
654  m_Objs["[n] bioseq[s] may have linker sequence after the poly-A tail"].Add(*context.BioseqObjRef(cut > 0 ? CDiscrepancyContext::eFixSelf : CDiscrepancyContext::eFixNone, cut ? new CSimpleTypeObject<size_t>(cut) : nullptr));
655  }
656  }
657  }
658 }
659 
660 
661 DISCREPANCY_AUTOFIX(POSSIBLE_LINKER)
662 {
663  const CBioseq* seq = dynamic_cast<const CBioseq*>(context.FindObject(*obj));
664  size_t cut_from_end = dynamic_cast<const CSimpleTypeObject<size_t>*>(obj->GetMoreInfo().GetPointer())->Value;
665  CBioseq_EditHandle besh(context.GetBioseqHandle(*seq));
666  SSeqMapSelector selector;
667  selector.SetFlags(CSeqMap::fFindData);
668  CSeqMap_I seqmap_i(besh, selector);
669  size_t start = 0;
670  size_t stop = besh.GetInst_Length() - cut_from_end;
671  while (seqmap_i) {
672  TSeqPos len = seqmap_i.GetLength();
673  if (start < stop && start + len > stop) {
674  string seq_in;
675  seqmap_i.GetSequence(seq_in, CSeqUtil::e_Iupacna);
676  string seq_out = seq_in.substr(0, stop - start);
677  seqmap_i.SetSequence(seq_out, CSeqUtil::e_Iupacna, seqmap_i.GetData().Which());
678  ++seqmap_i;
679  }
680  else if (start >= stop) {
681  seqmap_i = seqmap_i.Remove();
682  }
683  else {
684  ++seqmap_i;
685  }
686  start += len;
687  }
688  obj->SetFixed();
689  return CRef<CAutofixReport>(new CAutofixReport("POSSIBLE_LINKER: [n] sequence[s] trimmed", 1));
690 }
691 
692 
693 // ORDERED_LOCATION
694 
695 DISCREPANCY_CASE(ORDERED_LOCATION, FEAT, eDisc | eOncaller | eSmart, "Location is ordered (intervals interspersed with gaps)")
696 {
697  for (const CSeq_feat& feat : context.GetFeat()) {
698  if (feat.IsSetLocation()) {
699  CSeq_loc_CI loc_ci(feat.GetLocation(), CSeq_loc_CI::eEmpty_Allow);
700  for (; loc_ci; ++loc_ci) {
701  if (loc_ci.GetEmbeddingSeq_loc().IsNull()) {
702  m_Objs["[n] feature[s] [has] ordered location[s]"].Add(*context.SeqFeatObjRef(feat, &feat));
703  break;
704  }
705  }
706  }
707  }
708 }
709 
710 
711 DISCREPANCY_AUTOFIX(ORDERED_LOCATION)
712 {
713  const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
714  CSeq_loc_I new_loc_creator(*SerialClone(sf->GetLocation()));
715  while (new_loc_creator) {
716  if (new_loc_creator.GetEmbeddingSeq_loc().IsNull()) {
717  new_loc_creator.Delete();
718  }
719  else {
720  ++new_loc_creator;
721  }
722  }
723  if (!new_loc_creator.HasChanges()) {
724  return CRef<CAutofixReport>();
725  }
726  CRef<CSeq_loc> new_seq_feat_loc = new_loc_creator.MakeSeq_loc(CSeq_loc_I::eMake_PreserveType);
727  CRef<CSeq_feat> new_feat(SerialClone(*sf));
728  new_feat->SetLocation(*new_seq_feat_loc);
729  context.ReplaceSeq_feat(*obj, *sf, *new_feat);
730  obj->SetFixed();
731  return CRef<CAutofixReport>(new CAutofixReport("ORDERED_LOCATION: [n] features with ordered locations fixed", 1));
732 }
733 
734 
735 // MISSING_LOCUS_TAGS
736 
737 DISCREPANCY_CASE(MISSING_LOCUS_TAGS, SEQUENCE, eDisc | eSubmitter | eSmart | eFatal, "Missing locus tags")
738 {
739  const CBioseq& bioseq = context.CurrentBioseq();
740  if (bioseq.IsSetInst() && bioseq.GetInst().IsNa()) {
741  for (const CSeq_feat* feat : context.FeatGenes()) {
742  const CGene_ref& gene_ref = feat->GetData().GetGene();
743  if (!gene_ref.IsSetPseudo() || !gene_ref.GetPseudo()) {
744  if (!gene_ref.IsSetLocus_tag() || NStr::IsBlank(gene_ref.GetLocus_tag())) {
745  m_Objs["[n] gene[s] [has] no locus tag[s]."].Fatal().Add(*context.SeqFeatObjRef(*feat));
746  }
747  else if (!m_Objs.Exist(kEmptyStr)) {
748  m_Objs[kEmptyStr].Incr();
749  }
750  }
751  }
752  }
753 }
754 
755 
756 DISCREPANCY_SUMMARIZE(MISSING_LOCUS_TAGS)
757 {
758  if (m_Objs.Exist(kEmptyStr)) {
759  m_Objs.GetMap().erase(kEmptyStr);
760  xSummarize();
761  }
762 }
763 
764 
765 // NO_LOCUS_TAGS
766 
767 DISCREPANCY_CASE(NO_LOCUS_TAGS, FEAT, eDisc | eSubmitter | eSmart | eFatal, "No locus tags at all")
768 {
769  for (const CSeq_feat& feat : context.GetFeat()) {
770  if (feat.IsSetData() && feat.GetData().IsGene()) {
771  const CGene_ref& gene_ref = feat.GetData().GetGene();
772  if (gene_ref.IsSetPseudo() && gene_ref.GetPseudo()) {
773  continue;
774  }
775  if (!gene_ref.IsSetLocus_tag() || NStr::IsBlank(gene_ref.GetLocus_tag())) {
776  m_Objs["None of [n] gene[s] has locus tag."].Fatal().Add(*context.SeqFeatObjRef(feat));
777  }
778  else if (!m_Objs.Exist(kEmptyStr)) {
779  m_Objs[kEmptyStr].Incr();
780  }
781  }
782  }
783 }
784 
785 
786 DISCREPANCY_SUMMARIZE(NO_LOCUS_TAGS)
787 {
788  if (!m_Objs.Exist(kEmptyStr)) {
789  xSummarize();
790  }
791 }
792 
793 
794 // INCONSISTENT_LOCUS_TAG_PREFIX
795 
796 DISCREPANCY_CASE(INCONSISTENT_LOCUS_TAG_PREFIX, FEAT, eDisc | eSubmitter | eSmart, "Inconsistent locus tag prefix")
797 {
798  for (const CSeq_feat& feat : context.GetFeat()) {
799  if (feat.IsSetData() && feat.GetData().Which() == CSeqFeatData::e_Gene) {
800  const CGene_ref& gene_ref = feat.GetData().GetGene();
801  // Skip pseudo-genes
802  if (gene_ref.IsSetPseudo() && gene_ref.GetPseudo() == true) {
803  continue;
804  }
805  // Skip missing locus tags
806  if (!gene_ref.IsSetLocus_tag()) {
807  continue;
808  }
809  // Report on good locus tags - are they consistent?
810  string locus_tag = gene_ref.GetLocus_tag();
811  if (!locus_tag.empty() && !context.IsBadLocusTagFormat(locus_tag)) {
812  // Store each unique prefix in a bin
813  // If there is more than 1 bin, the prefixes are inconsistent
814  string prefix;
815  string tagvalue;
816  NStr::SplitInTwo(locus_tag, "_", prefix, tagvalue);
817  stringstream ss;
818  ss << "[n] feature[s] [has] locus tag prefix [(]" << prefix << ".";
819  m_Objs[ss.str()].Add(*context.SeqFeatObjRef(feat));
820  }
821  }
822  }
823 }
824 
825 
826 DISCREPANCY_SUMMARIZE(INCONSISTENT_LOCUS_TAG_PREFIX)
827 {
828  // If there is more than 1 bin, the prefixes are inconsistent
829  if (m_Objs.GetMap().size() > 1) {
830  xSummarize();
831  }
832 }
833 
834 
// Top-level report key shared by the INCONSISTENT_MOLTYPES case and its summarize step.
835 static const string kInconsistent_Moltype = "[n] sequences have inconsistent moltypes";
836 
837 DISCREPANCY_CASE(INCONSISTENT_MOLTYPES, SEQUENCE, eDisc | eOncaller | eSmart, "Inconsistent molecule types")
838 {
839  const CBioseq& bioseq = context.CurrentBioseq();
840  if (bioseq.CanGetInst() && !bioseq.GetInst().IsAa()) {
841  string moltype;
842  const CSeqdesc* molinfo = context.GetMolinfo();
843  if (molinfo && molinfo->GetMolinfo().IsSetBiomol()) {
844  moltype = CMolInfo::GetBiomolName(molinfo->GetMolinfo().GetBiomol());
845  }
846  if (NStr::IsBlank(moltype)) {
847  moltype = "genomic";
848  }
849  if (bioseq.IsSetInst() && bioseq.GetInst().IsSetMol()) {
850  moltype += string(" ") + CSeq_inst::GetMoleculeClass(bioseq.GetInst().GetMol());
851  }
852  m_Objs[kInconsistent_Moltype].Add(*context.BioseqObjRef());
853  m_Objs[kInconsistent_Moltype]["[n] sequence[s] [has] moltype " + moltype].Ext().Add(*context.BioseqObjRef());
854  }
855 }
856 
857 
858 DISCREPANCY_SUMMARIZE(INCONSISTENT_MOLTYPES)
859 {
860  // If there is more than 1 key, the moltypes are inconsistent
861  if (m_Objs[kInconsistent_Moltype].GetMap().size() > 1) {
862  xSummarize();
863  }
864 }
865 
866 
867 // BAD_LOCUS_TAG_FORMAT
868 
// Reports non-pseudo genes whose non-empty locus tag is rejected by
// context.IsBadLocusTagFormat(). The whole sequence is skipped when any of its
// Seq-ids is of one of the id types listed in the switch below.
// NOTE(review): the listing skips line 878, which presumably holds one more
// `case CSeq_id::e_...:` label - confirm against the real file.
869 DISCREPANCY_CASE(BAD_LOCUS_TAG_FORMAT, SEQUENCE, eDisc | eSubmitter | eSmart, "Bad locus tag format")
870 {
871  const CBioseq& bioseq = context.CurrentBioseq();
872  if (bioseq.IsSetId()) {
873  for (const auto& id : bioseq.GetId()) {
874  switch (id->Which()) {
875  case CSeq_id::e_Genbank:
876  case CSeq_id::e_Embl:
877  case CSeq_id::e_Pir:
879  case CSeq_id::e_Other:
880  case CSeq_id::e_Ddbj:
881  case CSeq_id::e_Prf:
// Any id of these types exempts the sequence from this check.
882  return;
883  default:
884  break;
885  }
886  }
887  }
888  const auto& genes = context.FeatGenes();
889  for (const CSeq_feat* gene : genes) {
890  const CGene_ref& gene_ref = gene->GetData().GetGene();
// Pseudo genes and genes without a locus tag are not checked.
891  if (gene_ref.IsSetPseudo() && gene_ref.GetPseudo() == true) {
892  continue;
893  }
894  if (!gene_ref.IsSetLocus_tag()) {
895  continue;
896  }
897  string locus_tag = gene_ref.GetLocus_tag();
898  if (!locus_tag.empty() && context.IsBadLocusTagFormat(locus_tag)) {
899  m_Objs["[n] locus tag[s] [is] incorrectly formatted."].Fatal().Add(*context.SeqFeatObjRef(*gene));
900  }
901  }
902 }
903 
904 
905 // SEGSETS_PRESENT
906 
907 DISCREPANCY_CASE(SEGSETS_PRESENT, SEQ_SET, eDisc | eSmart | eFatal, "Segsets present")
908 {
909  const CBioseq_set& set = context.CurrentBioseq_set();
910  if (set.IsSetClass() && set.GetClass() == CBioseq_set::eClass_segset) {
911  m_Objs["[n] segset[s] [is] present"].Add(*context.BioseqSetObjRef());
912  }
913 }
914 
915 
916 // QUALITY_SCORES
917 
918 DISCREPANCY_CASE(QUALITY_SCORES, SEQUENCE, eDisc | eSmart, "Check for quality scores")
919 {
920  const CBioseq& bioseq = context.CurrentBioseq();
921  if (bioseq.IsSetInst() && bioseq.IsNa()) {
922  m_Objs["t"].Incr(); // total num
923  if (bioseq.CanGetAnnot()) {
924  for (const auto& ann : bioseq.GetAnnot()) {
925  if (ann->IsGraph()) {
926  m_Objs["q"].Incr();
927  return;
928  }
929  }
930  }
931  }
932 }
933 
934 
935 DISCREPANCY_SUMMARIZE(QUALITY_SCORES)
936 {
937  size_t q = m_Objs["q"].GetCount();
938  size_t n = m_Objs["t"].GetCount() - q;
939  if (q && n) {
940  CReportNode ret;
941  ret["Quality scores are missing on some(" + to_string(n) + ") sequences"];
942  m_ReportItems = ret.Export(*this)->GetSubitems();
943  }
944 }
945 
946 
947 // BACTERIA_SHOULD_NOT_HAVE_MRNA
948 
949 DISCREPANCY_CASE(BACTERIA_SHOULD_NOT_HAVE_MRNA, FEAT, eDisc | eOncaller | eSubmitter | eSmart, "Bacterial sequences should not have mRNA features")
950 {
951  const CSeqdesc* biosrc = context.GetBiosource();
952  if (biosrc && context.IsBacterial(&biosrc->GetSource())) {
953  for (const CSeq_feat& feat : context.GetFeat()) {
954  if (feat.IsSetData() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA) {
955  m_Objs["[n] bacterial sequence[s] [has] mRNA features"].Add(*context.SeqFeatObjRef(feat));
956  }
957  }
958  }
959 }
960 
961 
962 // BAD_BGPIPE_QUALS
963 
964 static const string kDiscMessage = "[n] feature[s] contain[S] invalid BGPIPE qualifiers";
965 
966 DISCREPANCY_CASE(BAD_BGPIPE_QUALS, SEQUENCE, eDisc | eSmart, "Bad BGPIPE qualifiers")
967 {
968  if (!context.IsRefseq() && context.IsBGPipe()) {
969  for (const CSeq_feat& feat : context.GetAllFeat()) {
970  if (STRING_FIELD_NOT_EMPTY(feat, Except_text)) {
971  if (feat.GetExcept_text() == "ribosomal slippage" && feat.IsSetComment() && feat.GetComment().find("programmed frameshift") != string::npos) {
972  continue;
973  }
974  m_Objs[kDiscMessage].Add(*context.SeqFeatObjRef(feat));
975  continue;
976  }
977  if (FIELD_IS_SET_AND_IS(feat, Data, Cdregion)) {
978  bool skip = false;
979  const CCdregion & cdregion = feat.GetData().GetCdregion();
980  if (RAW_FIELD_IS_EMPTY_OR_UNSET(cdregion, Code_break)) {
981  continue;
982  }
983  if (GET_STRING_FLD_OR_BLANK(feat, Comment) == "ambiguity in stop codon") {
984  // check if any code break is a stop codon
985  FOR_EACH_CODEBREAK_ON_CDREGION(code_break_it, cdregion) {
986  const CCode_break & code_break = **code_break_it;
987  if (FIELD_IS_SET_AND_IS(code_break, Aa, Ncbieaa) && code_break.GetAa().GetNcbieaa() == 42 /* *:Stop codon */) {
988  skip = true;
989  break;
990  }
991  }
992  if (!skip) {
993  m_Objs[kDiscMessage].Add(*context.SeqFeatObjRef(feat));
994  }
995  continue;
996  }
997  FOR_EACH_CODEBREAK_ON_CDREGION(code_break_it, cdregion) {
998  const CCode_break & code_break = **code_break_it;
999  if (FIELD_IS_SET_AND_IS(code_break, Aa, Ncbieaa) && code_break.GetAa().GetNcbieaa() == 85 /* U:Sec */) {
1000  skip = true;
1001  break;
1002  }
1003  }
1004  if (!skip) {
1005  m_Objs[kDiscMessage].Add(*context.SeqFeatObjRef(feat));
1006  }
1007  }
1008  }
1009  }
1010 }
1011 
1012 
1013 // GENE_PRODUCT_CONFLICT
// Specialization of the per-test private data holder for the
// GENE_PRODUCT_CONFLICT test.
// NOTE(review): the embedded listing numbers jump from 1017 to 1019, so a
// member declaration appears to be missing from this extract.  The
// GENE_PRODUCT_CONFLICT code reads m_private.m_GeneLocusMap (a
// TGeneLocusMap), which is presumably declared here -- confirm against the
// repository.
1014 template<>
1015 class CDiscrepancyPrivateData<eTestNames::GENE_PRODUCT_CONFLICT>
1016 {
1017 public:
1019 };
1020 
1021 DISCREPANCY_CASE(GENE_PRODUCT_CONFLICT, SEQUENCE, eDisc | eSubmitter | eSmart, "Gene Product Conflict")
1022 {
1023  for (const CSeq_feat& feat : context.GetAllFeat()) {
1024  if (feat.IsSetData() && feat.GetData().IsCdregion()) {
1025  CConstRef<CSeq_feat> gene_feat(context.GetGeneForFeature(feat));
1026  if (gene_feat && gene_feat->IsSetData() && gene_feat->GetData().IsGene()) {
1027  const CGene_ref& gene = gene_feat->GetData().GetGene();
1028  if (gene.IsSetLocus()) {
1029  TGeneLocusMap& genes = m_private.m_GeneLocusMap;
1030  const string& locus = gene.GetLocus();
1031  string product = context.GetProdForFeature(feat);
1032  genes[locus].push_back(make_pair(context.SeqFeatObjRef(feat), product));
1033  }
1034  }
1035  }
1036  }
1037 }
1038 
1039 
1040 DISCREPANCY_SUMMARIZE(GENE_PRODUCT_CONFLICT)
1041 {
1042  TGeneLocusMap& genes = m_private.m_GeneLocusMap;
1043  for (auto& gene : genes) {
1044  if (gene.second.size() > 1) {
1045  TGenesList::const_iterator cur_gene = gene.second.cbegin();
1046  const string& product = cur_gene->second;
1047  bool diff = false;
1048  for (++cur_gene; cur_gene != gene.second.cend(); ++cur_gene) {
1049  const string& cur_product = cur_gene->second;
1050  if (product != cur_product) {
1051  diff = true;
1052  break;
1053  }
1054  }
1055  if (diff) {
1056  string sub = "[n] coding regions have the same gene name (" + gene.first + ") as another coding region but a different product";
1057  for (auto& rec : gene.second) {
1058  m_Objs["[n] coding region[s] [has] the same gene name as another coding region but a different product"][sub].Ext().Add(*rec.first, false);
1059  }
1060  }
1061  }
1062  }
1063  genes.clear();
1064  xSummarize();
1065 }
1066 
1067 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CBioseq_EditHandle –.
CBioseq_set_EditHandle –.
CBioseq_set_Handle –.
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
bool IsSetLength(void) const
Definition: Bioseq.cpp:355
bool IsNa(void) const
Definition: Bioseq.cpp:345
CCdregion –.
Definition: Cdregion.hpp:66
CCode_break –.
Definition: Code_break.hpp:66
bool IsSkippable(void) const
Definition: Dbtag.cpp:281
static string GetBiomolName(CMolInfo::TBiomol biomol)
Definition: MolInfo.cpp:116
@RNA_ref.hpp User-defined methods of the data storage class.
Definition: RNA_ref.hpp:54
string GetRnaProductName(void) const
Definition: RNA_ref.cpp:145
virtual vector< CRef< CReportItem > > GetSubitems() const =0
bool empty() const
TNodeMap & GetMap()
CRef< CReportItem > Export(CDiscrepancyCore &test, bool unique=true) const
Non-const iterator over CSeqMap (allows to edit the sequence).
Definition: seq_map_ci.hpp:407
@ e_Iupacna
Definition: sequtil.hpp:47
CSeqVector –.
Definition: seq_vector.hpp:65
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
static bool IsAa(EMol mol)
Definition: Seq_inst.hpp:99
static string GetMoleculeClass(EMol mol)
Definition: Seq_inst.cpp:72
static bool IsNa(EMol mol)
Definition: Seq_inst.hpp:90
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:593
bool Match(const CMatchString &str) const
void SetMatch_text(const TMatch_text &value)
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
size_type size() const
Definition: map.hpp:148
const_iterator begin() const
Definition: map.hpp:151
const_iterator end() const
Definition: map.hpp:152
bool empty() const
Definition: map.hpp:149
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
Definition: set.hpp:45
@ eFatal
@ eBig
@ eDisc
@ eOncaller
@ eSubmitter
@ eSmart
USING_SCOPE(objects)
static const char * kRRNASpacer[]
static const char * kContainedOpps
static bool HasOverlapNote(const CSeq_feat &feat)
static bool HasContainedNote(const CSeq_feat &feat)
static const char * kIgnoreSimilarProductWords[]
static const string kDiscMessage
static bool SetOverlapNote(CSeq_feat &feat)
static const char * kContainedNote
static const char * kOverlap2
static const char * kOverlap1
static const char * kSimilarProductWords[]
static const size_t kNumIgnoreSimilarProductWords
static const string kInconsistent_Moltype
static bool ShouldIgnore(const string &product)
static const char * kOverlap0
static bool StrandsMatch(ENa_strand strand1, ENa_strand strand2)
static const char * kContainedSame
static const char * kContained
static const CSeq_id * GetProteinId(const CBioseq &seq)
static const size_t kRRNASpacer_len
static const string kOverlappingCDSNoteText
static bool ProductNamesAreSimilar(const string &product1, const string &product2)
static string GetProdName(const CSeq_feat *feat, map< const CSeq_feat *, string > &products, CDiscrepancyContext &context)
static const size_t kNumSimilarProductWords
#define DISCREPANCY_AUTOFIX(name)
#define DISCREPANCY_CASE(name, type, group, descr)
#define DISCREPANCY_SUMMARIZE(name)
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
constexpr size_t ArraySize(const Element(&)[Size])
Definition: ncbimisc.hpp:1532
string
Definition: cgiapp.hpp:690
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
C * SerialClone(const C &src)
Create on heap a clone of the source object.
Definition: serialbase.hpp:512
CRef< CSeq_loc > MakeSeq_loc(EMakeType make_type=eMake_CompactType) const
return constructed CSeq_loc with all changes
Definition: Seq_loc.cpp:2946
void Delete(void)
Delete current element, and make iterator to point to the next element.
Definition: Seq_loc.cpp:2724
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
bool HasChanges(void) const
return true of any part was changed since initialization
Definition: Seq_loc.cpp:2706
const CSeq_loc & GetEmbeddingSeq_loc(void) const
Get the nearest seq-loc containing the current range.
Definition: Seq_loc.cpp:2573
@ eEmpty_Allow
ignore empty locations
Definition: Seq_loc.hpp:458
@ eMake_PreserveType
use most compact Seq-loc type (default)
Definition: Seq_loc.hpp:599
ECompare
@ eContains
First CSeq_loc contains second.
@ eSame
CSeq_locs contain each other.
@ eContained
First CSeq_loc contained by second.
@ eNoOverlap
CSeq_locs do not overlap or abut.
void SetClass(TClass v) const
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
CSeqMap_I & Remove(void)
Remove current segment.
Definition: seq_map_ci.cpp:893
void GetSequence(string &buffer, CSeqUtil::ECoding buffer_coding) const
Get current sequence as a string with the selected encoding.
Definition: seq_map_ci.cpp:901
const CSeq_data & GetData(void) const
will allow only regular data segments (whole, plus strand)
Definition: seq_map_ci.cpp:267
SSeqMapSelector & SetFlags(TFlags flags)
Select segment type(s)
Definition: seq_map_ci.hpp:179
void SetSequence(const string &buffer, CSeqUtil::ECoding buffer_coding, CSeq_data::E_Choice seq_data_coding)
Set sequence data.
Definition: seq_map_ci.cpp:937
TSeqPos GetLength(void) const
return length of current segment
Definition: seq_map_ci.hpp:672
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
@ fFindData
Definition: seq_map.hpp:129
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define kEmptyStr
Definition: ncbistr.hpp:123
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2984
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2882
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
Definition: tempstr.hpp:334
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5406
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3545
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5347
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
bool IsSetPseudo(void) const
pseudogene Check if a value has been assigned to Pseudo data member.
Definition: Gene_ref_.hpp:681
bool IsSetLocus_tag(void) const
systematic gene name (e.g., MI0001, ORF0069) Check if a value has been assigned to Locus_tag data mem...
Definition: Gene_ref_.hpp:781
bool IsSetLocus(void) const
Official gene symbol Check if a value has been assigned to Locus data member.
Definition: Gene_ref_.hpp:493
const TLocus_tag & GetLocus_tag(void) const
Get the Locus_tag member data.
Definition: Gene_ref_.hpp:793
const TLocus & GetLocus(void) const
Get the Locus member data.
Definition: Gene_ref_.hpp:505
TPseudo GetPseudo(void) const
Get the Pseudo member data.
Definition: Gene_ref_.hpp:706
void SetCase_sensitive(TCase_sensitive value)
Assign a value to Case_sensitive data member.
void SetWhole_word(TWhole_word value)
Assign a value to Whole_word data member.
E_Choice Which(void) const
Which variant is currently selected.
Definition: RNA_ref_.hpp:449
bool IsSetExt(void) const
generic fields for ncRNA, tmRNA, miscRNA Check if a value has been assigned to Ext data member.
Definition: RNA_ref_.hpp:604
const TGen & GetGen(void) const
Get the variant data.
Definition: RNA_ref_.cpp:156
const TName & GetName(void) const
Get the variant data.
Definition: RNA_ref_.hpp:484
const TExt & GetExt(void) const
Get the Ext member data.
Definition: RNA_ref_.hpp:616
@ e_Name
for naming "other" type
Definition: RNA_ref_.hpp:134
bool IsSetComment(void) const
Check if a value has been assigned to Comment data member.
Definition: Seq_feat_.hpp:1037
bool IsSetData(void) const
the specific data Check if a value has been assigned to Data data member.
Definition: Seq_feat_.hpp:913
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
bool IsGene(void) const
Check if variant Gene is selected.
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
const TAa & GetAa(void) const
Get the Aa member data.
TNcbieaa GetNcbieaa(void) const
Get the variant data.
const TComment & GetComment(void) const
Get the Comment member data.
Definition: Seq_feat_.hpp:1049
const TGene & GetGene(void) const
Get the variant data.
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
bool IsGeneral(void) const
Check if variant General is selected.
Definition: Seq_id_.hpp:877
const TGeneral & GetGeneral(void) const
Get the variant data.
Definition: Seq_id_.cpp:369
bool IsNull(void) const
Check if variant Null is selected.
Definition: Seq_loc_.hpp:504
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Prf
PRF SEQDB.
Definition: Seq_id_.hpp:108
@ eClass_pop_set
population study
@ eClass_phy_set
phylogenetic study
@ eClass_mut_set
set of mutations
@ eClass_eco_set
ecological sample study
@ eClass_genbank
converted genbank
@ eClass_segset
segmented sequence + parts
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
bool CanGetAnnot(void) const
Check if it is safe to call GetAnnot method.
Definition: Bioseq_.hpp:360
bool IsSetMol(void) const
Check if a value has been assigned to Mol data member.
Definition: Seq_inst_.hpp:593
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
bool IsSetBiomol(void) const
Check if a value has been assigned to Biomol data member.
Definition: MolInfo_.hpp:422
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:366
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
bool IsSetInst(void) const
the sequence data Check if a value has been assigned to Inst data member.
Definition: Bioseq_.hpp:324
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
TMol GetMol(void) const
Get the Mol member data.
Definition: Seq_inst_.hpp:612
TBiomol GetBiomol(void) const
Get the Biomol member data.
Definition: MolInfo_.hpp:447
bool IsSetId(void) const
equivalent identifiers Check if a value has been assigned to Id data member.
Definition: Bioseq_.hpp:278
const TMolinfo & GetMolinfo(void) const
Get the variant data.
Definition: Seqdesc_.cpp:588
bool CanGetInst(void) const
Check if it is safe to call GetInst method.
Definition: Bioseq_.hpp:330
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_data_.hpp:475
@ eCompleteness_complete
complete biological entity
Definition: MolInfo_.hpp:156
#define TAIL()
int i
yy_size_t n
int len
void AddComment(CSeq_feat &feat, const string &comment)
Definition: utils.cpp:44
const struct ncbi::grid::netcache::search::fields::SIZE size
GenericValue< UTF8<> > Value
GenericValue with UTF8 encoding.
Definition: document.h:2107
The Object manager core.
#define FOR_EACH_CODEBREAK_ON_CDREGION(Itr, Var)
FOR_EACH_CODEBREAK_ON_CDREGION EDIT_EACH_CODEBREAK_ON_CDREGION.
#define FIELD_IS_SET_AND_IS(Var, Fld, Chs)
FIELD_IS_SET_AND_IS base macro.
#define RAW_FIELD_IS_EMPTY_OR_UNSET(Var, Fld)
RAW_FIELD_IS_EMPTY_OR_UNSET macro.
#define GET_FIELD_OR_DEFAULT(Var, Fld, Dflt)
GET_FIELD_OR_DEFAULT base macro.
#define FIELD_EQUALS(Var, Fld, Value)
FIELD_EQUALS base macro.
#define STRING_FIELD_NOT_EMPTY(Var, Fld)
STRING_FIELD_NOT_EMPTY base macro.
#define GET_STRING_FLD_OR_BLANK(Var, Fld)
GET_STRING_FLD_OR_BLANK base macro.
vector< pair< size_t, size_t > > NRuns
Selector used in CSeqMap methods returning iterators.
Definition: seq_map_ci.hpp:113
#define _TROUBLE
#define _ASSERT
static CS_CONTEXT * context
Definition: will_convert.c:21
Modified on Fri Sep 20 14:57:50 2024 by modify_doxy.py rev. 669887