NCBI C++ ToolKit
feature_tests.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: feature_tests.cpp 98937 2023-01-25 15:40:02Z foleyjp $
2  * =========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * =========================================================================
25  *
26  * Authors: Colleen Bollin, based on similar discrepancy tests
27  *
28  */
29 
30 #include <ncbi_pch.hpp>
31 
32 #include "discrepancy_core.hpp"
33 #include "utils.hpp"
34 
42 #include <objects/seq/Seq_ext.hpp>
50 #include <objmgr/util/sequence.hpp>
51 #include <objmgr/feat_ci.hpp>
52 #include <objmgr/seqdesc_ci.hpp>
53 #include <objmgr/seq_annot_ci.hpp>
54 #include <objmgr/seq_vector.hpp>
55 #include <objmgr/tse_handle.hpp>
57 
61 
62 
63 // PSEUDO_MISMATCH
64 
65 const string kPseudoMismatch = "[n] CDSs, RNAs, and genes have mismatching pseudos.";
66 
67 DISCREPANCY_CASE(PSEUDO_MISMATCH, FEAT, eDisc | eOncaller | eSubmitter | eSmart | eFatal, "Pseudo Mismatch")
68 {
69  for (const CSeq_feat& feat : context.GetFeat()) {
70  if (feat.IsSetPseudo() && feat.GetPseudo() && (feat.GetData().IsCdregion() || feat.GetData().IsRna())) {
71  const CSeq_feat* gene = context.GetGeneForFeature(feat);
72  if (gene && !context.IsPseudo(*gene)) {
73  m_Objs[kPseudoMismatch].Add(*context.SeqFeatObjRef(feat, CDiscrepancyContext::eFixSelf), false).Fatal();
74  m_Objs[kPseudoMismatch].Add(*context.SeqFeatObjRef(*gene), false).Fatal();
75  }
76  }
77  }
78 }
79 
80 
81 DISCREPANCY_AUTOFIX(PSEUDO_MISMATCH)
82 {
83  const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
84  CRef<CSeq_feat> new_feat(new CSeq_feat());
85  new_feat->Assign(*sf);
86  new_feat->SetPseudo(true);
87  context.ReplaceSeq_feat(*obj, *sf, *new_feat);
88  obj->SetFixed();
89  return CRef<CAutofixReport>(new CAutofixReport("PSEUDO_MISMATCH: Set pseudo for [n] feature[s]", 1));
90 }
91 
92 
93 // SHORT_RRNA
94 
95 DISCREPANCY_CASE(SHORT_RRNA, FEAT, eDisc | eOncaller | eSubmitter | eSmart | eFatal, "Short rRNA Features")
96 {
97  for (const CSeq_feat& feat : context.GetFeat()) {
98  if (feat.IsSetData() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_rRNA && !feat.IsSetPartial() && IsShortrRNA(feat, &(context.GetScope()))) {
99  m_Objs["[n] rRNA feature[s] [is] too short"].Add(*context.SeqFeatObjRef(feat)).Fatal();
100  }
101  }
102 }
103 
104 
105 // DISC_RBS_WITHOUT_GENE
106 
// Decides whether feature `f` should be treated as a ribosome binding site.
// Returns true immediately for subtype eSubtype_RBS. Any other subtype except
// eSubtype_regulatory yields false, as does a regulatory feature without
// qualifiers. For regulatory features the qualifiers are scanned for one named
// "regulatory_class".
// NOTE(review): the continuation of the qualifier condition (the line after
// the trailing "&&") is not present in this listing, so the exact value that
// must match cannot be confirmed from here.
107 static bool IsRBS(const CSeq_feat& f)
108 {
109  if (f.GetData().GetSubtype() == CSeqFeatData::eSubtype_RBS) {
110  return true;
111  }
112  if (f.GetData().GetSubtype() != CSeqFeatData::eSubtype_regulatory) {
113  return false;
114  }
115  if (!f.IsSetQual()) {
116  return false;
117  }
118  for (const auto& it : f.GetQual()) {
119  if (it->IsSetQual() && NStr::Equal(it->GetQual(), "regulatory_class") &&
121  return true;
122  }
123  }
124  return false;
125 }
126 
127 
128 DISCREPANCY_CASE(RBS_WITHOUT_GENE, FEAT, eOncaller | eFatal, "RBS features should have an overlapping gene")
129 {
130  bool has_genes = false;
131  for (const CSeq_feat& feat : context.GetAllFeat()) {
132  if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_gene) {
133  has_genes = true;
134  break;
135  }
136  }
137  if (has_genes) {
138  for (const CSeq_feat& feat : context.GetFeat()) {
139  if (IsRBS(feat) && !context.GetGeneForFeature(feat)) {
140  m_Objs["[n] RBS feature[s] [does] not have overlapping gene[s]"].Add(*context.SeqFeatObjRef(feat)).Fatal();
141  }
142  }
143  }
144 }
145 
146 
147 // MISSING_GENES
148 
// NOTE(review): the signature line of this function is not present in this
// listing; judging by the call in MISSING_GENES it is presumably
// ReportGeneMissing(const CSeq_feat& f) returning bool -- confirm against the
// real source.
// Body: returns false for any feature of subtype eSubtype_regulatory (checked
// first, so it takes precedence over the IsRBS() test below); otherwise
// returns true for features accepted by IsRBS(), coding regions, RNAs, exons
// and introns, and false for everything else.
150 {
151  CSeqFeatData::ESubtype subtype = f.GetData().GetSubtype();
152  if (subtype == CSeqFeatData::eSubtype_regulatory) {
153  return false;
154  }
155  if (IsRBS(f) ||
156  f.GetData().IsCdregion() ||
157  f.GetData().IsRna() ||
158  subtype == CSeqFeatData::eSubtype_exon ||
159  subtype == CSeqFeatData::eSubtype_intron) {
160  return true;
161  }
162  return false;
163 }
164 
165 
166 DISCREPANCY_CASE(MISSING_GENES, FEAT, eDisc | eSubmitter | eSmart | eFatal, "Missing Genes")
167 {
168  for (const CSeq_feat& feat : context.GetFeat()) {
169  if (!feat.GetGeneXref() && feat.IsSetData() && ReportGeneMissing(feat)) {
170  const CSeq_feat* gene_feat = context.GetGeneForFeature(feat);
171  if (!gene_feat) {
172  m_Objs["[n] feature[s] [has] no genes"].Add(*context.SeqFeatObjRef(feat)).Fatal();
173  }
174  }
175  }
176 }
177 
178 
179 // EXTRA_GENES
180 
// Report texts used by EXTRA_GENES: kExtraGene is the top-level category,
// the other two are its sub-categories for pseudo and non-pseudo genes.
// The bracketed tokens ("[n]", "[s]", "[is]", "[does]") look like placeholders
// expanded by the reporting layer -- presumably count/plural forms; confirm.
181 const string kExtraGene = "[n] gene feature[s] [is] not associated with a CDS or RNA feature.";
182 const string kExtraPseudo = "[n] pseudo gene feature[s] [is] not associated with a CDS or RNA feature.";
183 const string kExtraGeneNonPseudoNonFrameshift = "[n] non-pseudo gene feature[s] are not associated with a CDS or RNA feature and [does] not have frameshift in the comment.";
184 
185 bool IsGeneInXref(const CSeq_feat& gene, const CSeq_feat& feat, bool& have_gene_ref)
186 {
187  for (const auto& it : feat.GetXref()) {
188  if (it->IsSetId()) {
189  const CFeat_id& id = it->GetId();
190  if (gene.CanGetId() && gene.GetId().Equals(id)) {
191  return true;
192  }
193  }
194  if (it->IsSetData() && it->GetData().IsGene()) {
195  have_gene_ref = true;
196  const CGene_ref& gene_ref = it->GetData().GetGene();
197  const string& locus = gene.GetData().GetGene().IsSetLocus() ? gene.GetData().GetGene().GetLocus() : kEmptyStr;
198  const string& locus_tag = gene.GetData().GetGene().IsSetLocus_tag() ? gene.GetData().GetGene().GetLocus_tag() : kEmptyStr;
199  if ((gene_ref.IsSetLocus() || gene_ref.IsSetLocus_tag())
200  && (!gene_ref.IsSetLocus_tag() || gene_ref.GetLocus_tag() == locus_tag)
201  && (gene_ref.IsSetLocus_tag() || locus_tag.empty())
202  && (!gene_ref.IsSetLocus() || gene_ref.GetLocus() == locus)
203  && (gene_ref.IsSetLocus() || locus.empty())) {
204  return true;
205  }
206  }
207  }
208  return false;
209 }
210 
211 
// Reports gene features that cannot be tied to any coding region or RNA.
// Genes with a non-empty comment or a non-empty gene description are skipped.
// For every remaining gene, all CDS/RNA features are examined; the gene counts
// as "found" when IsGeneInXref() matches, or -- for features without any gene
// xref -- when a "best gene" lookup resolves to this very gene object.
// Unmatched genes are filed under kExtraGene, split into the pseudo and
// non-pseudo sub-categories via context.IsPseudo().
// NOTE(review): two lines of the loop body are not present in this listing
// (the statement that uses `cmp` and opens the enclosing block, and the one
// that declares `best_gene`), so the overlap criterion and the best-gene
// lookup cannot be confirmed from here.
212 DISCREPANCY_CASE(EXTRA_GENES, SEQUENCE, eDisc | eSubmitter | eSmart, "Extra Genes")
213 {
214  // TODO: Do not collect if mRNA sequence in Gen-prod set
215  const auto& genes = context.FeatGenes();
216  const auto& all = context.FeatAll();
217  for (const CSeq_feat* gene : genes) {
218  if ((gene->IsSetComment() && !gene->GetComment().empty()) || (gene->GetData().GetGene().IsSetDesc() && !gene->GetData().GetGene().GetDesc().empty())) {
219  continue;
220  }
221  const CSeq_loc& loc = gene->GetLocation();
222  bool found = false;
223  for (const CSeq_feat* feat : all) {
224  if (feat->GetData().IsCdregion() || feat->GetData().IsRna()) {
225  const CSeq_loc& loc_f = feat->GetLocation();
226  sequence::ECompare cmp = context.Compare(loc, loc_f);
228  bool have_gene_ref = false;
229  if (IsGeneInXref(*gene, *feat, have_gene_ref)) {
230  found = true;
231  break;
232  }
233  else if (!have_gene_ref) {
235  if (best_gene.NotEmpty() && &*best_gene == &*gene) {
236  found = true;
237  break;
238  }
239  }
240  }
241  }
242  }
243  if (!found) {
244  m_Objs[kExtraGene][context.IsPseudo(*gene) ? kExtraPseudo : kExtraGeneNonPseudoNonFrameshift].Ext().Add(*context.SeqFeatObjRef(*gene));
245  }
246  }
247 }
248 
249 
250 //SUPERFLUOUS_GENE
251 
252 DISCREPANCY_CASE(SUPERFLUOUS_GENE, SEQUENCE, eDisc | eOncaller, "Superfluous Genes")
253 {
254  const auto& genes = context.FeatGenes();
255  const auto& feats = context.FeatAll();
256  for (size_t i = 0; i < genes.size(); i++) {
257  if (genes[i]->IsSetPseudo() && genes[i]->GetPseudo()) {
258  continue;
259  }
260  const CSeq_loc& loc_i = genes[i]->GetLocation();
261  bool found = false;
262  for (size_t j = 0; j < feats.size(); j++) {
263  if (feats[j]->GetData().IsGene()) {
264  continue;
265  }
266  const CSeq_loc& loc_j = feats[j]->GetLocation();
267  sequence::ECompare compare = context.Compare(loc_j, loc_i);
268  if (compare == sequence::eNoOverlap) {
269  continue;
270  }
271  if (genes[i] == context.GetGeneForFeature(*feats[j])) {
272  found = true;
273  break;
274  }
275  }
276  if (!found) {
277  m_Objs["[n] gene feature[s] [is] not associated with any feature and [is] not pseudo."].Add(*context.SeqFeatObjRef(*genes[i]));
278  }
279  }
280 }
281 
282 
283 // BACTERIAL_PARTIAL_NONEXTENDABLE_PROBLEMS
284 
// NOTE(review): the opening of this enum and its earlier enumerators are not
// present in this listing. Code below uses eExtensibe_none, eExtensibe_fixable
// and eExtensibe_abut as values of type EExtensibe; only the value of
// eExtensibe_abut is visible here.
288  eExtensibe_abut = 2
289 };
290 
291 
// Classifies whether a feature end at position `left` can be extended
// leftwards to the start of the sequence or to a gap.
//
// left        position of the feature's left end on `seq`
// seq         the bioseq the feature lies on
// scope       scope used for length lookups and for reading sequence data
// extend_len  out: number of bases between `left` and the sequence start /
//             preceding gap (only written on the paths that assign it below)
// strand      strand of the feature; selects which stop-codon spellings to test
//
// Returns an EExtensibe value (`rval`).
// NOTE(review): the declaration and initial value of `rval` are not present in
// this listing -- presumably it starts as eExtensibe_none; confirm.
292 EExtensibe IsExtendableLeft(TSeqPos left, const CBioseq& seq, CScope* scope, TSeqPos& extend_len, ENa_strand strand)
293 {
294  bool circular = seq.IsSetInst() && seq.GetInst().GetTopology() == CSeq_inst::eTopology_circular;
// Case 1: within 3 bases of the sequence start. Zero distance means the end
// already abuts; otherwise it is fixable unless the sequence is circular.
296  if (left < 3) {
297  extend_len = left;
298  rval = extend_len ? circular ? eExtensibe_none : eExtensibe_fixable : eExtensibe_abut;
299  }
// Case 2: delta sequence -- walk the segments to find the end of the last gap
// that finishes at or before `left`.
300  else if (seq.IsSetInst() && seq.GetInst().IsSetRepr() && seq.GetInst().GetRepr() == CSeq_inst::eRepr_delta && seq.GetInst().IsSetExt() && seq.GetInst().GetExt().IsDelta()) {
301  TSeqPos offset = 0;
302  TSeqPos last_gap_stop = 0;
303  bool gap = false;
304  for (const auto& it : seq.GetInst().GetExt().GetDelta().Get()) {
305  if (it->IsLiteral()) {
306  offset += it->GetLiteral().GetLength();
// A literal without sequence data, or whose data is a gap, counts as a gap.
307  if (!it->GetLiteral().IsSetSeq_data()) {
308  last_gap_stop = offset;
309  gap = true;
310  }
311  else if (it->GetLiteral().GetSeq_data().IsGap()) {
312  last_gap_stop = offset;
313  gap = true;
314  }
315  }
316  else if (it->IsLoc()) {
317  offset += sequence::GetLength(it->GetLoc(), scope);
318  }
// Stop once the walk has passed the feature end.
319  if (offset > left) {
320  break;
321  }
322  }
323  if (left >= last_gap_stop && left - last_gap_stop <= 3) {
324  extend_len = left - last_gap_stop;
325  rval = extend_len ? (circular && !gap) ? eExtensibe_none : eExtensibe_fixable : eExtensibe_abut;
326  }
327  }
328  if (rval == eExtensibe_abut) return rval;
// Inspect the bases immediately left of the feature end: if the first of them
// is 'N', the end is treated as abutting and extend_len is reset to 0.
329  CSeqVector svec(seq, scope, CBioseq_Handle::CBioseq_Handle::eCoding_Iupac);
330  string codon;
331  TSeqPos count = extend_len ? extend_len : 1;
// NOTE(review): `left - count` assumes left >= count; the visible callers only
// call this with left > 0 -- confirm for any other caller.
332  svec.GetSeqData(left - count, left, codon);
333  for (unsigned i = 0; i < count; i++) {
334  if (codon[i] == 'N') {
335  count = i;
336  break;
337  }
338  }
339  if (!count) {
340  extend_len = 0;
341  return eExtensibe_abut;
342  }
// A fixable end is downgraded to eExtensibe_none when the three bases starting
// at the extended position spell a stop codon for the given strand.
343  if (rval == eExtensibe_fixable) {
344  svec.GetSeqData(left - extend_len, left - extend_len + 3, codon);
345  if (strand == eNa_strand_minus) {
346  if (codon == "CTA" || codon == "TTA" || codon == "TCA") { // reverse TAG / TAA / TGA
347  rval = eExtensibe_none;
348  }
349  }
350  else {
351  if (codon == "TAG" || codon == "TAA" || codon == "TGA") {
352  rval = eExtensibe_none;
353  }
354  }
355  }
356  return rval;
357 }
358 
359 
// Classifies whether a feature end at position `right` can be extended
// rightwards to the end of the sequence or to a gap; mirror image of
// IsExtendableLeft.
//
// right       position of the feature's right end on `seq`
// seq         the bioseq the feature lies on
// scope       scope used for length lookups and for reading sequence data
// extend_len  out: number of bases between `right` and the sequence end /
//             following gap (only written on the paths that assign it below)
// strand      strand of the feature; selects which stop-codon spellings to test
//
// Returns an EExtensibe value (`rval`).
// NOTE(review): the declaration and initial value of `rval` are not present in
// this listing -- presumably it starts as eExtensibe_none; confirm.
360 EExtensibe IsExtendableRight(TSeqPos right, const CBioseq& seq, CScope* scope, TSeqPos& extend_len, ENa_strand strand)
361 {
362  bool circular = seq.IsSetInst() && seq.GetInst().GetTopology() == CSeq_inst::eTopology_circular;
// Case 1: within 3 bases of the sequence end.
// NOTE(review): if GetLength() is unsigned (presumably TSeqPos), the
// subtraction wraps for sequences shorter than 4 bases -- confirm.
364  if (right > seq.GetLength() - 4) {
365  extend_len = seq.GetLength() - right - 1;
366  rval = extend_len ? circular ? eExtensibe_none : eExtensibe_fixable : eExtensibe_abut;
367  }
// Case 2: delta sequence -- walk the segments and remember the start offset of
// the most recent gap seen; the walk stops once it is more than 3 bases past
// `right`.
368  else if (seq.IsSetInst() && seq.GetInst().IsSetRepr() && seq.GetInst().GetRepr() == CSeq_inst::eRepr_delta && seq.GetInst().IsSetExt() && seq.GetInst().GetExt().IsDelta()) {
369  TSeqPos offset = 0;
370  TSeqPos next_gap_start = 0;
371  bool gap = false;
372  for (const auto& it : seq.GetInst().GetExt().GetDelta().Get()) {
373  if (it->IsLiteral()) {
// The gap start is recorded before `offset` is advanced past the literal.
374  if (!it->GetLiteral().IsSetSeq_data()) {
375  next_gap_start = offset;
376  gap = true;
377  }
378  else if (it->GetLiteral().GetSeq_data().IsGap()) {
379  next_gap_start = offset;
380  gap = true;
381  }
382  offset += it->GetLiteral().GetLength();
383  }
384  else if (it->IsLoc()) {
385  offset += sequence::GetLength(it->GetLoc(), scope);
386  }
387  if (offset > right + 3) {
388  break;
389  }
390  }
391  if (next_gap_start > right && next_gap_start - right - 1 <= 3) {
392  extend_len = next_gap_start - right - 1;
393  rval = extend_len ? (circular && !gap) ? eExtensibe_none : eExtensibe_fixable : eExtensibe_abut;
394  }
395  }
396  if (rval == eExtensibe_abut) return rval;
// Inspect the bases immediately right of the feature end: if the first of them
// is 'N', the end is treated as abutting and extend_len is reset to 0.
397  CSeqVector svec(seq, scope, CBioseq_Handle::CBioseq_Handle::eCoding_Iupac);
398  string codon;
399  TSeqPos count = extend_len ? extend_len : 1;
400  svec.GetSeqData(right + 1, right + count + 1, codon);
401  for (unsigned i = 0; i < count; i++) {
402  if (codon[i] == 'N') {
403  count = i;
404  break;
405  }
406  }
407  if (!count) {
408  extend_len = 0;
409  return eExtensibe_abut;
410  }
// A fixable end is downgraded to eExtensibe_none when the three bases read at
// the extended position spell a stop codon for the given strand.
411  if (rval == eExtensibe_fixable) {
412  svec.GetSeqData(right + extend_len - 3, right + extend_len, codon);
413  if (strand == eNa_strand_minus) {
414  if (codon == "CTA" || codon == "TTA" || codon == "TCA") { // reverse TAG / TAA / TGA
415  rval = eExtensibe_none;
416  }
417  }
418  else {
419  if (codon == "TAG" || codon == "TAA" || codon == "TGA") {
420  rval = eExtensibe_none;
421  }
422  }
423  }
424  return rval;
425 }
426 
427 
428 // Cannot be extended and not abut the end or the gap
// Returns true when an end of `loc` is classified eExtensibe_none by
// IsExtendableLeft / IsExtendableRight. The left end is only tested when its
// position is > 0, and the right end only when the location is partial at its
// positional stop and that stop lies before the last base of `seq`. The right
// end is skipped once the left end has already produced a hit.
// NOTE(review): the line that opens the first block is not present in this
// listing -- presumably the matching loc.IsPartialStart(eExtreme_Positional)
// test; confirm.
429 bool IsNonExtendable(const CSeq_loc& loc, const CBioseq& seq, CScope* scope)
430 {
431  bool rval = false;
433  TSeqPos start = loc.GetStart(eExtreme_Positional);
434  if (start > 0) {
435  TSeqPos extend_len = 0;
436  if (IsExtendableLeft(start, seq, scope, extend_len, loc.GetStrand()) == eExtensibe_none) {
437  rval = true;
438  }
439  }
440  }
441  if (!rval && loc.IsPartialStop(eExtreme_Positional)) {
442  TSeqPos stop = loc.GetStop(eExtreme_Positional);
443  if (stop < seq.GetLength() - 1) {
444  TSeqPos extend_len = 0;
445  if (IsExtendableRight(stop, seq, scope, extend_len, loc.GetStrand()) == eExtensibe_none) {
446  rval = true;
447  }
448  }
449  }
450  return rval;
451 }
452 
453 
454 DISCREPANCY_CASE(BACTERIAL_PARTIAL_NONEXTENDABLE_PROBLEMS, SEQUENCE, eDisc | eSubmitter | eSmart | eFatal, "Find partial feature ends on bacterial sequences that cannot be extended: on when non-eukaryote")
455 {
456  const CBioseq& bioseq = context.CurrentBioseq();
457  const CSeqdesc* biosrc = context.GetBiosource();
458  if (!biosrc || context.IsEukaryotic(&biosrc->GetSource()) || context.IsOrganelle(&biosrc->GetSource()) || bioseq.IsAa()) {
459  return;
460  }
461  for (const CSeq_feat& feat : context.GetAllFeat()) {
462  if (feat.IsSetData() && feat.GetData().IsCdregion() && IsNonExtendable(feat.GetLocation(), bioseq, &(context.GetScope()))) {
463  m_Objs["[n] feature[s] [has] partial ends that do not abut the end of the sequence or a gap, and cannot be extended by 3 or fewer nucleotides to do so"].Add(*context.SeqFeatObjRef(feat, &feat)).Fatal();
464  }
465  }
466 }
467 
468 
// Exception text looked for / written by the BACTERIAL_PARTIAL_NONEXTENDABLE_*
// tests.
469 const string kNonExtendableException = "unextendable partial coding region";
470 
// Autofix: if the feature's exception text does not already contain
// kNonExtendableException, replace the feature with a copy that has the
// exception flag set and the text appended (separated by "; " when other
// exception text exists). Returns an empty CRef when no change was needed.
// NOTE(review): the body of the else branch is not present in this listing --
// presumably it sets the exception text to kNonExtendableException; confirm.
// NOTE(review): `sf` is dereferenced without a null check after dynamic_cast.
471 DISCREPANCY_AUTOFIX(BACTERIAL_PARTIAL_NONEXTENDABLE_PROBLEMS)
472 {
473  const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
474  if (!sf->IsSetExcept_text() || sf->GetExcept_text().find(kNonExtendableException) == string::npos) {
475  CRef<CSeq_feat> new_feat(new CSeq_feat());
476  new_feat->Assign(*sf);
477  if (new_feat->IsSetExcept_text()) {
478  new_feat->SetExcept_text(sf->GetExcept_text() + "; " + kNonExtendableException);
479  }
480  else {
482  }
483  new_feat->SetExcept(true);
484  context.ReplaceSeq_feat(*obj, *sf, *new_feat);
485  obj->SetFixed();
486  return CRef<CAutofixReport>(new CAutofixReport("BACTERIAL_PARTIAL_NONEXTENDABLE_PROBLEMS: Set exception for [n] feature[s]", 1));
487  }
488  return CRef<CAutofixReport>();
489 }
490 
491 
492 // BACTERIAL_PARTIAL_NONEXTENDABLE_EXCEPTION
493 
494 DISCREPANCY_CASE(BACTERIAL_PARTIAL_NONEXTENDABLE_EXCEPTION, SEQUENCE, eDisc | eSubmitter | eSmart, "Find partial feature ends on bacterial sequences that cannot be extended but have exceptions: on when non-eukaryote")
495 {
496  const CBioseq& bioseq = context.CurrentBioseq();
497  const CSeqdesc* biosrc = context.GetBiosource();
498  if (!biosrc || context.IsEukaryotic(&biosrc->GetSource()) || context.IsOrganelle(&biosrc->GetSource()) || bioseq.IsAa()) {
499  return;
500  }
501  for (const CSeq_feat& feat : context.GetAllFeat()) {
502  if (feat.IsSetData() && feat.GetData().IsCdregion() && feat.IsSetExcept_text() && NStr::FindNoCase(feat.GetExcept_text(), kNonExtendableException) != NPOS && IsNonExtendable(feat.GetLocation(), bioseq, &(context.GetScope()))) {
503  m_Objs["[n] feature[s] [has] partial ends that do not abut the end of the sequence or a gap, and cannot be extended by 3 or fewer nucleotides to do so, but [has] the correct exception"].Add(*context.SeqFeatObjRef(feat));
504  }
505  }
506 }
507 
508 
509 // PARTIAL_PROBLEMS
510 
511 DISCREPANCY_CASE(PARTIAL_PROBLEMS, SEQUENCE, eDisc | eOncaller | eSubmitter | eSmart | eFatal, "Find partial feature ends on bacterial sequences, but could be extended by 3 or fewer nucleotides")
512 {
513  const CBioseq& bioseq = context.CurrentBioseq();
514  const CSeqdesc* biosrc = context.GetBiosource();
515  if (!biosrc || context.IsEukaryotic(&biosrc->GetSource()) || context.IsOrganelle(&biosrc->GetSource()) || bioseq.IsAa()) {
516  return;
517  }
518  //bool circular = bioseq.IsSetInst() && bioseq.GetInst().GetTopology() == CSeq_inst::eTopology_circular;
519  for (const CSeq_feat& feat : context.GetAllFeat()) {
520  if (feat.IsSetData() && feat.GetData().IsCdregion()) {
521  if (feat.IsSetPseudo() && feat.GetPseudo() == true && !context.IsRefseq()) continue;
522  bool add_this = false;
523  if (feat.GetLocation().IsPartialStart(eExtreme_Positional)) {
524  TSeqPos start = feat.GetLocation().GetStart(eExtreme_Positional);
525  if (start > 0) {
526  TSeqPos extend_len = 0;
527  if (IsExtendableLeft(start, bioseq, &(context.GetScope()), extend_len, feat.GetLocation().GetStrand()) == eExtensibe_fixable) {
528  //cout << "extend start: " << extend_len << "\n";
529  add_this = extend_len > 0 && extend_len <= 3;
530  }
531  }
532  }
533  if (!add_this && feat.GetLocation().IsPartialStop(eExtreme_Positional)) {
534  TSeqPos stop = feat.GetLocation().GetStop(eExtreme_Positional);
535  if (stop < bioseq.GetLength() - 1) {
536  TSeqPos extend_len = 0;
537  if (IsExtendableRight(stop, bioseq, &(context.GetScope()), extend_len, feat.GetLocation().GetStrand()) == eExtensibe_fixable) {
538  //cout << "extend end: " << extend_len << "\n";
539  add_this = extend_len > 0 && extend_len <= 3;
540  }
541  }
542  }
543  if (add_this) {
544  m_Objs["[n] feature[s] [has] partial ends that do not abut the end of the sequence or a gap, but could be extended by 3 or fewer nucleotides to do so"].Add(*context.SeqFeatObjRef(feat, CDiscrepancyContext::eFixSet)).Fatal();
545  }
546  }
547  }
548 }
549 
550 
// Tries to extend a coding region (and a gene with identical positional
// start/stop, if one exists on the same bioseq) so that its ends reach the
// sequence end or a gap. Works on copies and writes them back through edit
// handles only when at least one end was extended.
// Returns true when a replacement was made, false otherwise (including when no
// bioseq handle can be obtained for the CDS location).
// NOTE(review): several lines are not present in this listing -- the
// declarations of `seq` and `gene`, and the lines that open the start-end and
// stop-end blocks (including the declarations of `start` and `stop`). The
// comments below describe only what the visible lines establish.
551 static bool ExtendToGapsOrEnds(const CSeq_feat& cds, CScope& scope)
552 {
553  bool rval = false;
554 
555  CBioseq_Handle bsh = scope.GetBioseqHandle(cds.GetLocation());
556  if (!bsh) {
557  return rval;
558  }
560  //bool circular = seq->IsSetInst() && seq->GetInst().GetTopology() == CSeq_inst::eTopology_circular;
561 
// Look for a gene whose positional start and stop both equal the CDS's.
563  for (CFeat_CI gene_it(bsh, CSeqFeatData::eSubtype_gene); gene_it; ++gene_it) {
564  if (gene_it->GetLocation().GetStart(eExtreme_Positional) == cds.GetLocation().GetStart(eExtreme_Positional) && gene_it->GetLocation().GetStop(eExtreme_Positional) == cds.GetLocation().GetStop(eExtreme_Positional)) {
565  gene.Reset(&gene_it->GetMappedFeature());
566  break;
567  }
568  }
569 
// Edit copies; the originals are replaced at the end only if rval is true.
570  CRef<CSeq_feat> new_feat(new CSeq_feat());
571  new_feat->Assign(cds);
572 
573  CRef<CSeq_feat> new_gene;
574  if (gene) {
575  new_gene.Reset(new CSeq_feat());
576  new_gene->Assign(*gene);
577  }
578 
// Left end. NOTE(review): the EExtensibe result is used as a boolean here, so
// any non-zero enumerator counts as "extendable" -- confirm that is intended.
581  if (start > 0) {
582  TSeqPos extend_len = 0;
583  if (IsExtendableLeft(start, *seq, &scope, extend_len, cds.GetLocation().GetStrand()) && CCleanup::SeqLocExtend(new_feat->SetLocation(), start - extend_len, scope)) {
584  if (gene) {
585  CCleanup::SeqLocExtend(new_gene->SetLocation(), start - extend_len, scope);
586  }
// Extending the left end only shifts the reading frame for non-minus strands.
587  if (new_feat->GetData().GetCdregion().CanGetFrame() && cds.GetLocation().GetStrand() != eNa_strand_minus) {
588  CCdregion::EFrame frame = new_feat->GetData().GetCdregion().GetFrame();
589  if (frame != CCdregion::eFrame_not_set) {
590  // eFrame_not_set = 0, ///< not set, code uses one
591  // eFrame_one = 1,
592  // eFrame_two = 2,
593  // eFrame_three = 3 ///< reading frame
// Convert to 0-based, add the extension modulo 3, convert back to 1-based.
594  unsigned fr = (unsigned)frame - 1;
595  fr = (fr + extend_len) % 3;
596  frame = (CCdregion::EFrame)(fr + 1);
597  new_feat->SetData().SetCdregion().SetFrame() = frame;
598  }
599  }
600  rval = true;
601  }
602  }
603  }
604 
// Right end; the frame is adjusted only for minus-strand features.
607  if (stop > 0) {
608  TSeqPos extend_len = 0;
609  if (IsExtendableRight(stop, *seq, &scope, extend_len, cds.GetLocation().GetStrand()) && CCleanup::SeqLocExtend(new_feat->SetLocation(), stop + extend_len, scope)) {
610  if (gene) {
611  CCleanup::SeqLocExtend(new_gene->SetLocation(), stop + extend_len, scope);
612  }
613  if (new_feat->GetData().GetCdregion().CanGetFrame() && cds.GetLocation().GetStrand() == eNa_strand_minus) {
614  CCdregion::EFrame frame = new_feat->GetData().GetCdregion().GetFrame();
615  if (frame != CCdregion::eFrame_not_set) {
616  unsigned fr = (unsigned)frame - 1;
617  fr = (fr + extend_len) % 3;
618  frame = (CCdregion::EFrame)(fr + 1);
619  new_feat->SetData().SetCdregion().SetFrame() = frame;
620  }
621  }
622  rval = true;
623  }
624  }
625  }
626 
// Commit the edited copies through edit handles.
627  if (rval) {
628  CSeq_feat_EditHandle feh(scope.GetSeq_featHandle(cds));
629  feh.Replace(*new_feat);
630  if (gene) {
631  CSeq_feat_EditHandle geh(scope.GetSeq_featHandle(*gene));
632  geh.Replace(*new_gene);
633  }
634  }
635  return rval;
636 }
637 
638 
639 DISCREPANCY_AUTOFIX(PARTIAL_PROBLEMS)
640 {
641  const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
642  if (ExtendToGapsOrEnds(*sf, context.GetScope())) {
643  obj->SetFixed();
644  return CRef<CAutofixReport>(new CAutofixReport("PARTIAL_PROBLEMS: [n] feature[s] [is] extended to end or gap", 1));
645  }
646  return CRef<CAutofixReport>();
647 }
648 
649 
650 // EUKARYOTE_SHOULD_HAVE_MRNA
651 
652 const string kEukaryoteShouldHavemRNA = "no mRNA present";
653 const string kEukaryoticCDSHasMrna = "Eukaryotic CDS has mRNA";
654 
655 DISCREPANCY_CASE(EUKARYOTE_SHOULD_HAVE_MRNA, SEQUENCE, eDisc | eSubmitter | eSmart | eFatal, "Eukaryote should have mRNA")
656 {
657  const CSeqdesc* molinfo = context.GetMolinfo();
658  if (!molinfo || !molinfo->GetMolinfo().IsSetBiomol() || molinfo->GetMolinfo().GetBiomol() != CMolInfo::eBiomol_genomic) {
659  return;
660  }
661  const CSeqdesc* biosrc = context.GetBiosource();
662  if (!context.IsEukaryotic(biosrc ? &biosrc->GetSource() : nullptr)) {
663  return;
664  }
665  for (const CSeq_feat& feat : context.GetAllFeat()) {
666  if (feat.IsSetData() && feat.GetData().IsCdregion() && !context.IsPseudo(feat)) {
667  CConstRef<CSeq_feat> mrna = sequence::GetmRNAforCDS(feat, context.GetScope());
668  if (mrna) {
669  m_Objs[kEukaryoticCDSHasMrna].Add(*context.SeqFeatObjRef(feat));
670  }
671  else if (m_Objs[kEukaryoteShouldHavemRNA].GetObjects().empty()) {
672  m_Objs[kEukaryoteShouldHavemRNA].Add(*context.SeqFeatObjRef(feat)).Fatal();
673  }
674  }
675  }
676 }
677 
678 
// Summary step for EUKARYOTE_SHOULD_HAVE_MRNA. Does nothing when no objects
// were collected. Otherwise, under a condition that is not visible here, it
// drops the "has mRNA" category, clears the objects of the "no mRNA present"
// category and emits the summary.
// NOTE(review): the line holding that condition is not present in this
// listing -- presumably it tests that no CDS with an mRNA was recorded;
// confirm against the real source.
679 DISCREPANCY_SUMMARIZE(EUKARYOTE_SHOULD_HAVE_MRNA)
680 {
681  if (m_Objs.empty()) {
682  return;
683  }
685  m_Objs.GetMap().erase(kEukaryoticCDSHasMrna);
686  m_Objs[kEukaryoteShouldHavemRNA].clearObjs();
687  xSummarize();
688  }
689 }
690 
691 
692 // NON_GENE_LOCUS_TAG
693 
694 DISCREPANCY_CASE(NON_GENE_LOCUS_TAG, FEAT, eDisc | eOncaller | eSubmitter | eSmart, "Nongene Locus Tag")
695 {
696  for (const CSeq_feat& feat : context.GetFeat()) {
697  if (feat.IsSetQual() && (!feat.IsSetData() || !feat.GetData().IsGene())) {
698  for (const auto& it : feat.GetQual()) {
699  if (it->IsSetQual() && NStr::EqualNocase(it->GetQual(), "locus_tag")) {
700  m_Objs["[n] non-gene feature[s] [has] locus tag[s]."].Add(*context.SeqFeatObjRef(feat));
701  break;
702  }
703  }
704  }
705  }
706 }
707 
708 
709 // FIND_BADLEN_TRNAS
710 
711 DISCREPANCY_CASE(FIND_BADLEN_TRNAS, FEAT, eDisc | eOncaller | eSubmitter | eSmart, "Find short and long tRNAs")
712 {
713  for (const CSeq_feat& feat : context.GetFeat()) {
714  if (feat.IsSetData() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_tRNA) {
715  TSeqPos len = sequence::GetLength(feat.GetLocation(), &(context.GetScope()));
716  if (!feat.IsSetPartial() && len < 50) {
717  m_Objs["[n] tRNA[s] [is] too short"].Add(*context.SeqFeatObjRef(feat));
718  }
719  else if (len >= 150) {
720  m_Objs["[n] tRNA[s] [is] too long - over 150 nucleotides"].Add(*context.SeqFeatObjRef(feat));
721  }
722  }
723  }
724 }
725 
726 
727 // ORG_TRNAS
728 
729 DISCREPANCY_CASE(ORG_TRNAS, FEAT, eDisc | eOncaller, "Find long tRNAs > 90nt except Ser/Leu/Sec")
730 {
731  for (const CSeq_feat& feat : context.GetFeat()) {
732  if (feat.IsSetData() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_tRNA) {
733  TSeqPos len = sequence::GetLength(feat.GetLocation(), &(context.GetScope()));
734  if (len > 90) {
735  const string aa = context.GetAminoacidName(feat);
736  if (aa != "Ser" && aa != "Sec" && aa != "Leu") {
737  m_Objs["[n] tRNA[s] [is] too long"].Add(*context.SeqFeatObjRef(feat));
738  }
739  }
740  }
741  }
742 }
743 
744 
745 // GENE_PARTIAL_CONFLICT
// Compares the 5' (biological start) partialness of a feature and its gene.
// Returns false whenever both agree; when they differ, the result depends on a
// further condition.
// NOTE(review): that inner condition line is not present in this listing, so
// its exact test -- and how `is_mrna` is used -- cannot be confirmed here.
746 bool IsPartialStartConflict(const CSeq_feat& feat, const CSeq_feat& gene, bool is_mrna = false)
747 {
748  bool partial_feat = feat.GetLocation().IsPartialStart(eExtreme_Biological);
749  bool partial_gene = gene.GetLocation().IsPartialStart(eExtreme_Biological);
750  if (partial_feat != partial_gene) {
752  return true;
753  }
754  }
755  return false;
756 }
757 
758 
// Compares the 3' (biological stop) partialness of a feature and its gene.
// Returns false whenever both agree; when they differ, the result depends on a
// further condition.
// NOTE(review): that inner condition line is not present in this listing, so
// its exact test -- and how `is_mrna` is used -- cannot be confirmed here.
759 bool IsPartialStopConflict(const CSeq_feat& feat, const CSeq_feat& gene, bool is_mrna = false)
760 {
761  bool partial_feat = feat.GetLocation().IsPartialStop(eExtreme_Biological);
762  bool partial_gene = gene.GetLocation().IsPartialStop(eExtreme_Biological);
763  if (partial_feat != partial_gene) {
765  return true;
766  }
767  }
768  return false;
769 }
770 
// Report texts for GENE_PARTIAL_CONFLICT. The first is the top-level category,
// the next three are the per-feature-kind sub-categories, and the last three
// are suffixes appended to the feature subtype name to form the leaf label.
// Each finding adds two objects (the feature and its gene), which is
// presumably why the counts use the "[n/2]" placeholder -- confirm.
771 const string kGenePartialConflictTop = "[n/2] feature location[s] conflict with partialness of overlapping gene";
772 const string kGenePartialConflictOther = "[n/2] feature[s] that [is] not coding region[s] or misc_feature[s] conflict with partialness of overlapping gene";
773 const string kGenePartialConflictCodingRegion = "[n/2] coding region location[s] conflict with partialness of overlapping gene";
774 const string kGenePartialConflictMiscFeat = "[n/2] misc_feature location[s] conflict with partialness of overlapping gene";
775 const string kConflictBoth = " feature partialness conflicts with gene on both ends";
776 const string kConflictStart = " feature partialness conflicts with gene on 5' end";
777 const string kConflictStop = " feature partialness conflicts with gene on 3' end";
778 
779 
780 DISCREPANCY_CASE(GENE_PARTIAL_CONFLICT, SEQUENCE, eOncaller | eSubmitter | eSmart, "Feature partialness should agree with gene partialness if endpoints match")
781 {
782  const CSeqdesc* molinfo = context.GetMolinfo();
783  const CSeqdesc* biosrc = context.GetBiosource();
784  bool is_mrna = molinfo && molinfo->GetMolinfo().IsSetBiomol() && molinfo->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_mRNA;
785  bool is_eukaryotic = context.IsEukaryotic(biosrc ? &biosrc->GetSource() : nullptr);
786 
787  const auto& all = context.FeatAll();
788  for (const CSeq_feat* feat : all) {
789  if (!feat->IsSetData()) {
790  continue;
791  }
792  const CSeq_feat* gene = context.GetGeneForFeature(*feat);
793  if (!gene) {
794  continue;
795  }
796  bool conflict_start = false;
797  bool conflict_stop = false;
798  CSeqFeatData::ESubtype subtype = feat->GetData().GetSubtype();
799  string middle_label = kGenePartialConflictOther;
800  if (feat->GetData().IsCdregion()) {
801  if (!is_eukaryotic || is_mrna) {
802  middle_label = kGenePartialConflictCodingRegion;
803  conflict_start = IsPartialStartConflict(*feat, *gene, is_mrna);
804  conflict_stop = IsPartialStopConflict(*feat, *gene, is_mrna);
805  if (is_mrna) {
806  //look for 5' UTR
807  TSeqPos gene_start = gene->GetLocation().GetStart(eExtreme_Biological);
808  bool gene_start_partial = gene->GetLocation().IsPartialStart(eExtreme_Biological);
809  bool found_start = false;
810  bool found_utr5 = false;
811  for (const CSeq_feat* fi : all) {
812  if (fi->IsSetData() && fi->GetData().GetSubtype() == CSeqFeatData::eSubtype_5UTR) {
813  found_utr5 = true;
814  if (fi->GetLocation().GetStart(eExtreme_Biological) == gene_start && fi->GetLocation().IsPartialStart(eExtreme_Biological) == gene_start_partial) {
815  found_start = true;
816  conflict_start = false;
817  break;
818  }
819  }
820  }
821  if (found_utr5 && !found_start) {
822  conflict_start = true;
823  }
824  //look for 3' UTR
825  TSeqPos gene_stop = gene->GetLocation().GetStop(eExtreme_Biological);
826  bool gene_stop_partial = gene->GetLocation().IsPartialStop(eExtreme_Biological);
827  bool found_stop = false;
828  bool found_utr3 = false;
829  for (const CSeq_feat* fi : all) {
830  if (fi->IsSetData() && fi->GetData().GetSubtype() == CSeqFeatData::eSubtype_3UTR) {
831  found_utr3 = true;
832  if (fi->GetLocation().GetStop(eExtreme_Biological) == gene_stop && fi->GetLocation().IsPartialStop(eExtreme_Biological) == gene_stop_partial) {
833  found_stop = true;
834  conflict_stop = false;
835  break;
836  }
837  }
838  }
839  if (found_utr3 && !found_stop) {
840  conflict_stop = true;
841  }
842  }
843  }
844  }
845  else if (feat->GetData().IsRna() || subtype == CSeqFeatData::eSubtype_intron || subtype == CSeqFeatData::eSubtype_exon || subtype == CSeqFeatData::eSubtype_5UTR || subtype == CSeqFeatData::eSubtype_3UTR || subtype == CSeqFeatData::eSubtype_misc_feature) {
846  conflict_start = IsPartialStartConflict(*feat, *gene);
847  conflict_stop = IsPartialStopConflict(*feat, *gene);
848  if (subtype == CSeqFeatData::eSubtype_misc_feature) {
849  middle_label = kGenePartialConflictMiscFeat;
850  }
851  }
852  if (conflict_start || conflict_stop) {
853  string label = CSeqFeatData::SubtypeValueToName(subtype);
854  label += conflict_start && conflict_stop ? kConflictBoth : conflict_start ? kConflictStart : kConflictStop;
855  m_Objs[kGenePartialConflictTop][middle_label].Ext()[label].Ext().Add(*context.SeqFeatObjRef(*feat), false).Add(*context.SeqFeatObjRef(*gene), false);
856  }
857  }
858 }
859 
860 
861 DISCREPANCY_SUMMARIZE(GENE_PARTIAL_CONFLICT)
862 {
863  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
864 }
865 
866 
867 // BAD_GENE_STRAND
868 
// NOTE(review): the signature line of this function is not present in this
// listing; from the call in HasMixedStrands it is presumably
// StrandsMatch(ENa_strand s1, ENa_strand s2) returning bool -- confirm.
// Body: two strands "match" unless exactly one of them is eNa_strand_minus;
// every non-minus value is therefore treated alike.
870 {
871  if (s1 == eNa_strand_minus && s2 != eNa_strand_minus) {
872  return false;
873  } else if (s1 != eNa_strand_minus && s2 == eNa_strand_minus) {
874  return false;
875  } else {
876  return true;
877  }
878 }
879 
880 
881 bool HasMixedStrands(const CSeq_loc& loc)
882 {
883  CSeq_loc_CI li(loc);
884  if (!li) {
885  return false;
886  }
887  ENa_strand first_strand = li.GetStrand();
888  ++li;
889  while (li) {
890  if (!StrandsMatch(li.GetStrand(), first_strand)) {
891  return true;
892  }
893  ++li;
894  }
895  return false;
896 }
897 
898 
// Top-level report key under which BAD_GENE_STRAND stores every conflicting gene/feature pair.
899 const string kBadGeneStrand = "[n/2] feature location[s] conflict with gene location strand[s]";
900 
901 
// Compares each feature with each gene that shares its positional start or stop.
// When the strands do not agree, the gene and the feature are recorded together
// under a numbered "pair" label beneath kBadGeneStrand.
902 DISCREPANCY_CASE(BAD_GENE_STRAND, SEQUENCE, eOncaller | eSubmitter | eSmart, "Genes and features that share endpoints should be on the same strand")
903 {
904  // note - use positional instead of biological, because we are *looking* for objects on the opposite strand
905  const auto& genes = context.FeatGenes();
906  const auto& feats = context.FeatAll();
907 
908  for (size_t j = 0; j < feats.size(); j++) {
909  CSeqFeatData::ESubtype subtype = feats[j]->GetData().GetSubtype();
// NOTE(review): listing line 910 (the condition on `subtype` that opens this block) is missing
// from this excerpt; features satisfying it are skipped by the `continue` below.
911  continue;
912  }
913  const CSeq_loc& loc_j = feats[j]->GetLocation();
914  TSeqPos feat_start = loc_j.GetStart(eExtreme_Positional);
915  TSeqPos feat_stop = loc_j.GetStop(eExtreme_Positional);
916  for (size_t i = 0; i < genes.size(); i++) {
917  if (!genes[i]->IsSetLocation()) {
918  continue;
919  }
920  const CSeq_loc& loc_i = genes[i]->GetLocation();
921  ENa_strand strand_i = loc_i.GetStrand();
922  TSeqPos gene_start = loc_i.GetStart(eExtreme_Positional);
923  TSeqPos gene_stop = loc_i.GetStop(eExtreme_Positional);
// Only gene/feature pairs that share at least one positional endpoint are examined.
924  if (feat_start == gene_start || feat_stop == gene_stop) {
925  bool all_ok = true;
926  if (HasMixedStrands(loc_i)) {
927  // compare intervals, to make sure that for each pair of feature interval and gene interval
928  // where the gene interval contains the feature interval, the intervals are on the same strand
929  CSeq_loc_CI f_loc(loc_j);
930  bool found_bad = false;
931  while (f_loc && !found_bad) {
932  CConstRef<CSeq_loc> f_int = f_loc.GetRangeAsSeq_loc();
933  CSeq_loc_CI g_loc(loc_i);
934  while (g_loc && !found_bad) {
935  CConstRef<CSeq_loc> g_int = g_loc.GetRangeAsSeq_loc();
936  sequence::ECompare cmp = context.Compare(*f_int, *g_int);
// NOTE(review): listing line 937 (the test on `cmp` that opens this block) is missing from this excerpt.
938  if (!StrandsMatch(f_loc.GetStrand(), g_loc.GetStrand())) {
939  found_bad = true;
940  }
941  }
942  ++g_loc;
943  }
944  ++f_loc;
945  }
946  all_ok = !found_bad;
947  }
948  else {
// Gene is not mixed-strand: compare the overall strands of feature and gene.
949  all_ok = StrandsMatch(loc_j.GetStrand(), strand_i);
950  }
951  if (!all_ok) {
// The pair number is one more than the count of entries already stored under kBadGeneStrand.
952  size_t offset = m_Objs[kBadGeneStrand].GetMap().size() + 1;
953  string label = "Gene and feature strands conflict (pair " + NStr::NumericToString(offset) + ")";
954  m_Objs[kBadGeneStrand][label].Ext().Add(*context.SeqFeatObjRef(*genes[i]), false).Add(*context.SeqFeatObjRef(*feats[j]), false);
955  }
956  }
957  }
958  }
959 }
960 
961 
962 DISCREPANCY_SUMMARIZE(BAD_GENE_STRAND)
963 {
964  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
965 }
966 
967 
968 // MICROSATELLITE_REPEAT_TYPE
969 
970 DISCREPANCY_CASE(MICROSATELLITE_REPEAT_TYPE, FEAT, eOncaller | eFatal, "Microsatellites must have repeat type of tandem")
971 {
972  for (const CSeq_feat& feat : context.GetFeat()) {
973  if (feat.IsSetData() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_repeat_region && feat.IsSetQual()) {
974  bool is_microsatellite = false;
975  bool is_tandem = false;
976  const CSeq_feat::TQual& quals = feat.GetQual();
977  for (auto it = quals.cbegin(); it != quals.cend() && (!is_microsatellite || !is_tandem); ++it) {
978  const CGb_qual& qual = **it;
979  if (NStr::EqualCase(qual.GetQual(), "satellite")) {
980  if (NStr::EqualNocase(qual.GetVal(), "microsatellite") ||
981  NStr::StartsWith(qual.GetVal(), "microsatellite:", NStr::eNocase)) {
982  is_microsatellite = true;
983  }
984  }
985  else if (NStr::EqualCase(qual.GetQual(), "rpt_type")) {
986  is_tandem = NStr::EqualCase(qual.GetVal(), "tandem");
987  }
988  }
989  if (is_microsatellite && !is_tandem) {
990  m_Objs["[n] microsatellite[s] do not have a repeat type of tandem"].Add(*context.SeqFeatObjRef(feat, &feat)).Fatal();
991  }
992  }
993  }
994 }
995 
996 
997 DISCREPANCY_AUTOFIX(MICROSATELLITE_REPEAT_TYPE)
998 {
999  const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
1000  CRef<CSeq_feat> new_feat(new CSeq_feat());
1001  new_feat->Assign(*sf);
1002  CRef<CGb_qual> new_qual(new CGb_qual("rpt_type", "tandem"));
1003  new_feat->SetQual().push_back(new_qual);
1004  context.ReplaceSeq_feat(*obj, *sf, *new_feat);
1005  obj->SetFixed();
1006  return CRef<CAutofixReport>(new CAutofixReport("MICROSATELLITE_REPEAT_TYPE: added repeat type of tandem to [n] microsatellite[s]", 1));
1007 }
1008 
1009 
1010 // SUSPICIOUS_NOTE_TEXT
// Phrases that FindSuspiciousNotePhrases reports on; each phrase is also embedded
// in the label of the report sub-item it produces.
1011 static const string kSuspiciousNotePhrases[] =
1012 {
1013  "characterised",
1014  "recognised",
1015  "characterisation",
1016  "localisation",
1017  "tumour",
1018  "uncharacterised",
1019  "oxydase",
1020  "colour",
1021  "localise",
1022  "faecal",
1023  "orthologue",
1024  "paralogue",
1025  "homolog",
1026  "homologue",
1027  "intronless gene"
1028 };
1029 
// NOTE(review): listing line 1030 is missing from this excerpt; presumably it defines
// kNumSuspiciousNotePhrases, which the loop below uses as its bound - confirm.
1031 
// Walks the phrase table and, for a phrase that passes the (missing) test on `s`,
// adds `feat` to `rep` under a per-phrase sub-item of the suspicious-phrase report.
1032 static void FindSuspiciousNotePhrases(const string& s, CDiscrepancyContext& context, CReportNode& rep, const CSeq_feat& feat)
1033 {
1034  for (size_t k = 0; k < kNumSuspiciousNotePhrases; k++) {
// NOTE(review): listing line 1035 (the condition testing `s` against kSuspiciousNotePhrases[k]
// that opens this block) is missing from this excerpt.
1036  rep["[n] note text[s] contain suspicious phrase[s]"]["[n] note text[s] contain '" + kSuspiciousNotePhrases[k] + "'"].Ext().Add(*context.SeqFeatObjRef(feat));
1037  }
1038  }
1039 }
1040 
1041 
// Runs FindSuspiciousNotePhrases over selected text fields of each feature,
// chosen by the feature's subtype. The result is collected in m_Objs.
// NOTE(review): the `case` labels of the switch (listing lines 1047, 1056, 1061-1062)
// are missing from this excerpt, so the subtype for each branch is not visible here.
1042 DISCREPANCY_CASE(SUSPICIOUS_NOTE_TEXT, FEAT, eOncaller, "Find Suspicious Phrases in Note Text")
1043 {
1044  for (const CSeq_feat& feat : context.GetFeat()) {
1045  if (feat.IsSetData()) {
1046  switch (feat.GetData().GetSubtype()) {
// First branch (label missing): reads GetData().GetGene(), so presumably the gene subtype.
1048  // look in gene comment and gene description
1049  if (feat.IsSetComment()) {
1050  FindSuspiciousNotePhrases(feat.GetComment(), context, m_Objs, feat);
1051  }
1052  if (feat.GetData().GetGene().IsSetDesc()) {
1053  FindSuspiciousNotePhrases(feat.GetData().GetGene().GetDesc(), context, m_Objs, feat);
1054  }
1055  break;
// Second branch (label missing): reads GetData().GetProt(), so presumably the protein subtype.
// Only the protein description is checked.
1057  if (feat.GetData().GetProt().IsSetDesc()) {
1058  FindSuspiciousNotePhrases(feat.GetData().GetProt().GetDesc(), context, m_Objs, feat);
1059  }
1060  break;
// Third branch (labels missing): only the feature comment is checked.
1063  if (feat.IsSetComment()) {
1064  FindSuspiciousNotePhrases(feat.GetComment(), context, m_Objs, feat);
1065  }
1066  break;
1067  default:
1068  break;
1069  }
1070  }
1071  }
1072 }
1073 
1074 
1075 DISCREPANCY_SUMMARIZE(SUSPICIOUS_NOTE_TEXT)
1076 {
1077  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1078 }
1079 
1080 
1081 // CDS_HAS_NEW_EXCEPTION
1082 
1083 static const string kNewExceptions[] =
1084 {
1085  "annotated by transcript or proteomic data",
1086  "heterogeneous population sequenced",
1087  "low-quality sequence region",
1088  "unextendable partial coding region",
1089 };
1090 
1091 
1092 DISCREPANCY_CASE(CDS_HAS_NEW_EXCEPTION, FEAT, eDisc | eOncaller | eSmart, "Coding region has new exception")
1093 {
1094  static const size_t max = ArraySize(kNewExceptions);
1095  for (const CSeq_feat& feat : context.GetFeat()) {
1096  if (feat.IsSetData() && feat.GetData().IsCdregion() && feat.IsSetExcept_text()) {
1097  for (size_t i = 0; i < max; i++) {
1098  if (NStr::FindNoCase(feat.GetExcept_text(), kNewExceptions[i]) != NPOS) {
1099  m_Objs["[n] coding region[s] [has] new exception[s]"].Add(*context.SeqFeatObjRef(feat));
1100  break;
1101  }
1102  }
1103  }
1104  }
1105 }
1106 
1107 
1108 DISCREPANCY_SUMMARIZE(CDS_HAS_NEW_EXCEPTION)
1109 {
1110  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1111 }
1112 
1113 
1114 // SHORT_LNCRNA
1115 
1116 DISCREPANCY_CASE(SHORT_LNCRNA, FEAT, eDisc | eOncaller | eSubmitter | eSmart, "Short lncRNA sequences")
1117 {
1118  for (const CSeq_feat& feat : context.GetFeat()) {
1119  if (feat.IsSetData() && feat.GetData().IsRna() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_ncRNA
1120  && feat.GetData().GetRna().IsSetExt() && feat.GetData().GetRna().GetExt().IsGen() && feat.GetData().GetRna().GetExt().GetGen().IsSetClass()
1121  && NStr::EqualNocase(feat.GetData().GetRna().GetExt().GetGen().GetClass(), "lncrna") // only looking at lncrna features
1122  && !feat.GetLocation().IsPartialStart(eExtreme_Biological) && !feat.GetLocation().IsPartialStop(eExtreme_Biological) // ignore if partial
1123  && sequence::GetLength(feat.GetLocation(), &(context.GetScope())) < 200) {
1124  m_Objs["[n] lncRNA feature[s] [is] suspiciously short"].Add(*context.SeqFeatObjRef(feat));
1125  }
1126  }
1127 }
1128 
1129 
// NOTE(review): the header line (listing line 1130) is missing from this excerpt; given the
// SHORT_LNCRNA case directly above, this is presumably its DISCREPANCY_SUMMARIZE body - confirm.
// Stores the sub-items of the exported m_Objs report tree in m_ReportItems.
1131 {
1132  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1133 }
1134 
1135 
1136 // JOINED_FEATURES
1137 
// Report labels for JOINED_FEATURES. They are plain string objects (not
// references bound to temporaries) so they match the other label constants
// in this file. kJoinedFeaturesException is a prefix: the exception text and
// a closing quote are appended where it is used.
const string kJoinedFeatures = "[n] feature[s] [has] joined location[s].";
const string kJoinedFeaturesNoException = "[n] feature[s] [has] joined location but no exception";
const string kJoinedFeaturesException = "[n] feature[s] [has] joined location but exception '";
const string kJoinedFeaturesBlankException = "[n] feature[s] [has] joined location but a blank exception";
1142 
1143 DISCREPANCY_CASE(JOINED_FEATURES, FEAT, eDisc | eSubmitter | eSmart, "Joined Features: on when non-eukaryote")
1144 {
1145  const CSeqdesc* biosrc = context.GetBiosource();
1146  if (biosrc && !context.IsEukaryotic(&biosrc->GetSource()) && !context.IsOrganelle(&biosrc->GetSource())) {
1147  for (const CSeq_feat& feat : context.GetFeat()) {
1148  if (feat.IsSetLocation()) {
1149  if (feat.GetLocation().IsMix() || feat.GetLocation().IsPacked_int()) {
1150  if (feat.IsSetExcept_text()) {
1151  if (NStr::IsBlank(feat.GetExcept_text())) {
1152  m_Objs[kJoinedFeatures][kJoinedFeaturesBlankException].Ext().Add(*context.SeqFeatObjRef(feat));
1153  }
1154  else {
1155  m_Objs[kJoinedFeatures][kJoinedFeaturesException + feat.GetExcept_text() + "'"].Ext().Add(*context.SeqFeatObjRef(feat));
1156  }
1157  }
1158  else if (feat.IsSetExcept() && feat.GetExcept()) {
1159  m_Objs[kJoinedFeatures][kJoinedFeaturesBlankException].Ext().Add(*context.SeqFeatObjRef(feat));
1160  }
1161  else {
1162  m_Objs[kJoinedFeatures][kJoinedFeaturesNoException].Ext().Add(*context.SeqFeatObjRef(feat));
1163  }
1164  }
1165  }
1166  }
1167  }
1168 }
1169 
1170 
1171 DISCREPANCY_SUMMARIZE(JOINED_FEATURES)
1172 {
1173  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1174 }
1175 
1176 
1177 // BACTERIAL_JOINED_FEATURES_NO_EXCEPTION
1178 
// Reports non-pseudo coding regions with a joined (mix / packed-int) location and no
// exception. Skipped entirely when the biosource is eukaryotic or an organelle.
// Each hit goes into one of two sub-items: a two-interval, same-strand location running
// from the last position of the sequence to position 0 counts as "over the origin"
// (warning); everything else is "not over the origin" (error).
1179 DISCREPANCY_CASE(BACTERIAL_JOINED_FEATURES_NO_EXCEPTION, SEQUENCE, eDisc | eSubmitter | eSmart, "Joined Features on prokaryote without exception")
1180 {
1181  const CSeqdesc* biosrc = context.GetBiosource();
1182  if (biosrc && (context.IsEukaryotic(&biosrc->GetSource()) || context.IsOrganelle(&biosrc->GetSource()))) {
1183  return;
1184  }
1185  for (const CSeq_feat& feat : context.GetAllFeat()) {
1186  if (feat.IsSetLocation() && feat.CanGetData() && feat.GetData().IsCdregion() && !context.IsPseudo(feat)) {
1187  if (feat.GetLocation().IsMix() || feat.GetLocation().IsPacked_int()) {
// A non-empty exception text or a raised exception flag exempts the feature.
1188  if ((feat.IsSetExcept_text() && !feat.GetExcept_text().empty()) || (feat.IsSetExcept() && feat.GetExcept())) {
1189  continue;
1190  }
1191  bool bad = true;
1192  if (context.CurrentBioseq().CanGetInst()) {
1193  const CSeq_inst& inst = context.CurrentBioseq().GetInst();
// NOTE(review): listing line 1194 is missing from this excerpt; judging by the brace count
// it opens a conditional block around the origin check below - confirm against the full file.
1195  unsigned int len = inst.GetLength();
1196  CSeq_loc_CI ci0(feat.GetLocation());
1197  if (ci0) {
1198  CSeq_loc_CI ci1 = ci0;
1199  ++ci1;
1200  if (ci1) {
1201  CSeq_loc_CI ci2 = ci1;
1202  ++ci2;
1203  if (!ci2) { // location has exactly 2 intervals
// Plus strand: first interval must end at the last position, second must start at 0.
1204  if (ci0.GetStrand() == eNa_strand_plus && ci1.GetStrand() == eNa_strand_plus) {
1205  if (ci0.GetRange().GetTo() == len - 1 && ci1.GetRange().GetFrom() == 0) {
1206  bad = false;
1207  }
1208  }
// Minus strand: the same test with the two intervals swapped.
1209  else if (ci0.GetStrand() == eNa_strand_minus && ci1.GetStrand() == eNa_strand_minus) {
1210  if (ci1.GetRange().GetTo() == len - 1 && ci0.GetRange().GetFrom() == 0) {
1211  bad = false;
1212  }
1213  }
1214  }
1215  }
1216  }
1217  }
1218  }
// `bad` selects both the sub-item label and its severity (error when bad, warning otherwise).
1219  m_Objs["[n] coding region[s] with joined location[s] [has] no exception[s]"][bad ? "[n] coding region[s] not over the origin of circular DNA" : "[n] coding region[s] over the origin of circular DNA"].Severity(bad ? CReportItem::eSeverity_error : CReportItem::eSeverity_warning).Add(*context.SeqFeatObjRef(feat));
1220  }
1221  }
1222  }
1223 }
1224 
1225 
1226 DISCREPANCY_SUMMARIZE(BACTERIAL_JOINED_FEATURES_NO_EXCEPTION)
1227 {
1228  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1229 }
1230 
1231 
1232 // RIBOSOMAL_SLIPPAGE
1233 
// Runs only when a biosource is present and is neither eukaryotic nor an organelle.
// Reports (as fatal) coding regions with a joined (mix / packed-int) location whose
// exception text contains "ribosomal slippage", unless the product-based test on the
// missing line below exempts them.
1234 DISCREPANCY_CASE(RIBOSOMAL_SLIPPAGE, FEAT, eDisc | eSmart | eFatal, " Only a select number of proteins undergo programmed frameshifts due to ribosomal slippage")
1235 {
1236  const CSeqdesc* biosrc = context.GetBiosource();
1237  if (biosrc && !context.IsEukaryotic(&biosrc->GetSource()) && !context.IsOrganelle(&biosrc->GetSource())) {
1238  for (const CSeq_feat& feat : context.GetFeat()) {
1239  if (feat.IsSetLocation() && feat.CanGetData() && feat.GetData().IsCdregion() && feat.IsSetExcept_text() && (feat.GetLocation().IsMix() || feat.GetLocation().IsPacked_int())) {
1240  if (feat.GetExcept_text().find("ribosomal slippage") != string::npos) {
1241  //string product = GetProductForCDS(feat, context.GetScope()); // sema: may need to change when we start using CFeatTree
1242  string product = context.GetProdForFeature(feat);
// NOTE(review): listing line 1243 (the condition that opens this block, presumably a test
// on `product`) is missing from this excerpt; features satisfying it are skipped.
1244  continue; // note: used to be "return" but that seems wrong
1245  }
1246  m_Objs["[n] coding region[s] [has] unexpected ribosomal slippage"].Fatal().Add(*context.SeqFeatObjRef(feat));
1247  }
1248  }
1249  }
1250  }
1251 }
1252 
1253 
1254 DISCREPANCY_SUMMARIZE(RIBOSOMAL_SLIPPAGE)
1255 {
1256  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1257 }
1258 
1259 
1260 // SHORT_INTRON
1261 
1262 const string kShortIntronTop = "[n] intron[s] [is] shorter than 10 nt";
1263 const string kShortIntronExcept = "[n] intron[s] [is] shorter than 11 nt and [has] an exception";
1264 
1265 DISCREPANCY_CASE(SHORT_INTRON, FEAT, eDisc | eOncaller | eSubmitter | eSmart, "Introns shorter than 10 nt")
1266 {
1267  for (const CSeq_feat& feat : context.GetFeat()) {
1268  if (feat.IsSetData() && feat.GetData().IsCdregion() && feat.IsSetLocation() && !feat.IsSetExcept() && !context.IsPseudo(feat)) {
1269  CSeq_loc_CI li(feat.GetLocation());
1270  if (li) {
1271  bool found_short = false;
1272  TSeqPos last_start = li.GetRange().GetFrom();
1273  TSeqPos last_stop = li.GetRange().GetTo();
1274  ++li;
1275  while (li && !found_short) {
1276  TSeqPos start = li.GetRange().GetFrom();
1277  TSeqPos stop = li.GetRange().GetTo();
1278  if (start >= last_stop && start - last_stop < 11) {
1279  found_short = true;
1280  }
1281  else if (last_stop >= start && last_stop - start < 11) {
1282  found_short = true;
1283  }
1284  else if (stop >= last_start && stop - last_start < 11) {
1285  found_short = true;
1286  }
1287  else if (last_start >= stop && last_start - stop < 11) {
1288  found_short = true;
1289  }
1290  last_start = start;
1291  last_stop = stop;
1292  ++li;
1293  }
1294  if (found_short) {
1295  //if (obj.IsSetExcept() && obj.GetExcept()) {
1296  // m_Objs[kShortIntronTop][kShortIntronExcept].Ext().Add(*context.DiscrObj(obj, true));
1297  //}
1298  m_Objs[kShortIntronTop].Add(*context.SeqFeatObjRef(feat, CDiscrepancyContext::eFixSet));
1299  }
1300  }
1301  }
1302  }
1303 }
1304 
1305 
// NOTE(review): the header line (listing line 1306) is missing from this excerpt; given the
// SHORT_INTRON case directly above, this is presumably its DISCREPANCY_SUMMARIZE body - confirm.
// Stores the sub-items of the exported m_Objs report tree in m_ReportItems.
1307 {
1308  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1309 }
1310 
1311 
1312 static const string kPutativeFrameShift = "putative frameshift";
1313 
1314 static void AddException(const CSeq_feat& sf, CScope& scope, const string& exception_text)
1315 {
1316  CRef<CSeq_feat> new_feat(new CSeq_feat());
1317  new_feat->Assign(sf);
1318  if (new_feat->IsSetExcept_text() && !NStr::IsBlank(new_feat->GetExcept_text())) {
1319  new_feat->SetExcept_text(new_feat->GetExcept_text() + "; " + exception_text);
1320  } else {
1321  new_feat->SetExcept_text(exception_text);
1322  }
1323  new_feat->SetExcept(true);
1324  CSeq_feat_EditHandle feh(scope.GetSeq_featHandle(sf));
1325  feh.Replace(*new_feat);
1326 }
1327 
1328 
// NOTE(review): the function header (listing line 1329) is missing from this excerpt; the body
// uses `sf`, `gene` and `is_bacterial`, and is presumably the
// AdjustBacterialGeneForCodingRegionWithShortIntron(sf_edit, *gene_edit, is_bacterial)
// called elsewhere in this file - confirm.
// Moves or merges the feature comment into the gene comment, then makes a second pass
// when the gene comment does not contain kPutativeFrameShift.
1330 {
// Step 1: the feature has a non-blank comment.
1331  if (sf.IsSetComment() && !NStr::IsBlank(sf.GetComment())) {
1332  if (gene.IsSetComment() && !NStr::IsBlank(gene.GetComment())) {
// Both have comments: the feature comment is prepended to the gene comment, separated by ';'.
1333  gene.SetComment(sf.GetComment() + ';' + gene.GetComment());
1334  }
1335  else {
// Gene comment unset or blank: move the feature comment to the gene.
1336  gene.ResetComment();
1337 
1338  if (sf.IsSetComment()) {
1339  gene.SetComment(sf.GetComment());
1340  sf.ResetComment();
1341  }
1342 
1343  if (is_bacterial) {
1344  sf.SetComment("contains short intron that may represent a frameshift");
1345  }
1346  }
1347  }
1348 
// Step 2: the gene comment is unset or does not contain kPutativeFrameShift.
1349  if (!gene.IsSetComment() || NStr::Find(gene.GetComment(), kPutativeFrameShift) == NPOS) {
1350  if (gene.IsSetComment() && !NStr::IsBlank(gene.GetComment())) {
1351  gene.SetComment(kPutativeFrameShift + ';' + gene.GetComment());
1352  }
1353  else {
// NOTE(review): this branch repeats the else-branch of step 1 verbatim and never writes
// kPutativeFrameShift into the gene comment - looks like duplicated code; confirm the intent.
1354  gene.ResetComment();
1355 
1356  if (sf.IsSetComment()) {
1357  gene.SetComment(sf.GetComment());
1358  sf.ResetComment();
1359  }
1360 
1361  if (is_bacterial) {
1362  sf.SetComment("contains short intron that may represent a frameshift");
1363  }
1364  }
1365  }
1366 }
1367 
1368 
1369 static void ConvertToMiscFeature(CSeq_feat& sf, CScope& scope)
1370 {
1371  if (sf.IsSetData()) {
1372 
1373  if (sf.GetData().IsCdregion() || sf.GetData().IsRna()) {
1374 
1375  string prod_name;
1376  if (sf.GetData().IsCdregion()) {
1377  prod_name = GetProductName(sf, scope);
1378  sf.ResetProduct();
1379  }
1380  else {
1381  prod_name = sf.GetData().GetRna().GetRnaProductName();
1382  }
1383 
1384  if (!NStr::IsBlank(prod_name)) {
1385  if (sf.IsSetComment()) {
1386  sf.SetComment(prod_name + ';' + sf.GetComment());
1387  }
1388  else {
1389  sf.SetComment(prod_name);
1390  }
1391  }
1392 
1393  sf.ResetData();
1394  sf.SetData().SetImp().SetKey("misc_feature");
1395  }
1396  }
1397 }
1398 
1399 
// Applies the SHORT_INTRON fix to `sf`. Returns true when something was changed.
//  - Mitochondrial source: nothing is done.
//  - Lineage matching "Bacteria" or "Archea": when a gene is found it is marked pseudo, its
//    comment is adjusted and its location merged; the feature's product location is queued in
//    `to_remove`; the feature is then converted to misc_feature (bacterial) or removed (otherwise).
//  - Otherwise: a "low-quality sequence region" exception is added unless the exception
//    text already contains it.
1400 static bool AddExceptionsToShortIntron(const CSeq_feat& sf, CScope& scope, std::list<CConstRef<CSeq_loc>>& to_remove)
1401 {
1402  bool rval = false;
1403  const CBioSource* source = nullptr;
1404  {
// Look up the source descriptor on the bioseq that carries the feature location.
1405  auto bsh = scope.GetBioseqHandle(sf.GetLocation());
1406  CSeqdesc_CI src(bsh, CSeqdesc::e_Source);
1407  if (src) {
1408  source = &src->GetSource();
1409  }
1410  }
1411  if (source) {
1412  if (source->IsSetGenome() && source->GetGenome() == CBioSource::eGenome_mitochondrion) {
1413  return false;
1414  }
1415  bool is_bacterial = CDiscrepancyContext::HasLineage(*source, "", "Bacteria");
// NOTE(review): the lineage string below is spelled "Archea"; the taxonomic name is usually
// spelled "Archaea" - confirm this matches the lineage text being searched.
1416  if (is_bacterial || CDiscrepancyContext::HasLineage(*source, "", "Archea")) {
// NOTE(review): listing line 1417 is missing from this excerpt; presumably it declares `gene`
// (used below via NotEmpty()/GetPointer()) for this feature - confirm.
1418  if (gene.NotEmpty()) {
// The const objects are edited in place through const_cast.
1419  CSeq_feat* gene_edit = const_cast<CSeq_feat*>(gene.GetPointer());
1420  CSeq_feat& sf_edit = const_cast<CSeq_feat&>(sf);
1421  rval = true;
1422  gene_edit->SetPseudo(true);
1423  AdjustBacterialGeneForCodingRegionWithShortIntron(sf_edit, *gene_edit, is_bacterial);
1424  // Merge gene's location
1425  if (gene_edit->IsSetLocation()) {
1426  CRef<CSeq_loc> new_loc = gene_edit->SetLocation().Merge(CSeq_loc::fMerge_All, nullptr);
1427  if (new_loc.NotEmpty()) {
1428  gene_edit->SetLocation().Assign(*new_loc);
1429  }
1430  }
// Queue the product location for the caller (the list is an output parameter).
1431  if (sf.IsSetProduct()) {
1432  to_remove.push_back(CConstRef<CSeq_loc>(&sf.GetProduct()));
1433  }
1434  if (is_bacterial) {
1435  ConvertToMiscFeature(sf_edit, scope);
1436  }
1437  else {
1438  CSeq_feat_EditHandle sf_handle(scope.GetSeq_featHandle(sf));
1439  sf_handle.Remove();
1440  }
1441  }
// For these lineages the exception below is never added, whether or not a gene was found.
1442  return rval;
1443  }
1444  }
1445  if (!sf.IsSetExcept_text() || NStr::Find(sf.GetExcept_text(), "low-quality sequence region") == string::npos) {
1446  AddException(sf, scope, "low-quality sequence region");
1447  rval = true;
1448  }
1449  return rval;
1450 }
1451 
1452 
1453 DISCREPANCY_AUTOFIX(SHORT_INTRON)
1454 {
1455  unsigned int n = 0;
1456  const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
1457  std::list<CConstRef<CSeq_loc>> to_remove;
1458  if (AddExceptionsToShortIntron(*sf, context.GetScope(), to_remove)) {
1459  n++;
1460  }
1461  for (auto& loc : to_remove) {
1462  CBioseq_Handle bioseq_h = context.GetScope().GetBioseqHandle(*loc);
1463  CBioseq_EditHandle bioseq_edit = bioseq_h.GetEditHandle();
1464  bioseq_edit.Remove();
1465  }
1466  obj->SetFixed();
1467  return CRef<CAutofixReport>(n ? new CAutofixReport("SHORT_INTRON: Set exception for [n] feature[s]", n) : nullptr);
1468 }
1469 
1470 
1471 // UNNECESSARY_VIRUS_GENE
1472 
1473 DISCREPANCY_CASE(UNNECESSARY_VIRUS_GENE, FEAT, eOncaller, "Unnecessary gene features on virus: on when lineage is not Picornaviridae,Potyviridae,Flaviviridae and Togaviridae")
1474 {
1475  const CSeqdesc* biosrc = context.GetBiosource();
1476  if (biosrc) {
1477  const CBioSource* src = &biosrc->GetSource();
1478  if (context.HasLineage(src, "Picornaviridae") || context.HasLineage(src, "Potyviridae") || context.HasLineage(src, "Flaviviridae") || context.HasLineage(src, "Togaviridae")) {
1479  for (const CSeq_feat& feat : context.GetFeat()) {
1480  if (feat.IsSetData() && feat.GetData().IsGene()) {
1481  m_Objs["[n] virus gene[s] need to be removed"].Add(*context.SeqFeatObjRef(feat));
1482  }
1483  }
1484  }
1485  }
1486 }
1487 
1488 
1489 DISCREPANCY_SUMMARIZE(UNNECESSARY_VIRUS_GENE)
1490 {
1491  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1492 }
1493 
1494 
1495 // CDS_HAS_CDD_XREF
1496 
1497 DISCREPANCY_CASE(CDS_HAS_CDD_XREF, FEAT, eDisc | eOncaller, "CDS has CDD Xref")
1498 {
1499  for (const CSeq_feat& feat : context.GetFeat()) {
1500  if (feat.IsSetData() && feat.GetData().IsCdregion() && feat.IsSetDbxref()) {
1501  for (auto& x : feat.GetDbxref()) {
1502  if (x->IsSetDb() && NStr::EqualNocase(x->GetDb(), "CDD")) {
1503  m_Objs["[n] feature[s] [has] CDD Xrefs"].Add(*context.SeqFeatObjRef(feat));
1504  break;
1505  }
1506  }
1507  }
1508  }
1509 }
1510 
1511 
1512 DISCREPANCY_SUMMARIZE(CDS_HAS_CDD_XREF)
1513 {
1514  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1515 }
1516 
1517 
1518 // SHOW_TRANSL_EXCEPT
1519 
1520 DISCREPANCY_CASE(SHOW_TRANSL_EXCEPT, FEAT, eDisc | eSubmitter | eSmart, "Show translation exception")
1521 {
1522  for (const CSeq_feat& feat : context.GetFeat()) {
1523  if (feat.IsSetData() && feat.GetData().IsCdregion() && feat.GetData().GetCdregion().IsSetCode_break()) {
1524  m_Objs["[n] coding region[s] [has] a translation exception"].Add(*context.SeqFeatObjRef(feat));
1525  }
1526  }
1527 }
1528 
1529 
1530 DISCREPANCY_SUMMARIZE(SHOW_TRANSL_EXCEPT)
1531 {
1532  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1533 }
1534 
1535 
1536 // NO_PRODUCT_STRING
1537 
1538 static const string kNoProductStr = "[n] product[s] [has] \"no product string in file\"";
1539 
1540 DISCREPANCY_CASE(NO_PRODUCT_STRING, FEAT, eDisc, "Product has string \"no product string in file\"")
1541 {
1542  for (const CSeq_feat& feat : context.GetFeat()) {
1543  if (feat.IsSetData() && feat.GetData().IsProt()) {
1544  const CProt_ref& prot = feat.GetData().GetProt();
1545  if (prot.IsSetName()) {
1546  const string* no_prot_str = NStr::FindNoCase(prot.GetName(), "no product string in file");
1547  if (no_prot_str) {
1548  const CSeq_feat* product = sequence::GetCDSForProduct(context.CurrentBioseq(), &context.GetScope());
1549  if (product) {
1550  m_Objs[kNoProductStr].Add(*context.SeqFeatObjRef(*product), false);
1551  }
1552  }
1553  }
1554  }
1555  }
1556 }
1557 
1558 
1559 DISCREPANCY_SUMMARIZE(NO_PRODUCT_STRING)
1560 {
1561  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1562 }
1563 
1564 
1565 // UNWANTED_SPACER
1566 
// Intergenic spacer names that UNWANTED_SPACER searches for (case-insensitively)
// in misc_feature comments. Three gene pairs are listed, each in both orders.
1567 static const string kIntergenicSpacerNames[] = {
1568  "trnL-trnF intergenic spacer",
1569  "trnH-psbA intergenic spacer",
1570  "trnS-trnG intergenic spacer",
1571  "trnF-trnL intergenic spacer",
1572  "psbA-trnH intergenic spacer",
1573  "trnG-trnS intergenic spacer" };
1574 
1576 
1577 
1578 DISCREPANCY_CASE(UNWANTED_SPACER, FEAT, eOncaller, "Intergenic spacer without plastid location")
1579 {
1580  const CSeqdesc* biosrc = context.GetBiosource();
1581  if (biosrc && biosrc->GetSource().IsSetGenome() && (biosrc->GetSource().GetGenome() == CBioSource::eGenome_chloroplast || biosrc->GetSource().GetGenome() == CBioSource::eGenome_plastid)) {
1582  return;
1583  }
1584  if (biosrc && biosrc->GetSource().IsSetOrg() && biosrc->GetSource().GetOrg().IsSetTaxname() && CDiscrepancyContext::IsUnculturedNonOrganelleName(biosrc->GetSource().GetOrg().GetTaxname())) {
1585  return;
1586  }
1587  for (const CSeq_feat& feat : context.GetFeat()) {
1588  if (feat.IsSetData() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_misc_feature) {
1589  for (size_t i = 0; i < kIntergenicSpacerNames_len; i++) {
1590  if (feat.IsSetComment() && NStr::FindNoCase(feat.GetComment(), kIntergenicSpacerNames[i]) != NPOS) {
1591  m_Objs["[n] suspect intergenic spacer note[s] not organelle"].Add(*context.SeqFeatObjRef(feat));
1592  break;
1593  }
1594  }
1595  }
1596  }
1597 }
1598 
1599 
1600 // CHECK_RNA_PRODUCTS_AND_COMMENTS
1601 
1602 DISCREPANCY_CASE(CHECK_RNA_PRODUCTS_AND_COMMENTS, FEAT, eOncaller, "Check for gene or genes in rRNA and tRNA products and comments")
1603 {
1604  for (const CSeq_feat& feat : context.GetFeat()) {
1605  if (feat.IsSetData() && feat.GetData().IsRna()) {
1606  const CRNA_ref& rna = feat.GetData().GetRna();
1607  if ((rna.IsSetType() && rna.GetType() == CRNA_ref::eType_rRNA) || rna.GetType() == CRNA_ref::eType_tRNA) {
1608  string product = rna.GetRnaProductName();
1609  string comment;
1610  if (feat.IsSetComment()) {
1611  comment = feat.GetComment();
1612  }
1613  if (NStr::FindNoCase(product, "gene") != NPOS || NStr::FindNoCase(comment, "gene") != NPOS) {
1614  m_Objs["[n] RNA product_name or comment[s] contain[S] 'suspect phrase'"].Add(*context.SeqFeatObjRef(feat));
1615  }
1616  }
1617  }
1618  }
1619 }
1620 
1621 
1622 DISCREPANCY_SUMMARIZE(CHECK_RNA_PRODUCTS_AND_COMMENTS)
1623 {
1624  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1625 }
1626 
1627 
1628 // FEATURE_LOCATION_CONFLICT
1629 
// Report labels for FEATURE_LOCATION_CONFLICT: the top-level key, and the sub-item label
// prefixes for coding regions and RNA features (a generated sub-item id is appended on use).
1630 const string kFeatureLocationConflictTop = "[n] feature[s] [has] inconsistent gene location[s].";
1631 const string kFeatureLocationCodingRegion = "Coding region location does not match gene location";
1632 const string kFeatureLocationRNA = "RNA feature location does not match gene location";
1633 
1634 bool IsMixedStrand(const CSeq_loc& loc)
1635 {
1636  CSeq_loc_CI li(loc);
1637  if (!li) {
1638  return false;
1639  }
1640  ENa_strand first_strand = li.GetStrand();
1641  if (first_strand == eNa_strand_unknown) {
1642  first_strand = eNa_strand_plus;
1643  }
1644  ++li;
1645  while (li) {
1646  ENa_strand this_strand = li.GetStrand();
1647  if (this_strand == eNa_strand_unknown) {
1648  this_strand = eNa_strand_plus;
1649  }
1650  if (this_strand != first_strand) {
1651  return true;
1652  }
1653  ++li;
1654  }
1655  return false;
1656 }
1657 
1658 
// Walks the intervals of a feature location and a gene location in parallel and returns
// false on the first disagreement. Both iterators must run out together for the locations
// to be accepted.
1659 bool IsMixedStrandGeneLocationOk(const CSeq_loc& feat_loc, const CSeq_loc& gene_loc)
1660 {
1661  CSeq_loc_CI feat_i(feat_loc);
1662  CSeq_loc_CI gene_i(gene_loc);
1663 
1664  while (feat_i && gene_i) {
1665  ENa_strand gene_strand = gene_i.GetStrand();
// NOTE(review): listing line 1667 (the second half of this condition) is missing from this
// excerpt; only the strand test is visible here.
1666  if (!StrandsMatch(feat_i.GetStrand(), gene_strand) ||
1668  return false;
1669  }
// Advance through the feature intervals that match this gene interval's strand until the
// (missing) stop test succeeds.
1670  bool found_stop = false;
1671  while (!found_stop && feat_i && StrandsMatch(feat_i.GetStrand(), gene_strand)) {
// NOTE(review): listing line 1672 (the condition that opens this block) is missing from this excerpt.
1673  found_stop = true;
1674  }
1675  ++feat_i;
1676  }
1677  if (!found_stop) {
1678  return false;
1679  }
1680  ++gene_i;
1681  }
// One location still has intervals while the other is exhausted: mismatch.
1682  if ((feat_i && !gene_i) || (!feat_i && gene_i)) {
1683  return false;
1684  }
1685 
1686  return true;
1687 }
1688 
1689 
1690 bool StopAbutsGap(const CSeq_loc& loc, ENa_strand strand, CScope& scope)
1691 {
1692  if (CBioseq_Handle bsh = scope.GetBioseqHandle(loc); bsh) {
1693  TSeqPos stop = loc.GetStop(eExtreme_Biological);
1694  if (stop < 1 || stop > bsh.GetBioseqLength() - 2) {
1695  return false;
1696  }
1697  CRef<CSeq_loc> search(new CSeq_loc());
1698  search->SetInt().SetId().Assign(*(loc.GetId()));
1699  if (strand == eNa_strand_minus) {
1700  search->SetInt().SetFrom(stop - 1);
1701  search->SetInt().SetTo(stop - 1);
1702  search->SetInt().SetStrand(eNa_strand_minus);
1703  } else {
1704  search->SetInt().SetFrom(stop + 1);
1705  search->SetInt().SetTo(stop + 1);
1706  }
1707  CSeqVector vec(*search, scope);
1708  if (vec.size() && vec.IsInGap(0)) {
1709  return true;
1710  }
1711  }
1712  return false;
1713 }
1714 
1715 
1716 bool StartAbutsGap(const CSeq_loc& loc, ENa_strand strand, CScope& scope)
1717 {
1718  if (auto bsh = scope.GetBioseqHandle(loc); bsh) {
1719  TSeqPos start = loc.GetStart(eExtreme_Biological);
1720  if (start < 1 || start > bsh.GetBioseqLength() - 2) {
1721  return false;
1722  }
1723  CRef<CSeq_loc> search(new CSeq_loc());
1724  search->SetInt().SetId().Assign(*(loc.GetId()));
1725  if (strand == eNa_strand_minus) {
1726  search->SetInt().SetFrom(start + 1);
1727  search->SetInt().SetTo(start + 1);
1728  search->SetInt().SetStrand(eNa_strand_minus);
1729  } else {
1730  search->SetInt().SetFrom(start - 1);
1731  search->SetInt().SetTo(start - 1);
1732  }
1733  CSeqVector vec(*search, scope);
1734  if (vec.IsInGap(0)) {
1735  return true;
1736  }
1737  }
1738  return false;
1739 }
1740 
1741 
1742 // location is ok if:
1743 // 1. endpoints match exactly, or
1744 // 2. non-matching 5' endpoint can be extended by an RBS feature to match gene start, or
1745 // 3. if coding region non-matching endpoints are partial and abut a gap
1746 bool IsGeneLocationOk(const CSeq_loc& feat_loc, const CSeq_loc& gene_loc, ENa_strand feat_strand, ENa_strand gene_strand, bool is_coding_region, CScope& scope, const vector<const CSeq_feat*>& features)
1747 {
1748  if (IsMixedStrand(feat_loc) || IsMixedStrand(gene_loc)) {
1749  // special handling for trans-spliced
1750  return IsMixedStrandGeneLocationOk(feat_loc, gene_loc);
1751  } else if (!StrandsMatch(feat_strand, gene_strand)) {
1752  return false;
1753  } else if (gene_loc.GetStop(eExtreme_Biological) != feat_loc.GetStop(eExtreme_Biological)) {
1754  if (is_coding_region && feat_loc.IsPartialStop(eExtreme_Biological) && StopAbutsGap(feat_loc, feat_strand, scope)) {
1755  // ignore for now
1756  } else {
1757  return false;
1758  }
1759  }
1760  TSeqPos gene_start = gene_loc.GetStart(eExtreme_Biological);
1761  TSeqPos feat_start = feat_loc.GetStart(eExtreme_Biological);
1762 
1763  if (gene_start == feat_start) {
1764  return true;
1765  }
1766 
1767  CRef<CSeq_loc> rbs_search(new CSeq_loc());
1768  const CSeq_id* id = gene_loc.GetId();
1769  if (!id) {
1770  return false;
1771  }
1772  rbs_search->SetInt().SetId().Assign(*id);
1773  if (feat_loc.GetStrand() == eNa_strand_minus) {
1774  if (gene_start < feat_start) {
1775  return false;
1776  }
1777  rbs_search->SetInt().SetFrom(feat_start + 1);
1778  rbs_search->SetInt().SetTo(gene_start);
1779  rbs_search->SetStrand(eNa_strand_minus);
1780  } else {
1781  if (gene_start > feat_start) {
1782  return false;
1783  }
1784  rbs_search->SetInt().SetFrom(gene_start);
1785  rbs_search->SetInt().SetTo(feat_start - 1);
1786  }
1787  TSeqPos rbs_start = rbs_search->GetStart(eExtreme_Biological);
1788  for (const CSeq_feat* feat : features) {
1789  if (feat->GetLocation().GetStart(eExtreme_Biological) == rbs_start && IsRBS(*feat)) {
1790  return true;
1791  }
1792  }
1793  if (is_coding_region && feat_loc.IsPartialStart(eExtreme_Biological) && StartAbutsGap(feat_loc, feat_strand, scope)) {
1794  // check to see if 5' end is partial and abuts gap
1795  return true;
1796  }
1797  return false;
1798 }
1799 
1800 
1801 bool GeneRefMatch(const CGene_ref& g1, const CGene_ref& g2)
1802 {
1803  return g1.IsSetLocus() == g2.IsSetLocus() && (!g1.IsSetLocus() || g1.GetLocus() == g2.GetLocus())
1804  && g1.IsSetLocus_tag() == g2.IsSetLocus_tag() && (!g1.IsSetLocus_tag() || g1.GetLocus_tag() == g2.GetLocus_tag())
1805  && g1.IsSetAllele() == g2.IsSetAllele() && (!g1.IsSetAllele() || g1.GetAllele() == g2.GetAllele())
1806  && g1.IsSetDesc() == g2.IsSetDesc() && (!g1.IsSetDesc() || g1.GetDesc() == g2.GetDesc())
1807  && g1.IsSetMaploc() == g2.IsSetMaploc() && (!g1.IsSetMaploc() || g1.GetMaploc() == g2.GetMaploc())
1808  && g1.IsSetPseudo() == g2.IsSetPseudo()
1809  ;
1810 }
1811 
1812 
1813 static string GetNextSubitemId(size_t num)
1814 {
1815  string ret = "[*";
1816  ret += NStr::SizetToString(num);
1817  ret += "*]";
1818  return ret;
1819 }
1820 
1821 
1822 DISCREPANCY_CASE(FEATURE_LOCATION_CONFLICT, SEQUENCE, eDisc | eSubmitter | eSmart, "Feature Location Conflict")
1823 {
1824  if (context.InGenProdSet()) {
1825  return;
1826  }
1827  const CSeqdesc* biosrc = context.GetBiosource();
1828  bool eukaryotic = context.IsEukaryotic(biosrc ? &biosrc->GetSource() : nullptr);
1829  const auto& all = context.FeatAll();
1830  for (const CSeq_feat* feat : all) {
1831  if (feat->IsSetData() && feat->IsSetLocation() && (feat->GetData().IsRna() || (!eukaryotic && feat->GetData().IsCdregion()))) {
1832  ENa_strand feat_strand = feat->GetLocation().GetStrand();
1833  const CGene_ref* gx = feat->GetGeneXref();
1834  const CSeq_feat* gene = context.GetGeneForFeature(*feat);
1835  if (!gene || (gx && !gx->IsSuppressed() && !GeneRefMatch(*gx, gene->GetData().GetGene()))) {
1836  if (feat->GetGeneXref()) {
1837  string subitem_id = GetNextSubitemId(m_Objs[kFeatureLocationConflictTop].GetMap().size());
1838  if (feat->GetData().IsCdregion()) {
1839  m_Objs[kFeatureLocationConflictTop]["Coding region xref gene does not exist" + subitem_id].Ext().Add(*context.SeqFeatObjRef(*feat), false);
1840  }
1841  else {
1842  m_Objs[kFeatureLocationConflictTop]["RNA feature xref gene does not exist" + subitem_id].Ext().Add(*context.SeqFeatObjRef(*feat), false);
1843  }
1844  m_Objs[kFeatureLocationConflictTop].Incr();
1845  }
1846  }
1847  else if (gene->IsSetLocation()) {
1848  ENa_strand gene_strand = gene->GetLocation().GetStrand();
1849  if (!IsGeneLocationOk(feat->GetLocation(), gene->GetLocation(), feat_strand, gene_strand, feat->GetData().IsCdregion(), context.GetScope(), all)) {
1850  string subitem_id = GetNextSubitemId(m_Objs[kFeatureLocationConflictTop].GetMap().size());
1851  if (feat->GetData().IsCdregion()) {
1852  m_Objs[kFeatureLocationConflictTop][kFeatureLocationCodingRegion + subitem_id].Ext().Add(*context.SeqFeatObjRef(*feat), false).Add(*context.SeqFeatObjRef(*gene), false);
1853  }
1854  else {
1855  m_Objs[kFeatureLocationConflictTop][kFeatureLocationRNA + subitem_id].Ext().Add(*context.SeqFeatObjRef(*feat), false).Add(*context.SeqFeatObjRef(*gene), false);
1856  }
1857  m_Objs[kFeatureLocationConflictTop].Incr();
1858  }
1859  }
1860  }
1861  }
1862 }
1863 
1864 
1865 DISCREPANCY_SUMMARIZE(FEATURE_LOCATION_CONFLICT)
1866 {
1867  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1868 }
1869 
1870 
1871 // SUSPECT_PHRASES
1872 
// Phrases iterated by the SUSPECT_PHRASES test; each one is quoted in the
// per-phrase report label.
const string suspect_phrases[] = {
    "fragment", "frameshift",
    "%",
    "E-value", "E value", "Evalue",
    "..."
};
1883 
1884 
// SUSPECT_PHRASES: for every feature with data, picks the text to inspect
// (the feature comment for a coding region, or the protein description for a
// protein feature) and, when that text is non-empty, walks suspect_phrases.
// A feature is recorded under a per-phrase subitem at most once: the loop
// stops after the first phrase that gets recorded.
1885 DISCREPANCY_CASE(SUSPECT_PHRASES, FEAT, eDisc | eSubmitter | eSmart, "Suspect Phrases")
1886 {
1887  for (const CSeq_feat& feat : context.GetFeat()) {
1888  if (feat.IsSetData()) {
1889  string check;
1890  if (feat.GetData().IsCdregion() && feat.IsSetComment()) {
1891  check = feat.GetComment();
1892  }
1893  else if (feat.GetData().IsProt() && feat.GetData().GetProt().IsSetDesc()) {
1894  check = feat.GetData().GetProt().GetDesc();
1895  }
1896  if (!check.empty()) {
1897  for (size_t i = 0; i < ArraySize(suspect_phrases); i++) {
// NOTE(review): listing line 1898 is not present here (numbering jumps from
// 1897 to 1899) and one opening brace is unaccounted for. Presumably it is
// the condition testing whether `check` contains suspect_phrases[i];
// confirm against the repository before editing this block.
1899  m_Objs["[n] cds comment[s] or protein description[s] contain[S] suspect_phrase[s]"]["[n] cds comment[s] or protein description[s] contain[S] '" + suspect_phrases[i] + "'"].Summ().Add(*context.SeqFeatObjRef(feat));
1900  break;
1901  }
1902  }
1903  }
1904  }
1905  }
1906 }
1907 
1908 
1909 DISCREPANCY_SUMMARIZE(SUSPECT_PHRASES)
1910 {
1911  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1912 }
1913 
1914 
1915 // UNUSUAL_MISC_RNA
1916 
1917 DISCREPANCY_CASE(UNUSUAL_MISC_RNA, FEAT, eDisc | eSubmitter | eSmart, "Unexpected misc_RNA features")
1918 {
1919  for (const CSeq_feat& feat : context.GetFeat()) {
1920  if (feat.IsSetData() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_otherRNA) {
1921  const CRNA_ref& rna = feat.GetData().GetRna();
1922  string product = rna.GetRnaProductName();
1923  if (NStr::FindCase(product, "ITS", 0) == NPOS && NStr::FindCase(product, "internal transcribed spacer", 0) == NPOS) {
1924  m_Objs["[n] unexpected misc_RNA feature[s] found. misc_RNAs are unusual in a genome, consider using ncRNA, misc_binding, or misc_feature as appropriate"].Add(*context.SeqFeatObjRef(feat));
1925  }
1926  }
1927  }
1928 }
1929 
1930 
1931 // CDS_WITHOUT_MRNA
1932 
// Decides whether an mRNA product name and a CDS product name describe the
// same product.
//
// @param rna_product  product name of the mRNA
// @param cds_product  product name of the coding region
// @return true when both names are non-empty and either
//         - they are identical, or
//         - the mRNA name has the form "<base>, transcript variant <x>" and
//           the CDS name has the form "<base>, isoform <x>" with the same
//           <base> and the same <x>.
//
// The base names are compared prefix-to-prefix. The earlier implementation
// compared the mRNA prefix against the *entire* CDS name, which can never be
// equal because the CDS name still carries its ", isoform ..." suffix, so the
// variant/isoform case always returned false.
static bool IsProductMatch(const string& rna_product, const string& cds_product)
{
    if (rna_product.empty() || cds_product.empty()) {
        return false;
    }
    if (rna_product == cds_product) {
        return true;
    }
    const string kmRNAVariant = ", transcript variant ";
    const string kCDSVariant = ", isoform ";
    const size_t pos_in_rna = rna_product.find(kmRNAVariant);
    const size_t pos_in_cds = cds_product.find(kCDSVariant);
    if (pos_in_rna == string::npos || pos_in_cds == string::npos || pos_in_rna != pos_in_cds) {
        return false;
    }
    // Same base name in front of the variant / isoform marker?
    if (rna_product.compare(0, pos_in_rna, cds_product, 0, pos_in_cds) != 0) {
        return false;
    }
    // Same designation after the marker?
    return rna_product.compare(pos_in_rna + kmRNAVariant.size(), string::npos,
                               cds_product, pos_in_cds + kCDSVariant.size(), string::npos) == 0;
}
1952 
1953 
1954 DISCREPANCY_CASE(CDS_WITHOUT_MRNA, SEQUENCE, eDisc | eOncaller | eSmart, "Coding regions on eukaryotic genomic DNA should have mRNAs with matching products")
1955 {
1956  const CBioseq& bioseq = context.CurrentBioseq();
1957  const CSeqdesc* biosrc = context.GetBiosource();
1958  const CBioSource* src = biosrc ? &biosrc->GetSource() : nullptr;
1959  if (!context.IsEukaryotic(src) || context.IsOrganelle(src) || !bioseq.GetInst().IsSetMol() || bioseq.GetInst().GetMol() != CSeq_inst::eMol_dna) {
1960  return;
1961  }
1962 
1963  vector<const CSeq_feat*> cds = context.FeatCDS();
1964  vector<const CSeq_feat*> mrnas = context.FeatMRNAs();
1965  auto cds_it = cds.begin();
1966  while (cds_it != cds.end()) {
1967  if (context.IsPseudo(**cds_it)) {
1968  cds_it = cds.erase(cds_it);
1969  continue;
1970  }
1971  const CSeq_feat* mrna = nullptr;
1972  if ((*cds_it)->IsSetXref()) {
1973  auto rna_it = mrnas.cbegin();
1974  while (rna_it != mrnas.end()) {
1975  if ((*rna_it)->IsSetId()) {
1976  auto& rnaid = (*rna_it)->GetId();
1977  if (rnaid.IsLocal()) {
1978  for (auto xref : (*cds_it)->GetXref()) {
1979  if (xref->IsSetId()) {
1980  auto& id = xref->GetId();
1981  if (id.IsLocal()) {
1982  if (!id.GetLocal().Compare(rnaid.GetLocal())) {
1983  mrna = *rna_it;
1984  break;
1985  }
1986  }
1987  }
1988  }
1989  }
1990  if (mrna) {
1991  mrnas.erase(rna_it);
1992  break;
1993  }
1994  }
1995  ++rna_it;
1996  }
1997  }
1998  if (mrna) {
1999  string prod = context.GetProdForFeature(**cds_it);
2000  if (!IsProductMatch(prod, mrna->GetData().GetRna().GetRnaProductName())) {
2001  m_Objs["[n] coding region[s] [has] mismatching mRNA"].Add(*context.SeqFeatObjRef(**cds_it));
2002  }
2003  cds_it = cds.erase(cds_it);
2004  continue;
2005  }
2006  ++cds_it;
2007  }
2008 
2009  for (size_t i = 0; i < cds.size(); i++) {
2010  if (context.IsPseudo(*cds[i])) {
2011  continue;
2012  }
2013  bool found = false;
2014  string prod = context.GetProdForFeature(*cds[i]);
2015  const CSeq_loc& loc_i = cds[i]->GetLocation();
2016  for (size_t j = 0; j < mrnas.size(); j++) {
2017  const CSeq_loc& loc_j = mrnas[j]->GetLocation();
2018  sequence::ECompare compare = context.Compare(loc_j, loc_i);
2019  if (compare == sequence::eContains || compare == sequence::eSame) {
2020  if (IsProductMatch(prod, mrnas[j]->GetData().GetRna().GetRnaProductName())) {
2021  found = true;
2022  break;
2023  }
2024  }
2025  }
2026  if (!found) {
2027  m_Objs["[n] coding region[s] [does] not have an mRNA"].Add(*context.SeqFeatObjRef(*cds[i], CDiscrepancyContext::eFixSet));
2028  }
2029  }
2030 }
2031 
2032 
#if 0
// Compiled out. Builds an mRNA for the given CDS and either replaces the mRNA
// already found for that CDS or, when there is none, adds the new feature to
// the annotation that holds the CDS. Always returns true.
static bool AddmRNAForCDS(const CSeq_feat& cds, CScope& scope)
{
    CConstRef<CSeq_feat> old_mRNA = sequence::GetmRNAforCDS(cds, scope);
    CRef<CSeq_feat> new_mRNA = edit::MakemRNAforCDS(cds, scope);

    if (!old_mRNA.Empty()) {
        CSeq_feat_EditHandle old_mRNA_edit(scope.GetSeq_featHandle(*old_mRNA));
        old_mRNA_edit.Replace(*new_mRNA);
        return true;
    }

    CSeq_feat_EditHandle cds_edit_handle(scope.GetSeq_featHandle(cds));
    CSeq_annot_EditHandle annot_handle = cds_edit_handle.GetAnnot();
    annot_handle.AddFeat(*new_mRNA);
    return true;
}
#endif
2051 
2052 
2054 {
2057  for (; annot_ci; ++annot_ci) {
2058  if (annot_ci->IsFtable()) {
2059  ftable = *annot_ci;
2060  break;
2061  }
2062  }
2063  if (!ftable) {
2064  CBioseq_EditHandle eh = bsh.GetEditHandle();
2065  CRef<CSeq_annot> new_annot(new CSeq_annot());
2066  ftable = eh.AttachAnnot(*new_annot);
2067  }
2069  return aeh;
2070 }
2071 
2072 
2073 DISCREPANCY_AUTOFIX(CDS_WITHOUT_MRNA)
2074 {
2075  CScope& scope = context.GetScope();
2076  const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
2077  CConstRef<CSeq_feat> old_mRNA = sequence::GetmRNAforCDS(*sf, scope);
2078  CRef<CSeq_feat> new_mRNA = edit::MakemRNAforCDS(*sf, scope);
2079  if (old_mRNA.Empty()) {
2080  CBioseq_Handle bh = scope.GetBioseqHandle(new_mRNA->GetLocation());
2081  CSeq_annot_EditHandle annot_handle = GetAnnotHandle(bh);
2082  annot_handle.AddFeat(*new_mRNA);
2083  }
2084  else {
2085  CSeq_feat_EditHandle old_mRNA_edit(scope.GetSeq_featHandle(*old_mRNA));
2086  old_mRNA_edit.Replace(*new_mRNA);
2087  }
2088  obj->SetFixed();
2089  return CRef<CAutofixReport>(new CAutofixReport("CDS_WITHOUT_MRNA: Add mRNA for [n] CDS feature[s]", 1));
2090 }
2091 
2092 
2093 // PROTEIN_NAMES
2094 
2095 DISCREPANCY_CASE(PROTEIN_NAMES, FEAT, eDisc | eSubmitter | eSmart, "Frequently appearing proteins")
2096 {
2097  for (const CSeq_feat& feat : context.GetFeat()) {
2098  if (feat.IsSetData() && feat.GetData().IsProt()) {
2099  const CProt_ref& prot = feat.GetData().GetProt();
2100  if (prot.IsSetName() && !prot.GetName().empty()) {
2101  m_Objs[feat.GetData().GetProt().GetName().front()].Incr();
2102  }
2103  }
2104  }
2105 }
2106 
2107 
2109 {
2110  static const size_t MIN_REPORTABLE_AMOUNT = 100;
2111  auto& M = m_Objs.GetMap();
2112  if (M.size() == 1 && M.begin()->second->GetCount() >= MIN_REPORTABLE_AMOUNT) {
2113  CReportNode rep;
2114  rep["All proteins have same name [(]\"" + M.begin()->first + "\""];
2115  m_ReportItems = rep.Export(*this)->GetSubitems();
2116  }
2117 }
2118 
2119 
2120 // MRNA_SHOULD_HAVE_PROTEIN_TRANSCRIPT_IDS
2121 
2122 static bool IsmRnaQualsPresent(const CSeq_feat::TQual& quals)
2123 {
2124  bool protein_id = false,
2125  transcript_id = false;
2126 
2127  for (const auto& qual : quals) {
2128  if (qual->IsSetQual()) {
2129 
2130  if (qual->GetQual() == "orig_protein_id") {
2131  protein_id = true;
2132  }
2133 
2134  if (qual->GetQual() == "orig_transcript_id") {
2135  transcript_id = true;
2136  }
2137 
2138  if (protein_id && transcript_id) {
2139  break;
2140  }
2141  }
2142  }
2143 
2144  return protein_id && transcript_id;
2145 }
2146 
2147 
2148 DISCREPANCY_CASE(MRNA_SHOULD_HAVE_PROTEIN_TRANSCRIPT_IDS, SEQUENCE, eDisc | eSubmitter | eSmart | eFatal, "mRNA should have both protein_id and transcript_id")
2149 {
2150  const CBioseq& bioseq = context.CurrentBioseq();
2151  const CSeqdesc* biosrc = context.GetBiosource();
2152  if (biosrc && context.IsEukaryotic(&biosrc->GetSource()) && bioseq.GetInst().IsSetMol() && bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna) {
2153  for (const CSeq_feat& feat : context.GetAllFeat()) {
2154  if (feat.IsSetData() && feat.GetData().IsCdregion() && !context.IsPseudo(feat)) {
2155  CConstRef<CSeq_feat> mRNA = sequence::GetmRNAforCDS(feat, context.GetScope());
2156  if (mRNA && (!mRNA->IsSetQual() || !IsmRnaQualsPresent(mRNA->GetQual()))) {
2157  m_Objs.Add(*context.SeqFeatObjRef(feat)).Fatal();
2158  }
2159  }
2160  }
2161  }
2162 }
2163 
2164 
2165 DISCREPANCY_SUMMARIZE(MRNA_SHOULD_HAVE_PROTEIN_TRANSCRIPT_IDS)
2166 {
2167  if (!m_Objs.empty()) {
2168  CReportNode out;
2169  out["no protein_id and transcript_id present"];
2170  m_ReportItems = out.Export(*this)->GetSubitems();
2171  }
2172 }
2173 
2174 
2175 // FEATURE_LIST
2176 
2177 static const string kFeatureList = "Feature List";
2178 
2179 DISCREPANCY_CASE(FEATURE_LIST, FEAT, eDisc | eSubmitter, "Feature List")
2180 {
2181  for (const CSeq_feat& feat : context.GetFeat()) {
2182  if (feat.IsSetData() && feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_gap && feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_prot) {
2183  string subitem = "[n] " + feat.GetData().GetKey();
2184  subitem += " feature[s]";
2185  m_Objs[kFeatureList].Info()[subitem].Info().Add(*context.SeqFeatObjRef(feat));
2186  }
2187  }
2188 }
2189 
2190 
2191 // MULTIPLE_QUALS
2192 
2193 DISCREPANCY_CASE(MULTIPLE_QUALS, FEAT, eDisc | eOncaller, "Multiple qualifiers")
2194 {
2195  for (const CSeq_feat& feat : context.GetFeat()) {
2196  if (feat.IsSetQual()) {
2197  size_t num_of_number_quals = 0;
2198  for (const auto& qual : feat.GetQual()) {
2199  if (qual->IsSetQual() && qual->GetQual() == "number") {
2200  ++num_of_number_quals;
2201  if (num_of_number_quals > 1) {
2202  m_Objs["[n] feature[s] contain[S] multiple /number qualifiers"].Add(*context.SeqFeatObjRef(feat));
2203  break;
2204  }
2205  }
2206  }
2207  }
2208  }
2209 }
2210 
2211 
2212 // MISC_FEATURE_WITH_PRODUCT_QUAL
2213 
2214 DISCREPANCY_CASE(MISC_FEATURE_WITH_PRODUCT_QUAL, FEAT, eDisc | eOncaller | eSubmitter | eSmart, "Misc features containing a product qualifier")
2215 {
2216  for (const CSeq_feat& feat : context.GetFeat()) {
2217  if (feat.IsSetData() && feat.IsSetQual() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_misc_feature) {
2218  for (const auto& qual : feat.GetQual()) {
2219  if (qual->IsSetQual() && qual->GetQual() == "product") {
2220  m_Objs["[n] feature[s] [has] a product qualifier"].Add(*context.SeqFeatObjRef(feat));
2221  }
2222  }
2223  }
2224  }
2225 }
2226 
2227 
2228 // CDS_HAS_NO_ADJACENT_TRNA
2229 
// Report label used by the CDS_HAS_NO_ADJACENT_TRNA test.
const string kCDShasNoTRNA = "[n] coding region[s] [does] not have adjacent tRNA";
2231 
// Returns true when the amino-acid choice of a code break resolves to index
// 25 (STOP_CODON). The index is taken from whichever variant aa.Which()
// selects; any other variant leaves it at -1, which yields false.
// NOTE(review): the case labels and one statement following GetNcbieaa()
// are not present in this listing (numbering skips 2236, 2238, 2240, 2243).
// Presumably the ncbieaa value is converted to another index before the
// comparison; confirm against the repository.
2232 static bool IsStopCodon(const CCode_break::C_Aa& aa)
2233 {
2234  int aa_idx = -1;
2235  switch (aa.Which()) {
2237  aa_idx = aa.GetNcbieaa();
2239  break;
2241  aa_idx = aa.GetNcbi8aa();
2242  break;
2244  aa_idx = aa.GetNcbistdaa();
2245  break;
2246  default:
2247  break;
2248  }
2249  static const int STOP_CODON = 25;
2250  return aa_idx == STOP_CODON;
2251 }
2252 
2253 
// CDS_HAS_NO_ADJACENT_TRNA: only runs when a biosource exists and its genome
// is mitochondrion. For each coding region whose first code break is a stop
// codon (see IsStopCodon), finds the tRNA with the smallest `cur_diff` and
// reports the CDS together with that tRNA when both have the same strand and
// the distance is greater than 1.
2254 DISCREPANCY_CASE(CDS_HAS_NO_ADJACENT_TRNA, SEQUENCE, eDisc, "CDSs should have adjacent tRNA")
2255 {
2256  const CSeqdesc* biosrc = context.GetBiosource();
2257  if (!biosrc || biosrc->GetSource().GetGenome() != CBioSource::eGenome_mitochondrion) {
2258  return;
2259  }
2260  const auto& cds = context.FeatCDS();
2261  const auto& trnas = context.FeatTRNAs();
2262  for (size_t i = 0; i < cds.size(); i++) {
2263  if (!cds[i]->GetData().GetCdregion().IsSetCode_break()) {
2264  continue;
2265  }
// Only the first code break of the coding region is examined.
2266  const CCode_break& code_break = *cds[i]->GetData().GetCdregion().GetCode_break().front();
2267  if (!code_break.IsSetAa() || !IsStopCodon(code_break.GetAa())) {
2268  continue;
2269  }
2270  ENa_strand strand = cds[i]->GetLocation().IsSetStrand() ? cds[i]->GetLocation().GetStrand() : eNa_strand_unknown;
// NOTE(review): listing line 2271 is not present here; presumably it declares
// `stop` from the CDS location; confirm against the repository.
2272  const CSeq_feat* nearest_trna = nullptr;
2273  TSeqPos diff = UINT_MAX;
2274  for (const CSeq_feat* trna : trnas) {
2275  if (trna->IsSetLocation()) {
// NOTE(review): listing line 2276 is not present here; presumably it declares
// `start` from the tRNA location; confirm against the repository.
// cur_diff stays UINT_MAX (never selected) when the tRNA lies on the wrong
// side of `stop` for the CDS strand.
2277  TSeqPos cur_diff = UINT_MAX;
2278  if (strand == eNa_strand_minus) {
2279  if (start <= stop) {
2280  cur_diff = stop - start;
2281  }
2282  }
2283  else {
2284  if (start >= stop) {
2285  cur_diff = start - stop;
2286  }
2287  }
2288  if (cur_diff < diff) {
2289  diff = cur_diff;
2290  nearest_trna = trna;
2291  }
2292  }
2293  }
2294  if (nearest_trna) {
2295  ENa_strand trna_strand = nearest_trna->GetLocation().IsSetStrand() ? nearest_trna->GetLocation().GetStrand() : eNa_strand_unknown;
2296  if (trna_strand == strand && diff > 1) {
// Incr() is called once per reported CDS; the tRNA is attached without it.
2297  m_Objs[kCDShasNoTRNA].Add(*context.SeqFeatObjRef(*cds[i]), false).Incr();
2298  m_Objs[kCDShasNoTRNA].Add(*context.SeqFeatObjRef(*nearest_trna), false);
2299  }
2300  }
2301  }
2302 }
2303 
2304 
2305 // MITO_RRNA
2306 
2307 DISCREPANCY_CASE(MITO_RRNA, SEQUENCE, eOncaller, "Non-mitochondrial rRNAs with 12S/16S")
2308 {
2309  const CSeqdesc* biosrc = context.GetBiosource();
2310  if (context.IsEukaryotic(biosrc ? &biosrc->GetSource() : nullptr)) {
2311  const auto& rnas = context.Feat_RNAs();
2312  for (size_t i = 0; i < rnas.size(); i++) {
2313  if (rnas[i]->GetData().GetSubtype() == CSeqFeatData::eSubtype_rRNA && rnas[i]->GetData().GetRna().IsSetExt() && rnas[i]->GetData().GetRna().GetExt().IsName()) {
2314  const string& name = rnas[i]->GetData().GetRna().GetExt().GetName();
2315  if (name.find("16S") != string::npos || name.find("12S") != string::npos) {
2316  m_Objs["[n] non mitochondrial rRNA name[s] contain[S] 12S/16S"].Add(*context.SeqFeatObjRef(*rnas[i]));
2317  }
2318  }
2319  }
2320  }
2321 }
2322 
2323 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CRef< CSeq_feat > MakemRNAforCDS(const CSeq_feat &cds, CScope &scope)
MakemRNAforCDS A function to create a CSeq_feat that represents the appropriate mRNA for a given CDS.
Definition: cds_fix.cpp:525
CBioseq_EditHandle –.
CBioseq_Handle –.
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
bool IsAa(void) const
Definition: Bioseq.cpp:350
static bool SeqLocExtend(CSeq_loc &loc, size_t pos, CScope &scope)
Extends a location to the specificed position.
Definition: cleanup.cpp:1038
CCode_break –.
Definition: Code_break.hpp:66
static bool HasLineage(const CBioSource &biosrc, const string &def_lineage, const string &type)
static bool IsUnculturedNonOrganelleName(const string &taxname)
CFeat_CI –.
Definition: feat_ci.hpp:64
CFeat_id –.
Definition: Feat_id.hpp:66
@Gb_qual.hpp User-defined methods of the data storage class.
Definition: Gb_qual.hpp:61
bool IsSuppressed(void) const
Definition: Gene_ref.cpp:75
@RNA_ref.hpp User-defined methods of the data storage class.
Definition: RNA_ref.hpp:54
string GetRnaProductName(void) const
Definition: RNA_ref.cpp:145
virtual vector< CRef< CReportItem > > GetSubitems() const =0
static void Add(TReportObjectList &list, TReportObjectSet &hash, CReportObj &obj, bool unique=true)
CReportNode & Ext(bool b=true)
CRef< CReportItem > Export(CDiscrepancyCore &test, bool unique=true) const
CScope –.
Definition: scope.hpp:92
static const string & GetRegulatoryClass(ESubtype subtype)
static bool IsLegalProductNameForRibosomalSlippage(const string &product_name)
static CTempString SubtypeValueToName(ESubtype eSubtype)
Turns a ESubtype into its string value which is NOT necessarily related to the identifier of the enum...
CSeqVector –.
Definition: seq_vector.hpp:65
CSeq_annot_CI –.
CSeq_annot_Handle –.
CSeq_feat_EditHandle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
static TIndex GetMapToIndex(CSeq_data::E_Choice from_type, CSeq_data::E_Choice to_type, TIndex from_idx)
bool IsShortrRNA(const objects::CSeq_feat &f, objects::CScope *scope)
@ eFatal
@ eDisc
@ eOncaller
@ eSubmitter
@ eSmart
#define DISCREPANCY_AUTOFIX(name)
#define DISCREPANCY_CASE(name, type, group, descr)
#define DISCREPANCY_SUMMARIZE(name)
int GetSubtype(CFieldNamePanel *field_name_panel, string &ncRNA_class)
vector< CConstRef< CObject > > GetObjects(CSeq_entry_Handle seh, const string &field, CFieldNamePanel::EFieldType field_type, int subtype, const string &ncRNA_class, CConstRef< objects::CSeq_submit > submit, CRef< CEditingActionConstraint > constraint, vector< CSeq_entry_Handle > *descr_context=nullptr)
std::ofstream out("events_result.xml")
main entry point for tests
static const string kNoProductStr
static bool IsRBS(const CSeq_feat &f)
static bool AddExceptionsToShortIntron(const CSeq_feat &sf, CScope &scope, std::list< CConstRef< CSeq_loc >> &to_remove)
USING_SCOPE(objects)
const string & kJoinedFeaturesException
static void AddException(const CSeq_feat &sf, CScope &scope, const string &exception_text)
EExtensibe IsExtendableRight(TSeqPos right, const CBioseq &seq, CScope *scope, TSeqPos &extend_len, ENa_strand strand)
bool IsPartialStopConflict(const CSeq_feat &feat, const CSeq_feat &gene, bool is_mrna=false)
const string kPseudoMismatch
const string kExtraGeneNonPseudoNonFrameshift
bool StartAbutsGap(const CSeq_loc &loc, ENa_strand strand, CScope &scope)
const string kConflictStart
const string kBadGeneStrand
const string kConflictBoth
static bool IsStopCodon(const CCode_break::C_Aa &aa)
static const size_t kIntergenicSpacerNames_len
static void AdjustBacterialGeneForCodingRegionWithShortIntron(CSeq_feat &sf, CSeq_feat &gene, bool is_bacterial)
const string kFeatureLocationCodingRegion
EExtensibe
@ eExtensibe_none
@ eExtensibe_abut
@ eExtensibe_fixable
const string kGenePartialConflictTop
bool StopAbutsGap(const CSeq_loc &loc, ENa_strand strand, CScope &scope)
const string kGenePartialConflictMiscFeat
EExtensibe IsExtendableLeft(TSeqPos left, const CBioseq &seq, CScope *scope, TSeqPos &extend_len, ENa_strand strand)
static bool IsmRnaQualsPresent(const CSeq_feat::TQual &quals)
const string kGenePartialConflictCodingRegion
const string kFeatureLocationConflictTop
bool IsMixedStrandGeneLocationOk(const CSeq_loc &feat_loc, const CSeq_loc &gene_loc)
static const string kSuspiciousNotePhrases[]
const string kExtraGene
const string kFeatureLocationRNA
static bool ExtendToGapsOrEnds(const CSeq_feat &cds, CScope &scope)
const string & kJoinedFeaturesNoException
bool IsGeneInXref(const CSeq_feat &gene, const CSeq_feat &feat, bool &have_gene_ref)
static CSeq_annot_EditHandle GetAnnotHandle(CBioseq_Handle bsh)
const string suspect_phrases[]
const string kShortIntronExcept
bool HasMixedStrands(const CSeq_loc &loc)
const string kExtraPseudo
bool StrandsMatch(ENa_strand s1, ENa_strand s2)
bool ReportGeneMissing(const CSeq_feat &f)
static const string kNewExceptions[]
const string kShortIntronTop
bool IsNonExtendable(const CSeq_loc &loc, const CBioseq &seq, CScope *scope)
static bool IsProductMatch(const string &rna_product, const string &cds_product)
bool IsPartialStartConflict(const CSeq_feat &feat, const CSeq_feat &gene, bool is_mrna=false)
const string kEukaryoteShouldHavemRNA
const string kGenePartialConflictOther
const size_t kNumSuspiciousNotePhrases
const string kEukaryoticCDSHasMrna
static const string kIntergenicSpacerNames[]
bool GeneRefMatch(const CGene_ref &g1, const CGene_ref &g2)
bool IsGeneLocationOk(const CSeq_loc &feat_loc, const CSeq_loc &gene_loc, ENa_strand feat_strand, ENa_strand gene_strand, bool is_coding_region, CScope &scope, const vector< const CSeq_feat * > &features)
static void FindSuspiciousNotePhrases(const string &s, CDiscrepancyContext &context, CReportNode &rep, const CSeq_feat &feat)
const string & kJoinedFeatures
const string kCDShasNoTRNA
const string kNonExtendableException
static const string kFeatureList
static string GetNextSubitemId(size_t num)
const string kConflictStop
static const string kPutativeFrameShift
bool IsMixedStrand(const CSeq_loc &loc)
static void ConvertToMiscFeature(CSeq_feat &sf, CScope &scope)
const string & kJoinedFeaturesBlankException
#define check(s)
Definition: describecol2.c:21
int offset
Definition: replacements.h:160
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
constexpr size_t ArraySize(const Element(&)[Size])
Definition: ncbimisc.hpp:1532
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
bool IsPartialStart(ESeqLocExtremes ext) const
check start or stop of location for e_Lim fuzz
Definition: Seq_loc.cpp:3222
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
CConstRef< CSeq_loc > GetRangeAsSeq_loc(void) const
Get seq-loc for the current iterator position.
Definition: Seq_loc.cpp:2585
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
bool IsSetStrand(EIsSetStrand flag=eIsSetStrand_Any) const
Check if strand is set for any/all part(s) of the seq-loc depending on the flag.
Definition: Seq_loc.cpp:858
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
TRange GetRange(void) const
Get the range.
Definition: Seq_loc.hpp:1042
ENa_strand GetStrand(void) const
Definition: Seq_loc.hpp:1056
void SetStrand(ENa_strand strand)
Set the strand for all of the location's ranges.
Definition: Seq_loc.cpp:5196
bool IsPartialStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:3251
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
@ fMerge_All
Definition: Seq_loc.hpp:331
CMappedFeat GetBestOverlappingFeat(const CMappedFeat &feat, CSeqFeatData::ESubtype need_subtype, sequence::EOverlapType overlap_type, CFeatTree *feat_tree=0, const SAnnotSelector *base_sel=0)
Definition: feature.cpp:3653
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
sequence::ECompare Compare(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
Returns the sequence::ECompare containment relationship between CSeq_locs.
ECompare
@ eOverlap_Contained
2nd contained within 1st extremes
@ eContains
First CSeq_loc contains second.
@ eSame
CSeq_locs contain each other.
@ eContained
First CSeq_loc contained by second.
@ eNoOverlap
CSeq_locs do not overlap or abut.
const CSeq_feat * GetCDSForProduct(const CBioseq &product, CScope *scope)
Get the encoding CDS feature of a given protein sequence.
Definition: sequence.cpp:2549
CConstRef< CSeq_feat > GetGeneForFeature(const CSeq_feat &feat, CScope &scope)
Finds gene for feature, but obeys SeqFeatXref directives.
Definition: sequence.cpp:1529
CConstRef< CSeq_feat > GetmRNAforCDS(const CSeq_feat &cds, CScope &scope)
GetmRNAforCDS A function to find a CSeq_feat representing the appropriate mRNA for a given CDS.
Definition: sequence.cpp:1261
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
CSeq_feat_Handle GetSeq_featHandle(const CSeq_feat &feat, EMissing action=eMissing_Default)
Definition: scope.cpp:200
CConstRef< CBioseq > GetCompleteBioseq(void) const
Get the complete bioseq.
bool IsFtable(void) const
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
void Remove(void) const
Remove the feature from Seq-annot.
CSeq_feat_EditHandle AddFeat(const CSeq_feat &new_obj) const
void Remove(ERemoveMode mode=eRemoveSeq_entry) const
CSeq_annot_EditHandle AttachAnnot(CSeq_annot &annot) const
Attach an annotation.
CBioseq_EditHandle GetEditHandle(void) const
Get 'edit' version of handle.
void Replace(const CSeq_feat &new_feat) const
Replace the feature with new Seq-feat object.
bool IsInGap(TSeqPos pos) const
true if sequence at 0-based position 'pos' has gap Note: this method is not MT-safe,...
Definition: seq_vector.hpp:277
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty – not pointing to any object which means having a null value.
Definition: ncbiobj.hpp:1385
TObjectType * GetPointer(void) const THROWS_NONE
Get pointer,.
Definition: ncbiobj.hpp:1684
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:1392
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2751
#define kEmptyStr
Definition: ncbistr.hpp:123
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2993
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
Definition: ncbistr.hpp:5325
static SIZE_TYPE FindCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case sensitive search.
Definition: ncbistr.hpp:5490
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5384
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
static const char label[]
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
bool IsSetGenome(void) const
Check if a value has been assigned to Genome data member.
Definition: BioSource_.hpp:397
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
const TDesc & GetDesc(void) const
Get the Desc member data.
Definition: Gene_ref_.hpp:599
bool IsSetPseudo(void) const
pseudogene Check if a value has been assigned to Pseudo data member.
Definition: Gene_ref_.hpp:681
bool IsSetLocus_tag(void) const
systematic gene name (e.g., MI0001, ORF0069) Check if a value has been assigned to Locus_tag data mem...
Definition: Gene_ref_.hpp:781
bool IsSetLocus(void) const
Official gene symbol. Check if a value has been assigned to Locus data member.
Definition: Gene_ref_.hpp:493
bool IsSetDesc(void) const
Descriptive name. Check if a value has been assigned to Desc data member.
Definition: Gene_ref_.hpp:587
bool IsSetAllele(void) const
Official allele designation. Check if a value has been assigned to Allele data member.
Definition: Gene_ref_.hpp:540
bool IsSetMaploc(void) const
Descriptive map location. Check if a value has been assigned to Maploc data member.
Definition: Gene_ref_.hpp:634
const TLocus_tag & GetLocus_tag(void) const
Get the Locus_tag member data.
Definition: Gene_ref_.hpp:793
const TLocus & GetLocus(void) const
Get the Locus member data.
Definition: Gene_ref_.hpp:505
const TAllele & GetAllele(void) const
Get the Allele member data.
Definition: Gene_ref_.hpp:552
const TMaploc & GetMaploc(void) const
Get the Maploc member data.
Definition: Gene_ref_.hpp:646
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
bool IsSetTaxname(void) const
Preferred formal name. Check if a value has been assigned to Taxname data member.
Definition: Org_ref_.hpp:360
const TVal & GetVal(void) const
Get the Val member data.
Definition: Gb_qual_.hpp:259
bool IsSetComment(void) const
Check if a value has been assigned to Comment data member.
Definition: Seq_feat_.hpp:1037
bool IsSetData(void) const
The specific data. Check if a value has been assigned to Data data member.
Definition: Seq_feat_.hpp:913
bool IsSetQual(void) const
Qualifiers. Check if a value has been assigned to Qual data member.
Definition: Seq_feat_.hpp:1135
TNcbi8aa GetNcbi8aa(void) const
Get the variant data.
E_Choice Which(void) const
Which variant is currently selected.
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
bool IsCdregion(void) const
Check if variant Cdregion is selected.
void SetComment(const TComment &value)
Assign a value to Comment data member.
Definition: Seq_feat_.hpp:1058
const TQual & GetQual(void) const
Get the Qual member data.
Definition: Seq_feat_.hpp:1147
const TId & GetId(void) const
Get the Id member data.
Definition: Seq_feat_.hpp:904
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
void SetExcept(TExcept value)
Assign a value to Except data member.
Definition: Seq_feat_.hpp:1018
TFrame GetFrame(void) const
Get the Frame member data.
Definition: Cdregion_.hpp:534
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
const TExcept_text & GetExcept_text(void) const
Get the Except_text member data.
Definition: Seq_feat_.hpp:1405
void ResetData(void)
Reset Data data member.
Definition: Seq_feat_.cpp:85
bool IsSetExcept_text(void) const
Explain if except=TRUE. Check if a value has been assigned to Except_text data member.
Definition: Seq_feat_.hpp:1393
bool CanGetId(void) const
Check if it is safe to call GetId method.
Definition: Seq_feat_.hpp:898
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
void ResetComment(void)
Reset Comment data member.
Definition: Seq_feat_.cpp:99
const TCdregion & GetCdregion(void) const
Get the variant data.
bool IsSetAa(void) const
Check if a value has been assigned to Aa data member.
const TAa & GetAa(void) const
Get the Aa member data.
const TProduct & GetProduct(void) const
Get the Product member data.
Definition: Seq_feat_.hpp:1096
TNcbieaa GetNcbieaa(void) const
Get the variant data.
const TComment & GetComment(void) const
Get the Comment member data.
Definition: Seq_feat_.hpp:1049
bool CanGetFrame(void) const
Check if it is safe to call GetFrame method.
Definition: Cdregion_.hpp:515
void SetPseudo(TPseudo value)
Assign a value to Pseudo data member.
Definition: Seq_feat_.hpp:1374
const TGene & GetGene(void) const
Get the variant data.
void SetExcept_text(const TExcept_text &value)
Assign a value to Except_text data member.
Definition: Seq_feat_.hpp:1414
const TXref & GetXref(void) const
Get the Xref member data.
Definition: Seq_feat_.hpp:1308
void ResetProduct(void)
Reset Product data member.
Definition: Seq_feat_.cpp:105
vector< CRef< CGb_qual > > TQual
Definition: Seq_feat_.hpp:117
const TQual & GetQual(void) const
Get the Qual member data.
Definition: Gb_qual_.hpp:212
const TRna & GetRna(void) const
Get the variant data.
TNcbistdaa GetNcbistdaa(void) const
Get the variant data.
TQual & SetQual(void)
Assign a value to Qual data member.
Definition: Seq_feat_.hpp:1153
const TCode_break & GetCode_break(void) const
Get the Code_break member data.
Definition: Cdregion_.hpp:733
bool IsSetProduct(void) const
Product of process. Check if a value has been assigned to Product data member.
Definition: Seq_feat_.hpp:1084
bool IsRna(void) const
Check if variant Rna is selected.
bool IsSetLocation(void) const
Feature made from. Check if a value has been assigned to Location data member.
Definition: Seq_feat_.hpp:1105
@ eFrame_not_set
not set, code uses one
Definition: Cdregion_.hpp:95
@ e_Ncbi8aa
NCBI8aa code.
@ e_Ncbieaa
ASCII value of NCBIeaa code.
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
TTopology GetTopology(void) const
Get the Topology member data.
Definition: Seq_inst_.hpp:733
bool IsSetRepr(void) const
Check if a value has been assigned to Repr data member.
Definition: Seq_inst_.hpp:546
bool IsSetMol(void) const
Check if a value has been assigned to Mol data member.
Definition: Seq_inst_.hpp:593
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
bool IsSetBiomol(void) const
Check if a value has been assigned to Biomol data member.
Definition: MolInfo_.hpp:422
bool IsSetExt(void) const
Extensions for special types. Check if a value has been assigned to Ext data member.
Definition: Seq_inst_.hpp:826
bool IsSetInst(void) const
The sequence data. Check if a value has been assigned to Inst data member.
Definition: Bioseq_.hpp:324
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
TMol GetMol(void) const
Get the Mol member data.
Definition: Seq_inst_.hpp:612
bool IsDelta(void) const
Check if variant Delta is selected.
Definition: Seq_ext_.hpp:336
const TExt & GetExt(void) const
Get the Ext member data.
Definition: Seq_inst_.hpp:838
TBiomol GetBiomol(void) const
Get the Biomol member data.
Definition: MolInfo_.hpp:447
const TDelta & GetDelta(void) const
Get the variant data.
Definition: Seq_ext_.cpp:180
const Tdata & Get(void) const
Get the member data.
Definition: Delta_ext_.hpp:164
const TMolinfo & GetMolinfo(void) const
Get the variant data.
Definition: Seqdesc_.cpp:588
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ e_Ncbieaa
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
@ e_Ncbistdaa
consecutive codes for std aas
Definition: Seq_data_.hpp:113
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
where both are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is smart and slim</td> n<td> orig</td> n</tr> n<tr> n<td> last_modified</td> n<td> optional</td> n<td> Integer</td> n<td class=\"description\"> The blob last modification If provided then the exact match will be requested with n the Cassandra storage corresponding field value</td> n<td> Positive integer Not provided means that the most recent match will be selected</td> n<td></td> n</tr> n<tr> n<td> use_cache</td> n<td> optional</td> n<td> String</td> n<td class=\"description\"> The option controls if the Cassandra LMDB cache and or database should be used It n affects the seq id resolution step and the blob properties lookup step The following n options are BIOSEQ_INFO and BLOB_PROP at all
int i
yy_size_t n
int len
static string GetProductName(const CProt_ref &prot)
Definition: utils.cpp:62
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::SIZE size
const CharType(& source)[N]
Definition: pointer.h:1149
T max(T x_, T y_)
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
#define fi
#define ftable
Definition: utilfeat.h:37
static const char *const features[]
static CS_CONTEXT * context
Definition: will_convert.c:21
#define const
Definition: zconf.h:232
Modified on Wed Apr 17 13:08:10 2024 by modify_doxy.py rev. 669887