1 /* $Id: feature_tests.cpp 98937 2023-01-25 15:40:02Z foleyjp $
2  * =========================================================================
3  *
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * =========================================================================
25  *
26  * Authors: Colleen Bollin, based on similar discrepancy tests
27  *
28  */
30 #include <ncbi_pch.hpp>
32 #include "discrepancy_core.hpp"
33 #include "utils.hpp"
42 #include <objects/seq/Seq_ext.hpp>
50 #include <objmgr/util/sequence.hpp>
51 #include <objmgr/feat_ci.hpp>
52 #include <objmgr/seqdesc_ci.hpp>
53 #include <objmgr/seq_annot_ci.hpp>
54 #include <objmgr/seq_vector.hpp>
55 #include <objmgr/tse_handle.hpp>
65 const string kPseudoMismatch = "[n] CDSs, RNAs, and genes have mismatching pseudos.";
67 DISCREPANCY_CASE(PSEUDO_MISMATCH, FEAT, eDisc | eOncaller | eSubmitter | eSmart | eFatal, "Pseudo Mismatch")
68 {
69  for (const CSeq_feat& feat : context.GetFeat()) {
70  if (feat.IsSetPseudo() && feat.GetPseudo() && (feat.GetData().IsCdregion() || feat.GetData().IsRna())) {
71  const CSeq_feat* gene = context.GetGeneForFeature(feat);
72  if (gene && !context.IsPseudo(*gene)) {
73  m_Objs[kPseudoMismatch].Add(*context.SeqFeatObjRef(feat, CDiscrepancyContext::eFixSelf), false).Fatal();
74  m_Objs[kPseudoMismatch].Add(*context.SeqFeatObjRef(*gene), false).Fatal();
75  }
76  }
77  }
78 }
82 {
83  const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
84  CRef<CSeq_feat> new_feat(new CSeq_feat());
85  new_feat->Assign(*sf);
86  new_feat->SetPseudo(true);
87  context.ReplaceSeq_feat(*obj, *sf, *new_feat);
88  obj->SetFixed();
89  return CRef<CAutofixReport>(new CAutofixReport("PSEUDO_MISMATCH: Set pseudo for [n] feature[s]", 1));
90 }
95 DISCREPANCY_CASE(SHORT_RRNA, FEAT, eDisc | eOncaller | eSubmitter | eSmart | eFatal, "Short rRNA Features")
96 {
97  for (const CSeq_feat& feat : context.GetFeat()) {
98  if (feat.IsSetData() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_rRNA && !feat.IsSetPartial() && IsShortrRNA(feat, &(context.GetScope()))) {
99  m_Objs["[n] rRNA feature[s] [is] too short"].Add(*context.SeqFeatObjRef(feat)).Fatal();
100  }
101  }
102 }
107 static bool IsRBS(const CSeq_feat& f)
108 {
109  if (f.GetData().GetSubtype() == CSeqFeatData::eSubtype_RBS) {
110  return true;
111  }
112  if (f.GetData().GetSubtype() != CSeqFeatData::eSubtype_regulatory) {
113  return false;
114  }
115  if (!f.IsSetQual()) {
116  return false;
117  }
118  for (const auto& it : f.GetQual()) {
119  if (it->IsSetQual() && NStr::Equal(it->GetQual(), "regulatory_class") &&
121  return true;
122  }
123  }
124  return false;
125 }
128 DISCREPANCY_CASE(RBS_WITHOUT_GENE, FEAT, eOncaller | eFatal, "RBS features should have an overlapping gene")
129 {
130  bool has_genes = false;
131  for (const CSeq_feat& feat : context.GetAllFeat()) {
132  if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_gene) {
133  has_genes = true;
134  break;
135  }
136  }
137  if (has_genes) {
138  for (const CSeq_feat& feat : context.GetFeat()) {
139  if (IsRBS(feat) && !context.GetGeneForFeature(feat)) {
140  m_Objs["[n] RBS feature[s] [does] not have overlapping gene[s]"].Add(*context.SeqFeatObjRef(feat)).Fatal();
141  }
142  }
143  }
144 }
150 {
151  CSeqFeatData::ESubtype subtype = f.GetData().GetSubtype();
152  if (subtype == CSeqFeatData::eSubtype_regulatory) {
153  return false;
154  }
155  if (IsRBS(f) ||
156  f.GetData().IsCdregion() ||
157  f.GetData().IsRna() ||
158  subtype == CSeqFeatData::eSubtype_exon ||
159  subtype == CSeqFeatData::eSubtype_intron) {
160  return true;
161  }
162  return false;
163 }
166 DISCREPANCY_CASE(MISSING_GENES, FEAT, eDisc | eSubmitter | eSmart | eFatal, "Missing Genes")
167 {
168  for (const CSeq_feat& feat : context.GetFeat()) {
169  if (!feat.GetGeneXref() && feat.IsSetData() && ReportGeneMissing(feat)) {
170  const CSeq_feat* gene_feat = context.GetGeneForFeature(feat);
171  if (!gene_feat) {
172  m_Objs["[n] feature[s] [has] no genes"].Add(*context.SeqFeatObjRef(feat)).Fatal();
173  }
174  }
175  }
176 }
181 const string kExtraGene = "[n] gene feature[s] [is] not associated with a CDS or RNA feature.";
182 const string kExtraPseudo = "[n] pseudo gene feature[s] [is] not associated with a CDS or RNA feature.";
183 const string kExtraGeneNonPseudoNonFrameshift = "[n] non-pseudo gene feature[s] are not associated with a CDS or RNA feature and [does] not have frameshift in the comment.";
185 bool IsGeneInXref(const CSeq_feat& gene, const CSeq_feat& feat, bool& have_gene_ref)
186 {
187  for (const auto& it : feat.GetXref()) {
188  if (it->IsSetId()) {
189  const CFeat_id& id = it->GetId();
190  if (gene.CanGetId() && gene.GetId().Equals(id)) {
191  return true;
192  }
193  }
194  if (it->IsSetData() && it->GetData().IsGene()) {
195  have_gene_ref = true;
196  const CGene_ref& gene_ref = it->GetData().GetGene();
197  const string& locus = gene.GetData().GetGene().IsSetLocus() ? gene.GetData().GetGene().GetLocus() : kEmptyStr;
198  const string& locus_tag = gene.GetData().GetGene().IsSetLocus_tag() ? gene.GetData().GetGene().GetLocus_tag() : kEmptyStr;
199  if ((gene_ref.IsSetLocus() || gene_ref.IsSetLocus_tag())
200  && (!gene_ref.IsSetLocus_tag() || gene_ref.GetLocus_tag() == locus_tag)
201  && (gene_ref.IsSetLocus_tag() || locus_tag.empty())
202  && (!gene_ref.IsSetLocus() || gene_ref.GetLocus() == locus)
203  && (gene_ref.IsSetLocus() || locus.empty())) {
204  return true;
205  }
206  }
207  }
208  return false;
209 }
212 DISCREPANCY_CASE(EXTRA_GENES, SEQUENCE, eDisc | eSubmitter | eSmart, "Extra Genes")
213 {
214  // TODO: Do not collect if mRNA sequence in Gen-prod set
215  const auto& genes = context.FeatGenes();
216  const auto& all = context.FeatAll();
217  for (const CSeq_feat* gene : genes) {
218  if ((gene->IsSetComment() && !gene->GetComment().empty()) || (gene->GetData().GetGene().IsSetDesc() && !gene->GetData().GetGene().GetDesc().empty())) {
219  continue;
220  }
221  const CSeq_loc& loc = gene->GetLocation();
222  bool found = false;
223  for (const CSeq_feat* feat : all) {
224  if (feat->GetData().IsCdregion() || feat->GetData().IsRna()) {
225  const CSeq_loc& loc_f = feat->GetLocation();
226  sequence::ECompare cmp = context.Compare(loc, loc_f);
228  bool have_gene_ref = false;
229  if (IsGeneInXref(*gene, *feat, have_gene_ref)) {
230  found = true;
231  break;
232  }
233  else if (!have_gene_ref) {
235  if (best_gene.NotEmpty() && &*best_gene == &*gene) {
236  found = true;
237  break;
238  }
239  }
240  }
241  }
242  }
243  if (!found) {
244  m_Objs[kExtraGene][context.IsPseudo(*gene) ? kExtraPseudo : kExtraGeneNonPseudoNonFrameshift].Ext().Add(*context.SeqFeatObjRef(*gene));
245  }
246  }
247 }
252 DISCREPANCY_CASE(SUPERFLUOUS_GENE, SEQUENCE, eDisc | eOncaller, "Superfluous Genes")
253 {
254  const auto& genes = context.FeatGenes();
255  const auto& feats = context.FeatAll();
256  for (size_t i = 0; i < genes.size(); i++) {
257  if (genes[i]->IsSetPseudo() && genes[i]->GetPseudo()) {
258  continue;
259  }
260  const CSeq_loc& loc_i = genes[i]->GetLocation();
261  bool found = false;
262  for (size_t j = 0; j < feats.size(); j++) {
263  if (feats[j]->GetData().IsGene()) {
264  continue;
265  }
266  const CSeq_loc& loc_j = feats[j]->GetLocation();
267  sequence::ECompare compare = context.Compare(loc_j, loc_i);
268  if (compare == sequence::eNoOverlap) {
269  continue;
270  }
271  if (genes[i] == context.GetGeneForFeature(*feats[j])) {
272  found = true;
273  break;
274  }
275  }
276  if (!found) {
277  m_Objs["[n] gene feature[s] [is] not associated with any feature and [is] not pseudo."].Add(*context.SeqFeatObjRef(*genes[i]));
278  }
279  }
280 }
// Outcome of probing whether a partial feature end can be moved to the end of
// the sequence or to a gap (see IsExtendableLeft / IsExtendableRight).
// NOTE(review): only the last enumerator was present; the enum header and the
// first two enumerators are restored from their uses in this file.
// eExtensibe_none has to stay 0 because ExtendToGapsOrEnds tests the result
// in a boolean context.
enum EExtensibe {
    eExtensibe_none = 0,     // cannot be extended and does not abut an end or gap
    eExtensibe_fixable = 1,  // can be extended by a few nucleotides
    eExtensibe_abut = 2      // already abuts the end or a gap
};
292 EExtensibe IsExtendableLeft(TSeqPos left, const CBioseq& seq, CScope* scope, TSeqPos& extend_len, ENa_strand strand)
293 {
294  bool circular = seq.IsSetInst() && seq.GetInst().GetTopology() == CSeq_inst::eTopology_circular;
296  if (left < 3) {
297  extend_len = left;
298  rval = extend_len ? circular ? eExtensibe_none : eExtensibe_fixable : eExtensibe_abut;
299  }
300  else if (seq.IsSetInst() && seq.GetInst().IsSetRepr() && seq.GetInst().GetRepr() == CSeq_inst::eRepr_delta && seq.GetInst().IsSetExt() && seq.GetInst().GetExt().IsDelta()) {
301  TSeqPos offset = 0;
302  TSeqPos last_gap_stop = 0;
303  bool gap = false;
304  for (const auto& it : seq.GetInst().GetExt().GetDelta().Get()) {
305  if (it->IsLiteral()) {
306  offset += it->GetLiteral().GetLength();
307  if (!it->GetLiteral().IsSetSeq_data()) {
308  last_gap_stop = offset;
309  gap = true;
310  }
311  else if (it->GetLiteral().GetSeq_data().IsGap()) {
312  last_gap_stop = offset;
313  gap = true;
314  }
315  }
316  else if (it->IsLoc()) {
317  offset += sequence::GetLength(it->GetLoc(), scope);
318  }
319  if (offset > left) {
320  break;
321  }
322  }
323  if (left >= last_gap_stop && left - last_gap_stop <= 3) {
324  extend_len = left - last_gap_stop;
325  rval = extend_len ? (circular && !gap) ? eExtensibe_none : eExtensibe_fixable : eExtensibe_abut;
326  }
327  }
328  if (rval == eExtensibe_abut) return rval;
329  CSeqVector svec(seq, scope, CBioseq_Handle::CBioseq_Handle::eCoding_Iupac);
330  string codon;
331  TSeqPos count = extend_len ? extend_len : 1;
332  svec.GetSeqData(left - count, left, codon);
333  for (unsigned i = 0; i < count; i++) {
334  if (codon[i] == 'N') {
335  count = i;
336  break;
337  }
338  }
339  if (!count) {
340  extend_len = 0;
341  return eExtensibe_abut;
342  }
343  if (rval == eExtensibe_fixable) {
344  svec.GetSeqData(left - extend_len, left - extend_len + 3, codon);
345  if (strand == eNa_strand_minus) {
346  if (codon == "CTA" || codon == "TTA" || codon == "TCA") { // reverse TAG / TAA / TGA
347  rval = eExtensibe_none;
348  }
349  }
350  else {
351  if (codon == "TAG" || codon == "TAA" || codon == "TGA") {
352  rval = eExtensibe_none;
353  }
354  }
355  }
356  return rval;
357 }
360 EExtensibe IsExtendableRight(TSeqPos right, const CBioseq& seq, CScope* scope, TSeqPos& extend_len, ENa_strand strand)
361 {
362  bool circular = seq.IsSetInst() && seq.GetInst().GetTopology() == CSeq_inst::eTopology_circular;
364  if (right > seq.GetLength() - 4) {
365  extend_len = seq.GetLength() - right - 1;
366  rval = extend_len ? circular ? eExtensibe_none : eExtensibe_fixable : eExtensibe_abut;
367  }
368  else if (seq.IsSetInst() && seq.GetInst().IsSetRepr() && seq.GetInst().GetRepr() == CSeq_inst::eRepr_delta && seq.GetInst().IsSetExt() && seq.GetInst().GetExt().IsDelta()) {
369  TSeqPos offset = 0;
370  TSeqPos next_gap_start = 0;
371  bool gap = false;
372  for (const auto& it : seq.GetInst().GetExt().GetDelta().Get()) {
373  if (it->IsLiteral()) {
374  if (!it->GetLiteral().IsSetSeq_data()) {
375  next_gap_start = offset;
376  gap = true;
377  }
378  else if (it->GetLiteral().GetSeq_data().IsGap()) {
379  next_gap_start = offset;
380  gap = true;
381  }
382  offset += it->GetLiteral().GetLength();
383  }
384  else if (it->IsLoc()) {
385  offset += sequence::GetLength(it->GetLoc(), scope);
386  }
387  if (offset > right + 3) {
388  break;
389  }
390  }
391  if (next_gap_start > right && next_gap_start - right - 1 <= 3) {
392  extend_len = next_gap_start - right - 1;
393  rval = extend_len ? (circular && !gap) ? eExtensibe_none : eExtensibe_fixable : eExtensibe_abut;
394  }
395  }
396  if (rval == eExtensibe_abut) return rval;
397  CSeqVector svec(seq, scope, CBioseq_Handle::CBioseq_Handle::eCoding_Iupac);
398  string codon;
399  TSeqPos count = extend_len ? extend_len : 1;
400  svec.GetSeqData(right + 1, right + count + 1, codon);
401  for (unsigned i = 0; i < count; i++) {
402  if (codon[i] == 'N') {
403  count = i;
404  break;
405  }
406  }
407  if (!count) {
408  extend_len = 0;
409  return eExtensibe_abut;
410  }
411  if (rval == eExtensibe_fixable) {
412  svec.GetSeqData(right + extend_len - 3, right + extend_len, codon);
413  if (strand == eNa_strand_minus) {
414  if (codon == "CTA" || codon == "TTA" || codon == "TCA") { // reverse TAG / TAA / TGA
415  rval = eExtensibe_none;
416  }
417  }
418  else {
419  if (codon == "TAG" || codon == "TAA" || codon == "TGA") {
420  rval = eExtensibe_none;
421  }
422  }
423  }
424  return rval;
425 }
428 // Cannot be extended and not abut the end or the gap
429 bool IsNonExtendable(const CSeq_loc& loc, const CBioseq& seq, CScope* scope)
430 {
431  bool rval = false;
433  TSeqPos start = loc.GetStart(eExtreme_Positional);
434  if (start > 0) {
435  TSeqPos extend_len = 0;
436  if (IsExtendableLeft(start, seq, scope, extend_len, loc.GetStrand()) == eExtensibe_none) {
437  rval = true;
438  }
439  }
440  }
441  if (!rval && loc.IsPartialStop(eExtreme_Positional)) {
442  TSeqPos stop = loc.GetStop(eExtreme_Positional);
443  if (stop < seq.GetLength() - 1) {
444  TSeqPos extend_len = 0;
445  if (IsExtendableRight(stop, seq, scope, extend_len, loc.GetStrand()) == eExtensibe_none) {
446  rval = true;
447  }
448  }
449  }
450  return rval;
451 }
454 DISCREPANCY_CASE(BACTERIAL_PARTIAL_NONEXTENDABLE_PROBLEMS, SEQUENCE, eDisc | eSubmitter | eSmart | eFatal, "Find partial feature ends on bacterial sequences that cannot be extended: on when non-eukaryote")
455 {
456  const CBioseq& bioseq = context.CurrentBioseq();
457  const CSeqdesc* biosrc = context.GetBiosource();
458  if (!biosrc || context.IsEukaryotic(&biosrc->GetSource()) || context.IsOrganelle(&biosrc->GetSource()) || bioseq.IsAa()) {
459  return;
460  }
461  for (const CSeq_feat& feat : context.GetAllFeat()) {
462  if (feat.IsSetData() && feat.GetData().IsCdregion() && IsNonExtendable(feat.GetLocation(), bioseq, &(context.GetScope()))) {
463  m_Objs["[n] feature[s] [has] partial ends that do not abut the end of the sequence or a gap, and cannot be extended by 3 or fewer nucleotides to do so"].Add(*context.SeqFeatObjRef(feat, &feat)).Fatal();
464  }
465  }
466 }
469 const string kNonExtendableException = "unextendable partial coding region";
472 {
473  const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
474  if (!sf->IsSetExcept_text() || sf->GetExcept_text().find(kNonExtendableException) == string::npos) {
475  CRef<CSeq_feat> new_feat(new CSeq_feat());
476  new_feat->Assign(*sf);
477  if (new_feat->IsSetExcept_text()) {
478  new_feat->SetExcept_text(sf->GetExcept_text() + "; " + kNonExtendableException);
479  }
480  else {
482  }
483  new_feat->SetExcept(true);
484  context.ReplaceSeq_feat(*obj, *sf, *new_feat);
485  obj->SetFixed();
486  return CRef<CAutofixReport>(new CAutofixReport("BACTERIAL_PARTIAL_NONEXTENDABLE_PROBLEMS: Set exception for [n] feature[s]", 1));
487  }
488  return CRef<CAutofixReport>();
489 }
494 DISCREPANCY_CASE(BACTERIAL_PARTIAL_NONEXTENDABLE_EXCEPTION, SEQUENCE, eDisc | eSubmitter | eSmart, "Find partial feature ends on bacterial sequences that cannot be extended but have exceptions: on when non-eukaryote")
495 {
496  const CBioseq& bioseq = context.CurrentBioseq();
497  const CSeqdesc* biosrc = context.GetBiosource();
498  if (!biosrc || context.IsEukaryotic(&biosrc->GetSource()) || context.IsOrganelle(&biosrc->GetSource()) || bioseq.IsAa()) {
499  return;
500  }
501  for (const CSeq_feat& feat : context.GetAllFeat()) {
502  if (feat.IsSetData() && feat.GetData().IsCdregion() && feat.IsSetExcept_text() && NStr::FindNoCase(feat.GetExcept_text(), kNonExtendableException) != NPOS && IsNonExtendable(feat.GetLocation(), bioseq, &(context.GetScope()))) {
503  m_Objs["[n] feature[s] [has] partial ends that do not abut the end of the sequence or a gap, and cannot be extended by 3 or fewer nucleotides to do so, but [has] the correct exception"].Add(*context.SeqFeatObjRef(feat));
504  }
505  }
506 }
511 DISCREPANCY_CASE(PARTIAL_PROBLEMS, SEQUENCE, eDisc | eOncaller | eSubmitter | eSmart | eFatal, "Find partial feature ends on bacterial sequences, but could be extended by 3 or fewer nucleotides")
512 {
513  const CBioseq& bioseq = context.CurrentBioseq();
514  const CSeqdesc* biosrc = context.GetBiosource();
515  if (!biosrc || context.IsEukaryotic(&biosrc->GetSource()) || context.IsOrganelle(&biosrc->GetSource()) || bioseq.IsAa()) {
516  return;
517  }
518  //bool circular = bioseq.IsSetInst() && bioseq.GetInst().GetTopology() == CSeq_inst::eTopology_circular;
519  for (const CSeq_feat& feat : context.GetAllFeat()) {
520  if (feat.IsSetData() && feat.GetData().IsCdregion()) {
521  if (feat.IsSetPseudo() && feat.GetPseudo() == true && !context.IsRefseq()) continue;
522  bool add_this = false;
523  if (feat.GetLocation().IsPartialStart(eExtreme_Positional)) {
524  TSeqPos start = feat.GetLocation().GetStart(eExtreme_Positional);
525  if (start > 0) {
526  TSeqPos extend_len = 0;
527  if (IsExtendableLeft(start, bioseq, &(context.GetScope()), extend_len, feat.GetLocation().GetStrand()) == eExtensibe_fixable) {
528  //cout << "extend start: " << extend_len << "\n";
529  add_this = extend_len > 0 && extend_len <= 3;
530  }
531  }
532  }
533  if (!add_this && feat.GetLocation().IsPartialStop(eExtreme_Positional)) {
534  TSeqPos stop = feat.GetLocation().GetStop(eExtreme_Positional);
535  if (stop < bioseq.GetLength() - 1) {
536  TSeqPos extend_len = 0;
537  if (IsExtendableRight(stop, bioseq, &(context.GetScope()), extend_len, feat.GetLocation().GetStrand()) == eExtensibe_fixable) {
538  //cout << "extend end: " << extend_len << "\n";
539  add_this = extend_len > 0 && extend_len <= 3;
540  }
541  }
542  }
543  if (add_this) {
544  m_Objs["[n] feature[s] [has] partial ends that do not abut the end of the sequence or a gap, but could be extended by 3 or fewer nucleotides to do so"].Add(*context.SeqFeatObjRef(feat, CDiscrepancyContext::eFixSet)).Fatal();
545  }
546  }
547  }
548 }
551 static bool ExtendToGapsOrEnds(const CSeq_feat& cds, CScope& scope)
552 {
553  bool rval = false;
555  CBioseq_Handle bsh = scope.GetBioseqHandle(cds.GetLocation());
556  if (!bsh) {
557  return rval;
558  }
560  //bool circular = seq->IsSetInst() && seq->GetInst().GetTopology() == CSeq_inst::eTopology_circular;
563  for (CFeat_CI gene_it(bsh, CSeqFeatData::eSubtype_gene); gene_it; ++gene_it) {
564  if (gene_it->GetLocation().GetStart(eExtreme_Positional) == cds.GetLocation().GetStart(eExtreme_Positional) && gene_it->GetLocation().GetStop(eExtreme_Positional) == cds.GetLocation().GetStop(eExtreme_Positional)) {
565  gene.Reset(&gene_it->GetMappedFeature());
566  break;
567  }
568  }
570  CRef<CSeq_feat> new_feat(new CSeq_feat());
571  new_feat->Assign(cds);
573  CRef<CSeq_feat> new_gene;
574  if (gene) {
575  new_gene.Reset(new CSeq_feat());
576  new_gene->Assign(*gene);
577  }
581  if (start > 0) {
582  TSeqPos extend_len = 0;
583  if (IsExtendableLeft(start, *seq, &scope, extend_len, cds.GetLocation().GetStrand()) && CCleanup::SeqLocExtend(new_feat->SetLocation(), start - extend_len, scope)) {
584  if (gene) {
585  CCleanup::SeqLocExtend(new_gene->SetLocation(), start - extend_len, scope);
586  }
587  if (new_feat->GetData().GetCdregion().CanGetFrame() && cds.GetLocation().GetStrand() != eNa_strand_minus) {
588  CCdregion::EFrame frame = new_feat->GetData().GetCdregion().GetFrame();
589  if (frame != CCdregion::eFrame_not_set) {
590  // eFrame_not_set = 0, ///< not set, code uses one
591  // eFrame_one = 1,
592  // eFrame_two = 2,
593  // eFrame_three = 3 ///< reading frame
594  unsigned fr = (unsigned)frame - 1;
595  fr = (fr + extend_len) % 3;
596  frame = (CCdregion::EFrame)(fr + 1);
597  new_feat->SetData().SetCdregion().SetFrame() = frame;
598  }
599  }
600  rval = true;
601  }
602  }
603  }
607  if (stop > 0) {
608  TSeqPos extend_len = 0;
609  if (IsExtendableRight(stop, *seq, &scope, extend_len, cds.GetLocation().GetStrand()) && CCleanup::SeqLocExtend(new_feat->SetLocation(), stop + extend_len, scope)) {
610  if (gene) {
611  CCleanup::SeqLocExtend(new_gene->SetLocation(), stop + extend_len, scope);
612  }
613  if (new_feat->GetData().GetCdregion().CanGetFrame() && cds.GetLocation().GetStrand() == eNa_strand_minus) {
614  CCdregion::EFrame frame = new_feat->GetData().GetCdregion().GetFrame();
615  if (frame != CCdregion::eFrame_not_set) {
616  unsigned fr = (unsigned)frame - 1;
617  fr = (fr + extend_len) % 3;
618  frame = (CCdregion::EFrame)(fr + 1);
619  new_feat->SetData().SetCdregion().SetFrame() = frame;
620  }
621  }
622  rval = true;
623  }
624  }
625  }
627  if (rval) {
628  CSeq_feat_EditHandle feh(scope.GetSeq_featHandle(cds));
629  feh.Replace(*new_feat);
630  if (gene) {
631  CSeq_feat_EditHandle geh(scope.GetSeq_featHandle(*gene));
632  geh.Replace(*new_gene);
633  }
634  }
635  return rval;
636 }
640 {
641  const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
642  if (ExtendToGapsOrEnds(*sf, context.GetScope())) {
643  obj->SetFixed();
644  return CRef<CAutofixReport>(new CAutofixReport("PARTIAL_PROBLEMS: [n] feature[s] [is] extended to end or gap", 1));
645  }
646  return CRef<CAutofixReport>();
647 }
652 const string kEukaryoteShouldHavemRNA = "no mRNA present";
653 const string kEukaryoticCDSHasMrna = "Eukaryotic CDS has mRNA";
655 DISCREPANCY_CASE(EUKARYOTE_SHOULD_HAVE_MRNA, SEQUENCE, eDisc | eSubmitter | eSmart | eFatal, "Eukaryote should have mRNA")
656 {
657  const CSeqdesc* molinfo = context.GetMolinfo();
658  if (!molinfo || !molinfo->GetMolinfo().IsSetBiomol() || molinfo->GetMolinfo().GetBiomol() != CMolInfo::eBiomol_genomic) {
659  return;
660  }
661  const CSeqdesc* biosrc = context.GetBiosource();
662  if (!context.IsEukaryotic(biosrc ? &biosrc->GetSource() : nullptr)) {
663  return;
664  }
665  for (const CSeq_feat& feat : context.GetAllFeat()) {
666  if (feat.IsSetData() && feat.GetData().IsCdregion() && !context.IsPseudo(feat)) {
667  CConstRef<CSeq_feat> mrna = sequence::GetmRNAforCDS(feat, context.GetScope());
668  if (mrna) {
669  m_Objs[kEukaryoticCDSHasMrna].Add(*context.SeqFeatObjRef(feat));
670  }
671  else if (m_Objs[kEukaryoteShouldHavemRNA].GetObjects().empty()) {
672  m_Objs[kEukaryoteShouldHavemRNA].Add(*context.SeqFeatObjRef(feat)).Fatal();
673  }
674  }
675  }
676 }
680 {
681  if (m_Objs.empty()) {
682  return;
683  }
685  m_Objs.GetMap().erase(kEukaryoticCDSHasMrna);
686  m_Objs[kEukaryoteShouldHavemRNA].clearObjs();
687  xSummarize();
688  }
689 }
694 DISCREPANCY_CASE(NON_GENE_LOCUS_TAG, FEAT, eDisc | eOncaller | eSubmitter | eSmart, "Nongene Locus Tag")
695 {
696  for (const CSeq_feat& feat : context.GetFeat()) {
697  if (feat.IsSetQual() && (!feat.IsSetData() || !feat.GetData().IsGene())) {
698  for (const auto& it : feat.GetQual()) {
699  if (it->IsSetQual() && NStr::EqualNocase(it->GetQual(), "locus_tag")) {
700  m_Objs["[n] non-gene feature[s] [has] locus tag[s]."].Add(*context.SeqFeatObjRef(feat));
701  break;
702  }
703  }
704  }
705  }
706 }
711 DISCREPANCY_CASE(FIND_BADLEN_TRNAS, FEAT, eDisc | eOncaller | eSubmitter | eSmart, "Find short and long tRNAs")
712 {
713  for (const CSeq_feat& feat : context.GetFeat()) {
714  if (feat.IsSetData() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_tRNA) {
715  TSeqPos len = sequence::GetLength(feat.GetLocation(), &(context.GetScope()));
716  if (!feat.IsSetPartial() && len < 50) {
717  m_Objs["[n] tRNA[s] [is] too short"].Add(*context.SeqFeatObjRef(feat));
718  }
719  else if (len >= 150) {
720  m_Objs["[n] tRNA[s] [is] too long - over 150 nucleotides"].Add(*context.SeqFeatObjRef(feat));
721  }
722  }
723  }
724 }
727 // ORG_TRNAS
729 DISCREPANCY_CASE(ORG_TRNAS, FEAT, eDisc | eOncaller, "Find long tRNAs > 90nt except Ser/Leu/Sec")
730 {
731  for (const CSeq_feat& feat : context.GetFeat()) {
732  if (feat.IsSetData() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_tRNA) {
733  TSeqPos len = sequence::GetLength(feat.GetLocation(), &(context.GetScope()));
734  if (len > 90) {
735  const string aa = context.GetAminoacidName(feat);
736  if (aa != "Ser" && aa != "Sec" && aa != "Leu") {
737  m_Objs["[n] tRNA[s] [is] too long"].Add(*context.SeqFeatObjRef(feat));
738  }
739  }
740  }
741  }
742 }
746 bool IsPartialStartConflict(const CSeq_feat& feat, const CSeq_feat& gene, bool is_mrna = false)
747 {
748  bool partial_feat = feat.GetLocation().IsPartialStart(eExtreme_Biological);
749  bool partial_gene = gene.GetLocation().IsPartialStart(eExtreme_Biological);
750  if (partial_feat != partial_gene) {
752  return true;
753  }
754  }
755  return false;
756 }
759 bool IsPartialStopConflict(const CSeq_feat& feat, const CSeq_feat& gene, bool is_mrna = false)
760 {
761  bool partial_feat = feat.GetLocation().IsPartialStop(eExtreme_Biological);
762  bool partial_gene = gene.GetLocation().IsPartialStop(eExtreme_Biological);
763  if (partial_feat != partial_gene) {
765  return true;
766  }
767  }
768  return false;
769 }
771 const string kGenePartialConflictTop = "[n/2] feature location[s] conflict with partialness of overlapping gene";
772 const string kGenePartialConflictOther = "[n/2] feature[s] that [is] not coding region[s] or misc_feature[s] conflict with partialness of overlapping gene";
773 const string kGenePartialConflictCodingRegion = "[n/2] coding region location[s] conflict with partialness of overlapping gene";
774 const string kGenePartialConflictMiscFeat = "[n/2] misc_feature location[s] conflict with partialness of overlapping gene";
775 const string kConflictBoth = " feature partialness conflicts with gene on both ends";
776 const string kConflictStart = " feature partialness conflicts with gene on 5' end";
777 const string kConflictStop = " feature partialness conflicts with gene on 3' end";
780 DISCREPANCY_CASE(GENE_PARTIAL_CONFLICT, SEQUENCE, eOncaller | eSubmitter | eSmart, "Feature partialness should agree with gene partialness if endpoints match")
781 {
782  const CSeqdesc* molinfo = context.GetMolinfo();
783  const CSeqdesc* biosrc = context.GetBiosource();
784  bool is_mrna = molinfo && molinfo->GetMolinfo().IsSetBiomol() && molinfo->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_mRNA;
785  bool is_eukaryotic = context.IsEukaryotic(biosrc ? &biosrc->GetSource() : nullptr);
787  const auto& all = context.FeatAll();
788  for (const CSeq_feat* feat : all) {
789  if (!feat->IsSetData()) {
790  continue;
791  }
792  const CSeq_feat* gene = context.GetGeneForFeature(*feat);
793  if (!gene) {
794  continue;
795  }
796  bool conflict_start = false;
797  bool conflict_stop = false;
798  CSeqFeatData::ESubtype subtype = feat->GetData().GetSubtype();
799  string middle_label = kGenePartialConflictOther;
800  if (feat->GetData().IsCdregion()) {
801  if (!is_eukaryotic || is_mrna) {
802  middle_label = kGenePartialConflictCodingRegion;
803  conflict_start = IsPartialStartConflict(*feat, *gene, is_mrna);
804  conflict_stop = IsPartialStopConflict(*feat, *gene, is_mrna);
805  if (is_mrna) {
806  //look for 5' UTR
807  TSeqPos gene_start = gene->GetLocation().GetStart(eExtreme_Biological);
808  bool gene_start_partial = gene->GetLocation().IsPartialStart(eExtreme_Biological);
809  bool found_start = false;
810  bool found_utr5 = false;
811  for (const CSeq_feat* fi : all) {
812  if (fi->IsSetData() && fi->GetData().GetSubtype() == CSeqFeatData::eSubtype_5UTR) {
813  found_utr5 = true;
814  if (fi->GetLocation().GetStart(eExtreme_Biological) == gene_start && fi->GetLocation().IsPartialStart(eExtreme_Biological) == gene_start_partial) {
815  found_start = true;
816  conflict_start = false;
817  break;
818  }
819  }
820  }
821  if (found_utr5 && !found_start) {
822  conflict_start = true;
823  }
824  //look for 3' UTR
825  TSeqPos gene_stop = gene->GetLocation().GetStop(eExtreme_Biological);
826  bool gene_stop_partial = gene->GetLocation().IsPartialStop(eExtreme_Biological);
827  bool found_stop = false;
828  bool found_utr3 = false;
829  for (const CSeq_feat* fi : all) {
830  if (fi->IsSetData() && fi->GetData().GetSubtype() == CSeqFeatData::eSubtype_3UTR) {
831  found_utr3 = true;
832  if (fi->GetLocation().GetStop(eExtreme_Biological) == gene_stop && fi->GetLocation().IsPartialStop(eExtreme_Biological) == gene_stop_partial) {
833  found_stop = true;
834  conflict_stop = false;
835  break;
836  }
837  }
838  }
839  if (found_utr3 && !found_stop) {
840  conflict_stop = true;
841  }
842  }
843  }
844  }
845  else if (feat->GetData().IsRna() || subtype == CSeqFeatData::eSubtype_intron || subtype == CSeqFeatData::eSubtype_exon || subtype == CSeqFeatData::eSubtype_5UTR || subtype == CSeqFeatData::eSubtype_3UTR || subtype == CSeqFeatData::eSubtype_misc_feature) {
846  conflict_start = IsPartialStartConflict(*feat, *gene);
847  conflict_stop = IsPartialStopConflict(*feat, *gene);
848  if (subtype == CSeqFeatData::eSubtype_misc_feature) {
849  middle_label = kGenePartialConflictMiscFeat;
850  }
851  }
852  if (conflict_start || conflict_stop) {
853  string label = CSeqFeatData::SubtypeValueToName(subtype);
854  label += conflict_start && conflict_stop ? kConflictBoth : conflict_start ? kConflictStart : kConflictStop;
855  m_Objs[kGenePartialConflictTop][middle_label].Ext()[label].Ext().Add(*context.SeqFeatObjRef(*feat), false).Add(*context.SeqFeatObjRef(*gene), false);
856  }
857  }
858 }
862 {
863  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
864 }
870 {
871  if (s1 == eNa_strand_minus && s2 != eNa_strand_minus) {
872  return false;
873  } else if (s1 != eNa_strand_minus && s2 == eNa_strand_minus) {
874  return false;
875  } else {
876  return true;
877  }
878 }
881 bool HasMixedStrands(const CSeq_loc& loc)
882 {
883  CSeq_loc_CI li(loc);
884  if (!li) {
885  return false;
886  }
887  ENa_strand first_strand = li.GetStrand();
888  ++li;
889  while (li) {
890  if (!StrandsMatch(li.GetStrand(), first_strand)) {
891  return true;
892  }
893  ++li;
894  }
895  return false;
896 }
// Top-level report label for BAD_GENE_STRAND. Every finding adds two objects
// (the gene and the feature) under a per-pair sub-label of this node.
899 const string kBadGeneStrand = "[n/2] feature location[s] conflict with gene location strand[s]";
// BAD_GENE_STRAND: for each feature/gene pair on the current sequence whose
// positional start or positional stop coincide, checks that the two are on
// matching strands (per StrandsMatch) and reports the pair when they are not.
902 DISCREPANCY_CASE(BAD_GENE_STRAND, SEQUENCE, eOncaller | eSubmitter | eSmart, "Genes and features that share endpoints should be on the same strand")
903 {
904  // note - use positional instead of biological, because we are *looking* for objects on the opposite strand
905  const auto& genes = context.FeatGenes();
906  const auto& feats = context.FeatAll();
908  for (size_t j = 0; j < feats.size(); j++) {
909  CSeqFeatData::ESubtype subtype = feats[j]->GetData().GetSubtype();
// NOTE(review): the condition on `subtype` that guards this `continue` is not
// visible in this excerpt.
911  continue;
912  }
913  const CSeq_loc& loc_j = feats[j]->GetLocation();
914  TSeqPos feat_start = loc_j.GetStart(eExtreme_Positional);
915  TSeqPos feat_stop = loc_j.GetStop(eExtreme_Positional);
916  for (size_t i = 0; i < genes.size(); i++) {
917  if (!genes[i]->IsSetLocation()) {
918  continue;
919  }
920  const CSeq_loc& loc_i = genes[i]->GetLocation();
921  ENa_strand strand_i = loc_i.GetStrand();
922  TSeqPos gene_start = loc_i.GetStart(eExtreme_Positional);
923  TSeqPos gene_stop = loc_i.GetStop(eExtreme_Positional);
// Only pairs that share at least one positional endpoint are examined.
924  if (feat_start == gene_start || feat_stop == gene_stop) {
925  bool all_ok = true;
// A gene with mixed strands is compared interval by interval; otherwise the
// overall strands of the two locations are compared directly.
926  if (HasMixedStrands(loc_i)) {
927  // compare intervals, to make sure that for each pair of feature interval and gene interval
928  // where the gene interval contains the feature interval, the intervals are on the same strand
929  CSeq_loc_CI f_loc(loc_j);
930  bool found_bad = false;
931  while (f_loc && !found_bad) {
932  CConstRef<CSeq_loc> f_int = f_loc.GetRangeAsSeq_loc();
933  CSeq_loc_CI g_loc(loc_i);
934  while (g_loc && !found_bad) {
935  CConstRef<CSeq_loc> g_int = g_loc.GetRangeAsSeq_loc();
936  sequence::ECompare cmp = context.Compare(*f_int, *g_int);
// NOTE(review): the test on `cmp` that opens the enclosing brace is not visible
// in this excerpt; per the comment above it presumably selects gene intervals
// that contain the feature interval.
938  if (!StrandsMatch(f_loc.GetStrand(), g_loc.GetStrand())) {
939  found_bad = true;
940  }
941  }
942  ++g_loc;
943  }
944  ++f_loc;
945  }
946  all_ok = !found_bad;
947  }
948  else {
949  all_ok = StrandsMatch(loc_j.GetStrand(), strand_i);
950  }
951  if (!all_ok) {
// The pair number is derived from how many sub-items the node already holds,
// which gives each conflicting pair its own sub-label.
952  size_t offset = m_Objs[kBadGeneStrand].GetMap().size() + 1;
953  string label = "Gene and feature strands conflict (pair " + NStr::NumericToString(offset) + ")";
954  m_Objs[kBadGeneStrand][label].Ext().Add(*context.SeqFeatObjRef(*genes[i]), false).Add(*context.SeqFeatObjRef(*feats[j]), false);
955  }
956  }
957  }
958  }
959 }
963 {
964  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
965 }
970 DISCREPANCY_CASE(MICROSATELLITE_REPEAT_TYPE, FEAT, eOncaller | eFatal, "Microsatellites must have repeat type of tandem")
971 {
972  for (const CSeq_feat& feat : context.GetFeat()) {
973  if (feat.IsSetData() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_repeat_region && feat.IsSetQual()) {
974  bool is_microsatellite = false;
975  bool is_tandem = false;
976  const CSeq_feat::TQual& quals = feat.GetQual();
977  for (auto it = quals.cbegin(); it != quals.cend() && (!is_microsatellite || !is_tandem); ++it) {
978  const CGb_qual& qual = **it;
979  if (NStr::EqualCase(qual.GetQual(), "satellite")) {
980  if (NStr::EqualNocase(qual.GetVal(), "microsatellite") ||
981  NStr::StartsWith(qual.GetVal(), "microsatellite:", NStr::eNocase)) {
982  is_microsatellite = true;
983  }
984  }
985  else if (NStr::EqualCase(qual.GetQual(), "rpt_type")) {
986  is_tandem = NStr::EqualCase(qual.GetVal(), "tandem");
987  }
988  }
989  if (is_microsatellite && !is_tandem) {
990  m_Objs["[n] microsatellite[s] do not have a repeat type of tandem"].Add(*context.SeqFeatObjRef(feat, &feat)).Fatal();
991  }
992  }
993  }
994 }
998 {
999  const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
1000  CRef<CSeq_feat> new_feat(new CSeq_feat());
1001  new_feat->Assign(*sf);
1002  CRef<CGb_qual> new_qual(new CGb_qual("rpt_type", "tandem"));
1003  new_feat->SetQual().push_back(new_qual);
1004  context.ReplaceSeq_feat(*obj, *sf, *new_feat);
1005  obj->SetFixed();
1006  return CRef<CAutofixReport>(new CAutofixReport("MICROSATELLITE_REPEAT_TYPE: added repeat type of tandem to [n] microsatellite[s]", 1));
1007 }
// Phrases that FindSuspiciousNotePhrases() iterates over and quotes in the
// per-phrase report labels it builds.
1011 static const string kSuspiciousNotePhrases[] =
1012 {
1013  "characterised",
1014  "recognised",
1015  "characterisation",
1016  "localisation",
1017  "tumour",
1018  "uncharacterised",
1019  "oxydase",
1020  "colour",
1021  "localise",
1022  "faecal",
1023  "orthologue",
1024  "paralogue",
1025  "homolog",
1026  "homologue",
1027  "intronless gene"
1028 };
// Walks kSuspiciousNotePhrases and, for the selected phrases, files `feat`
// under a per-phrase sub-node of `rep`.
//   s       - text being examined (callers pass feature comments and descriptions)
//   context - used to obtain the report object reference for `feat`
//   rep     - report node that receives the findings
//   feat    - feature that owns the text `s`
// NOTE(review): the condition that tests `s` against the current phrase is not
// visible in this excerpt.
1032 static void FindSuspiciousNotePhrases(const string& s, CDiscrepancyContext& context, CReportNode& rep, const CSeq_feat& feat)
1033 {
1034  for (size_t k = 0; k < kNumSuspiciousNotePhrases; k++) {
1036  rep["[n] note text[s] contain suspicious phrase[s]"]["[n] note text[s] contain '" + kSuspiciousNotePhrases[k] + "'"].Ext().Add(*context.SeqFeatObjRef(feat));
1037  }
1038  }
1039 }
// SUSPICIOUS_NOTE_TEXT: passes selected text fields of each feature to
// FindSuspiciousNotePhrases(), choosing the fields by feature subtype.
// NOTE(review): the `case` labels of the switch are not visible in this
// excerpt; the branches are described below by the accessors they use.
1042 DISCREPANCY_CASE(SUSPICIOUS_NOTE_TEXT, FEAT, eOncaller, "Find Suspicious Phrases in Note Text")
1043 {
1044  for (const CSeq_feat& feat : context.GetFeat()) {
1045  if (feat.IsSetData()) {
1046  switch (feat.GetData().GetSubtype()) {
// First branch: reads GetGene(), so it presumably handles gene features.
1048  // look in gene comment and gene description
1049  if (feat.IsSetComment()) {
1050  FindSuspiciousNotePhrases(feat.GetComment(), context, m_Objs, feat);
1051  }
1052  if (feat.GetData().GetGene().IsSetDesc()) {
1053  FindSuspiciousNotePhrases(feat.GetData().GetGene().GetDesc(), context, m_Objs, feat);
1054  }
1055  break;
// Second branch: reads GetProt(), so it presumably handles protein features;
// only the protein description is examined.
1057  if (feat.GetData().GetProt().IsSetDesc()) {
1058  FindSuspiciousNotePhrases(feat.GetData().GetProt().GetDesc(), context, m_Objs, feat);
1059  }
1060  break;
// Third branch: only the feature comment is examined.
1063  if (feat.IsSetComment()) {
1064  FindSuspiciousNotePhrases(feat.GetComment(), context, m_Objs, feat);
1065  }
1066  break;
1067  default:
1068  break;
1069  }
1070  }
1071  }
1072 }
1076 {
1077  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1078 }
// Exception phrases that CDS_HAS_NEW_EXCEPTION looks for (case-insensitively)
// inside a coding region's exception text.
1083 static const string kNewExceptions[] =
1084 {
1085  "annotated by transcript or proteomic data",
1086  "heterogeneous population sequenced",
1087  "low-quality sequence region",
1088  "unextendable partial coding region",
1089 };
1092 DISCREPANCY_CASE(CDS_HAS_NEW_EXCEPTION, FEAT, eDisc | eOncaller | eSmart, "Coding region has new exception")
1093 {
1094  static const size_t max = ArraySize(kNewExceptions);
1095  for (const CSeq_feat& feat : context.GetFeat()) {
1096  if (feat.IsSetData() && feat.GetData().IsCdregion() && feat.IsSetExcept_text()) {
1097  for (size_t i = 0; i < max; i++) {
1098  if (NStr::FindNoCase(feat.GetExcept_text(), kNewExceptions[i]) != NPOS) {
1099  m_Objs["[n] coding region[s] [has] new exception[s]"].Add(*context.SeqFeatObjRef(feat));
1100  break;
1101  }
1102  }
1103  }
1104  }
1105 }
1109 {
1110  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1111 }
1116 DISCREPANCY_CASE(SHORT_LNCRNA, FEAT, eDisc | eOncaller | eSubmitter | eSmart, "Short lncRNA sequences")
1117 {
1118  for (const CSeq_feat& feat : context.GetFeat()) {
1119  if (feat.IsSetData() && feat.GetData().IsRna() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_ncRNA
1120  && feat.GetData().GetRna().IsSetExt() && feat.GetData().GetRna().GetExt().IsGen() && feat.GetData().GetRna().GetExt().GetGen().IsSetClass()
1121  && NStr::EqualNocase(feat.GetData().GetRna().GetExt().GetGen().GetClass(), "lncrna") // only looking at lncrna features
1122  && !feat.GetLocation().IsPartialStart(eExtreme_Biological) && !feat.GetLocation().IsPartialStop(eExtreme_Biological) // ignore if partial
1123  && sequence::GetLength(feat.GetLocation(), &(context.GetScope())) < 200) {
1124  m_Objs["[n] lncRNA feature[s] [is] suspiciously short"].Add(*context.SeqFeatObjRef(feat));
1125  }
1126  }
1127 }
1131 {
1132  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1133 }
// Report labels for JOINED_FEATURES. kJoinedFeatures is the top-level label; the
// others name its sub-categories. kJoinedFeaturesException is a prefix: the
// caller appends the exception text and a closing quote.
// Declared as plain `const string` objects (internal linkage), like the other
// label constants in this file, rather than as references bound to temporaries.
const string kJoinedFeatures = "[n] feature[s] [has] joined location[s].";
const string kJoinedFeaturesNoException = "[n] feature[s] [has] joined location but no exception";
const string kJoinedFeaturesException = "[n] feature[s] [has] joined location but exception '";
const string kJoinedFeaturesBlankException = "[n] feature[s] [has] joined location but a blank exception";
1143 DISCREPANCY_CASE(JOINED_FEATURES, FEAT, eDisc | eSubmitter | eSmart, "Joined Features: on when non-eukaryote")
1144 {
1145  const CSeqdesc* biosrc = context.GetBiosource();
1146  if (biosrc && !context.IsEukaryotic(&biosrc->GetSource()) && !context.IsOrganelle(&biosrc->GetSource())) {
1147  for (const CSeq_feat& feat : context.GetFeat()) {
1148  if (feat.IsSetLocation()) {
1149  if (feat.GetLocation().IsMix() || feat.GetLocation().IsPacked_int()) {
1150  if (feat.IsSetExcept_text()) {
1151  if (NStr::IsBlank(feat.GetExcept_text())) {
1152  m_Objs[kJoinedFeatures][kJoinedFeaturesBlankException].Ext().Add(*context.SeqFeatObjRef(feat));
1153  }
1154  else {
1155  m_Objs[kJoinedFeatures][kJoinedFeaturesException + feat.GetExcept_text() + "'"].Ext().Add(*context.SeqFeatObjRef(feat));
1156  }
1157  }
1158  else if (feat.IsSetExcept() && feat.GetExcept()) {
1159  m_Objs[kJoinedFeatures][kJoinedFeaturesBlankException].Ext().Add(*context.SeqFeatObjRef(feat));
1160  }
1161  else {
1162  m_Objs[kJoinedFeatures][kJoinedFeaturesNoException].Ext().Add(*context.SeqFeatObjRef(feat));
1163  }
1164  }
1165  }
1166  }
1167  }
1168 }
1172 {
1173  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1174 }
// BACTERIAL_JOINED_FEATURES_NO_EXCEPTION: reports non-pseudo coding regions
// with a mix or packed-int location that have neither a non-empty exception
// text nor the exception flag. Findings are split into two sub-categories
// depending on whether the location is a two-interval wrap across the ends of
// the sequence (warning) or not (error).
1179 DISCREPANCY_CASE(BACTERIAL_JOINED_FEATURES_NO_EXCEPTION, SEQUENCE, eDisc | eSubmitter | eSmart, "Joined Features on prokaryote without exception")
1180 {
1181  const CSeqdesc* biosrc = context.GetBiosource();
// Skipped for eukaryotic and organelle sources; a sequence with no source
// descriptor at all is still checked.
1182  if (biosrc && (context.IsEukaryotic(&biosrc->GetSource()) || context.IsOrganelle(&biosrc->GetSource()))) {
1183  return;
1184  }
1185  for (const CSeq_feat& feat : context.GetAllFeat()) {
1186  if (feat.IsSetLocation() && feat.CanGetData() && feat.GetData().IsCdregion() && !context.IsPseudo(feat)) {
1187  if (feat.GetLocation().IsMix() || feat.GetLocation().IsPacked_int()) {
1188  if ((feat.IsSetExcept_text() && !feat.GetExcept_text().empty()) || (feat.IsSetExcept() && feat.GetExcept())) {
1189  continue;
1190  }
// `bad` stays true unless the location turns out to be an origin-spanning pair.
1191  bool bad = true;
1192  if (context.CurrentBioseq().CanGetInst()) {
1193  const CSeq_inst& inst = context.CurrentBioseq().GetInst();
// NOTE(review): one line is missing from this excerpt here (it opens a brace);
// given the report labels below it presumably tests for circular topology.
1195  unsigned int len = inst.GetLength();
1196  CSeq_loc_CI ci0(feat.GetLocation());
1197  if (ci0) {
1198  CSeq_loc_CI ci1 = ci0;
1199  ++ci1;
1200  if (ci1) {
1201  CSeq_loc_CI ci2 = ci1;
1202  ++ci2;
1203  if (!ci2) { // location has exactly 2 intervals
// Plus strand: the first interval must end on the last base and the second
// must start on base 0.
1204  if (ci0.GetStrand() == eNa_strand_plus && ci1.GetStrand() == eNa_strand_plus) {
1205  if (ci0.GetRange().GetTo() == len - 1 && ci1.GetRange().GetFrom() == 0) {
1206  bad = false;
1207  }
1208  }
// Minus strand: the same test with the roles of the two intervals swapped.
1209  else if (ci0.GetStrand() == eNa_strand_minus && ci1.GetStrand() == eNa_strand_minus) {
1210  if (ci1.GetRange().GetTo() == len - 1 && ci0.GetRange().GetFrom() == 0) {
1211  bad = false;
1212  }
1213  }
1214  }
1215  }
1216  }
1217  }
1218  }
1219  m_Objs["[n] coding region[s] with joined location[s] [has] no exception[s]"][bad ? "[n] coding region[s] not over the origin of circular DNA" : "[n] coding region[s] over the origin of circular DNA"].Severity(bad ? CReportItem::eSeverity_error : CReportItem::eSeverity_warning).Add(*context.SeqFeatObjRef(feat));
1220  }
1221  }
1222  }
1223 }
1227 {
1228  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1229 }
// RIBOSOMAL_SLIPPAGE: on sequences whose source is neither eukaryotic nor an
// organelle, reports (as fatal) coding regions with a mix or packed-int
// location whose exception text contains "ribosomal slippage", except for
// those skipped by the product check below.
1234 DISCREPANCY_CASE(RIBOSOMAL_SLIPPAGE, FEAT, eDisc | eSmart | eFatal, " Only a select number of proteins undergo programmed frameshifts due to ribosomal slippage")
1235 {
1236  const CSeqdesc* biosrc = context.GetBiosource();
1237  if (biosrc && !context.IsEukaryotic(&biosrc->GetSource()) && !context.IsOrganelle(&biosrc->GetSource())) {
1238  for (const CSeq_feat& feat : context.GetFeat()) {
1239  if (feat.IsSetLocation() && feat.CanGetData() && feat.GetData().IsCdregion() && feat.IsSetExcept_text() && (feat.GetLocation().IsMix() || feat.GetLocation().IsPacked_int())) {
1240  if (feat.GetExcept_text().find("ribosomal slippage") != string::npos) {
1241  //string product = GetProductForCDS(feat, context.GetScope()); // sema: may need to change when we start using CFeatTree
1242  string product = context.GetProdForFeature(feat);
// NOTE(review): the condition on `product` that guards this `continue` is not
// visible in this excerpt; presumably it exempts products for which slippage
// is expected.
1244  continue; // note: used to be "return" but that seems wrong
1245  }
1246  m_Objs["[n] coding region[s] [has] unexpected ribosomal slippage"].Fatal().Add(*context.SeqFeatObjRef(feat));
1247  }
1248  }
1249  }
1250  }
1251 }
1255 {
1256  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1257 }
// Report labels for SHORT_INTRON. kShortIntronTop is the label the case files
// its findings under.
1262 const string kShortIntronTop = "[n] intron[s] [is] shorter than 10 nt";
// NOTE(review): in the code visible here this label is referenced only from
// commented-out lines, and it says 11 nt where kShortIntronTop says 10 nt;
// confirm which wording is intended before reviving it.
1263 const string kShortIntronExcept = "[n] intron[s] [is] shorter than 11 nt and [has] an exception";
1265 DISCREPANCY_CASE(SHORT_INTRON, FEAT, eDisc | eOncaller | eSubmitter | eSmart, "Introns shorter than 10 nt")
1266 {
1267  for (const CSeq_feat& feat : context.GetFeat()) {
1268  if (feat.IsSetData() && feat.GetData().IsCdregion() && feat.IsSetLocation() && !feat.IsSetExcept() && !context.IsPseudo(feat)) {
1269  CSeq_loc_CI li(feat.GetLocation());
1270  if (li) {
1271  bool found_short = false;
1272  TSeqPos last_start = li.GetRange().GetFrom();
1273  TSeqPos last_stop = li.GetRange().GetTo();
1274  ++li;
1275  while (li && !found_short) {
1276  TSeqPos start = li.GetRange().GetFrom();
1277  TSeqPos stop = li.GetRange().GetTo();
1278  if (start >= last_stop && start - last_stop < 11) {
1279  found_short = true;
1280  }
1281  else if (last_stop >= start && last_stop - start < 11) {
1282  found_short = true;
1283  }
1284  else if (stop >= last_start && stop - last_start < 11) {
1285  found_short = true;
1286  }
1287  else if (last_start >= stop && last_start - stop < 11) {
1288  found_short = true;
1289  }
1290  last_start = start;
1291  last_stop = stop;
1292  ++li;
1293  }
1294  if (found_short) {
1295  //if (obj.IsSetExcept() && obj.GetExcept()) {
1296  // m_Objs[kShortIntronTop][kShortIntronExcept].Ext().Add(*context.DiscrObj(obj, true));
1297  //}
1298  m_Objs[kShortIntronTop].Add(*context.SeqFeatObjRef(feat, CDiscrepancyContext::eFixSet));
1299  }
1300  }
1301  }
1302  }
1303 }
1307 {
1308  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1309 }
// Comment text that the short-intron gene adjustment code searches for in a
// gene comment and prepends to it when absent.
1312 static const string kPutativeFrameShift = "putative frameshift";
1314 static void AddException(const CSeq_feat& sf, CScope& scope, const string& exception_text)
1315 {
1316  CRef<CSeq_feat> new_feat(new CSeq_feat());
1317  new_feat->Assign(sf);
1318  if (new_feat->IsSetExcept_text() && !NStr::IsBlank(new_feat->GetExcept_text())) {
1319  new_feat->SetExcept_text(new_feat->GetExcept_text() + "; " + exception_text);
1320  } else {
1321  new_feat->SetExcept_text(exception_text);
1322  }
1323  new_feat->SetExcept(true);
1324  CSeq_feat_EditHandle feh(scope.GetSeq_featHandle(sf));
1325  feh.Replace(*new_feat);
1326 }
1330 {
1331  if (sf.IsSetComment() && !NStr::IsBlank(sf.GetComment())) {
1332  if (gene.IsSetComment() && !NStr::IsBlank(gene.GetComment())) {
1333  gene.SetComment(sf.GetComment() + ';' + gene.GetComment());
1334  }
1335  else {
1336  gene.ResetComment();
1338  if (sf.IsSetComment()) {
1339  gene.SetComment(sf.GetComment());
1340  sf.ResetComment();
1341  }
1343  if (is_bacterial) {
1344  sf.SetComment("contains short intron that may represent a frameshift");
1345  }
1346  }
1347  }
1349  if (!gene.IsSetComment() || NStr::Find(gene.GetComment(), kPutativeFrameShift) == NPOS) {
1350  if (gene.IsSetComment() && !NStr::IsBlank(gene.GetComment())) {
1351  gene.SetComment(kPutativeFrameShift + ';' + gene.GetComment());
1352  }
1353  else {
1354  gene.ResetComment();
1356  if (sf.IsSetComment()) {
1357  gene.SetComment(sf.GetComment());
1358  sf.ResetComment();
1359  }
1361  if (is_bacterial) {
1362  sf.SetComment("contains short intron that may represent a frameshift");
1363  }
1364  }
1365  }
1366 }
1369 static void ConvertToMiscFeature(CSeq_feat& sf, CScope& scope)
1370 {
1371  if (sf.IsSetData()) {
1373  if (sf.GetData().IsCdregion() || sf.GetData().IsRna()) {
1375  string prod_name;
1376  if (sf.GetData().IsCdregion()) {
1377  prod_name = GetProductName(sf, scope);
1378  sf.ResetProduct();
1379  }
1380  else {
1381  prod_name = sf.GetData().GetRna().GetRnaProductName();
1382  }
1384  if (!NStr::IsBlank(prod_name)) {
1385  if (sf.IsSetComment()) {
1386  sf.SetComment(prod_name + ';' + sf.GetComment());
1387  }
1388  else {
1389  sf.SetComment(prod_name);
1390  }
1391  }
1393  sf.ResetData();
1394  sf.SetData().SetImp().SetKey("misc_feature");
1395  }
1396  }
1397 }
// Applies the short-intron fix to coding region `sf`.
//  - Mitochondrial source: nothing is changed and false is returned.
//  - Source lineage matching "Bacteria" or "Archea": when a gene is available it
//    is marked pseudo, its comment and location are adjusted, the coding
//    region's product location is queued in `to_remove`, and the coding region
//    is converted to a misc_feature (bacterial) or removed (otherwise).
//  - Anything else, including a missing source: a "low-quality sequence region"
//    exception is added unless the exception text already contains it.
// Returns true when the feature or its gene was modified.
1400 static bool AddExceptionsToShortIntron(const CSeq_feat& sf, CScope& scope, std::list<CConstRef<CSeq_loc>>& to_remove)
1401 {
1402  bool rval = false;
1403  const CBioSource* source = nullptr;
1404  {
// Take the first source descriptor found for the bioseq that `sf` is located on.
1405  auto bsh = scope.GetBioseqHandle(sf.GetLocation());
1406  CSeqdesc_CI src(bsh, CSeqdesc::e_Source);
1407  if (src) {
1408  source = &src->GetSource();
1409  }
1410  }
1411  if (source) {
1412  if (source->IsSetGenome() && source->GetGenome() == CBioSource::eGenome_mitochondrion) {
1413  return false;
1414  }
1415  bool is_bacterial = CDiscrepancyContext::HasLineage(*source, "", "Bacteria");
// NOTE(review): "Archea" differs from the usual spelling of the taxonomic name
// ("Archaea"); confirm that HasLineage() is expected to match this literal.
1416  if (is_bacterial || CDiscrepancyContext::HasLineage(*source, "", "Archea")) {
// NOTE(review): the line that declares and initialises `gene` is not visible in
// this excerpt.
1418  if (gene.NotEmpty()) {
// Constness is cast away so that the gene and the coding region can be edited
// in place.
1419  CSeq_feat* gene_edit = const_cast<CSeq_feat*>(gene.GetPointer());
1420  CSeq_feat& sf_edit = const_cast<CSeq_feat&>(sf);
1421  rval = true;
1422  gene_edit->SetPseudo(true);
1423  AdjustBacterialGeneForCodingRegionWithShortIntron(sf_edit, *gene_edit, is_bacterial);
1424  // Merge gene's location
1425  if (gene_edit->IsSetLocation()) {
1426  CRef<CSeq_loc> new_loc = gene_edit->SetLocation().Merge(CSeq_loc::fMerge_All, nullptr);
1427  if (new_loc.NotEmpty()) {
1428  gene_edit->SetLocation().Assign(*new_loc);
1429  }
1430  }
// The product location is only recorded here; removal is left to the caller.
1431  if (sf.IsSetProduct()) {
1432  to_remove.push_back(CConstRef<CSeq_loc>(&sf.GetProduct()));
1433  }
1434  if (is_bacterial) {
1435  ConvertToMiscFeature(sf_edit, scope);
1436  }
1437  else {
1438  CSeq_feat_EditHandle sf_handle(scope.GetSeq_featHandle(sf));
1439  sf_handle.Remove();
1440  }
1441  }
// Bacterial/archaeal records never fall through to the exception-text path.
1442  return rval;
1443  }
1444  }
1445  if (!sf.IsSetExcept_text() || NStr::Find(sf.GetExcept_text(), "low-quality sequence region") == string::npos) {
1446  AddException(sf, scope, "low-quality sequence region");
1447  rval = true;
1448  }
1449  return rval;
1450 }
1454 {
1455  unsigned int n = 0;
1456  const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
1457  std::list<CConstRef<CSeq_loc>> to_remove;
1458  if (AddExceptionsToShortIntron(*sf, context.GetScope(), to_remove)) {
1459  n++;
1460  }
1461  for (auto& loc : to_remove) {
1462  CBioseq_Handle bioseq_h = context.GetScope().GetBioseqHandle(*loc);
1463  CBioseq_EditHandle bioseq_edit = bioseq_h.GetEditHandle();
1464  bioseq_edit.Remove();
1465  }
1466  obj->SetFixed();
1467  return CRef<CAutofixReport>(n ? new CAutofixReport("SHORT_INTRON: Set exception for [n] feature[s]", n) : nullptr);
1468 }
1473 DISCREPANCY_CASE(UNNECESSARY_VIRUS_GENE, FEAT, eOncaller, "Unnecessary gene features on virus: on when lineage is not Picornaviridae,Potyviridae,Flaviviridae and Togaviridae")
1474 {
1475  const CSeqdesc* biosrc = context.GetBiosource();
1476  if (biosrc) {
1477  const CBioSource* src = &biosrc->GetSource();
1478  if (context.HasLineage(src, "Picornaviridae") || context.HasLineage(src, "Potyviridae") || context.HasLineage(src, "Flaviviridae") || context.HasLineage(src, "Togaviridae")) {
1479  for (const CSeq_feat& feat : context.GetFeat()) {
1480  if (feat.IsSetData() && feat.GetData().IsGene()) {
1481  m_Objs["[n] virus gene[s] need to be removed"].Add(*context.SeqFeatObjRef(feat));
1482  }
1483  }
1484  }
1485  }
1486 }
1490 {
1491  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1492 }
1497 DISCREPANCY_CASE(CDS_HAS_CDD_XREF, FEAT, eDisc | eOncaller, "CDS has CDD Xref")
1498 {
1499  for (const CSeq_feat& feat : context.GetFeat()) {
1500  if (feat.IsSetData() && feat.GetData().IsCdregion() && feat.IsSetDbxref()) {
1501  for (auto& x : feat.GetDbxref()) {
1502  if (x->IsSetDb() && NStr::EqualNocase(x->GetDb(), "CDD")) {
1503  m_Objs["[n] feature[s] [has] CDD Xrefs"].Add(*context.SeqFeatObjRef(feat));
1504  break;
1505  }
1506  }
1507  }
1508  }
1509 }
1513 {
1514  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1515 }
1520 DISCREPANCY_CASE(SHOW_TRANSL_EXCEPT, FEAT, eDisc | eSubmitter | eSmart, "Show translation exception")
1521 {
1522  for (const CSeq_feat& feat : context.GetFeat()) {
1523  if (feat.IsSetData() && feat.GetData().IsCdregion() && feat.GetData().GetCdregion().IsSetCode_break()) {
1524  m_Objs["[n] coding region[s] [has] a translation exception"].Add(*context.SeqFeatObjRef(feat));
1525  }
1526  }
1527 }
1531 {
1532  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1533 }
// Report label under which NO_PRODUCT_STRING files its findings.
1538 static const string kNoProductStr = "[n] product[s] [has] \"no product string in file\"";
1540 DISCREPANCY_CASE(NO_PRODUCT_STRING, FEAT, eDisc, "Product has string \"no product string in file\"")
1541 {
1542  for (const CSeq_feat& feat : context.GetFeat()) {
1543  if (feat.IsSetData() && feat.GetData().IsProt()) {
1544  const CProt_ref& prot = feat.GetData().GetProt();
1545  if (prot.IsSetName()) {
1546  const string* no_prot_str = NStr::FindNoCase(prot.GetName(), "no product string in file");
1547  if (no_prot_str) {
1548  const CSeq_feat* product = sequence::GetCDSForProduct(context.CurrentBioseq(), &context.GetScope());
1549  if (product) {
1550  m_Objs[kNoProductStr].Add(*context.SeqFeatObjRef(*product), false);
1551  }
1552  }
1553  }
1554  }
1555  }
1556 }
1560 {
1561  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1562 }
// Intergenic spacer names that UNWANTED_SPACER searches for (case-insensitively)
// in misc_feature comments.
1567 static const string kIntergenicSpacerNames[] = {
1568  "trnL-trnF intergenic spacer",
1569  "trnH-psbA intergenic spacer",
1570  "trnS-trnG intergenic spacer",
1571  "trnF-trnL intergenic spacer",
1572  "psbA-trnH intergenic spacer",
1573  "trnG-trnS intergenic spacer" };
1578 DISCREPANCY_CASE(UNWANTED_SPACER, FEAT, eOncaller, "Intergenic spacer without plastid location")
1579 {
1580  const CSeqdesc* biosrc = context.GetBiosource();
1581  if (biosrc && biosrc->GetSource().IsSetGenome() && (biosrc->GetSource().GetGenome() == CBioSource::eGenome_chloroplast || biosrc->GetSource().GetGenome() == CBioSource::eGenome_plastid)) {
1582  return;
1583  }
1584  if (biosrc && biosrc->GetSource().IsSetOrg() && biosrc->GetSource().GetOrg().IsSetTaxname() && CDiscrepancyContext::IsUnculturedNonOrganelleName(biosrc->GetSource().GetOrg().GetTaxname())) {
1585  return;
1586  }
1587  for (const CSeq_feat& feat : context.GetFeat()) {
1588  if (feat.IsSetData() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_misc_feature) {
1589  for (size_t i = 0; i < kIntergenicSpacerNames_len; i++) {
1590  if (feat.IsSetComment() && NStr::FindNoCase(feat.GetComment(), kIntergenicSpacerNames[i]) != NPOS) {
1591  m_Objs["[n] suspect intergenic spacer note[s] not organelle"].Add(*context.SeqFeatObjRef(feat));
1592  break;
1593  }
1594  }
1595  }
1596  }
1597 }
1602 DISCREPANCY_CASE(CHECK_RNA_PRODUCTS_AND_COMMENTS, FEAT, eOncaller, "Check for gene or genes in rRNA and tRNA products and comments")
1603 {
1604  for (const CSeq_feat& feat : context.GetFeat()) {
1605  if (feat.IsSetData() && feat.GetData().IsRna()) {
1606  const CRNA_ref& rna = feat.GetData().GetRna();
1607  if ((rna.IsSetType() && rna.GetType() == CRNA_ref::eType_rRNA) || rna.GetType() == CRNA_ref::eType_tRNA) {
1608  string product = rna.GetRnaProductName();
1609  string comment;
1610  if (feat.IsSetComment()) {
1611  comment = feat.GetComment();
1612  }
1613  if (NStr::FindNoCase(product, "gene") != NPOS || NStr::FindNoCase(comment, "gene") != NPOS) {
1614  m_Objs["[n] RNA product_name or comment[s] contain[S] 'suspect phrase'"].Add(*context.SeqFeatObjRef(feat));
1615  }
1616  }
1617  }
1618  }
1619 }
1623 {
1624  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1625 }
// Report labels for FEATURE_LOCATION_CONFLICT: the top-level label, followed by
// the two sub-item prefixes to which the case appends a generated id.
1630 const string kFeatureLocationConflictTop = "[n] feature[s] [has] inconsistent gene location[s].";
1631 const string kFeatureLocationCodingRegion = "Coding region location does not match gene location";
1632 const string kFeatureLocationRNA = "RNA feature location does not match gene location";
1634 bool IsMixedStrand(const CSeq_loc& loc)
1635 {
1636  CSeq_loc_CI li(loc);
1637  if (!li) {
1638  return false;
1639  }
1640  ENa_strand first_strand = li.GetStrand();
1641  if (first_strand == eNa_strand_unknown) {
1642  first_strand = eNa_strand_plus;
1643  }
1644  ++li;
1645  while (li) {
1646  ENa_strand this_strand = li.GetStrand();
1647  if (this_strand == eNa_strand_unknown) {
1648  this_strand = eNa_strand_plus;
1649  }
1650  if (this_strand != first_strand) {
1651  return true;
1652  }
1653  ++li;
1654  }
1655  return false;
1656 }
// Checks a feature location against a gene location when either one has mixed
// strands. Gene intervals are visited one at a time; for each, feature
// intervals on a matching strand are consumed until the inner loop's stop
// condition is met. Returns false on a strand mismatch, when no stop is found
// for a gene interval, or when one location runs out of intervals before the
// other.
// NOTE(review): the second half of the first `if` condition and the condition
// that sets `found_stop` are not visible in this excerpt.
1659 bool IsMixedStrandGeneLocationOk(const CSeq_loc& feat_loc, const CSeq_loc& gene_loc)
1660 {
1661  CSeq_loc_CI feat_i(feat_loc);
1662  CSeq_loc_CI gene_i(gene_loc);
1664  while (feat_i && gene_i) {
1665  ENa_strand gene_strand = gene_i.GetStrand();
1666  if (!StrandsMatch(feat_i.GetStrand(), gene_strand) ||
1668  return false;
1669  }
1670  bool found_stop = false;
// Advance through the feature intervals that belong to this gene interval.
1671  while (!found_stop && feat_i && StrandsMatch(feat_i.GetStrand(), gene_strand)) {
1673  found_stop = true;
1674  }
1675  ++feat_i;
1676  }
1677  if (!found_stop) {
1678  return false;
1679  }
1680  ++gene_i;
1681  }
// Leftover intervals on either side mean the two locations do not line up.
1682  if ((feat_i && !gene_i) || (!feat_i && gene_i)) {
1683  return false;
1684  }
1686  return true;
1687 }
1690 bool StopAbutsGap(const CSeq_loc& loc, ENa_strand strand, CScope& scope)
1691 {
1692  if (CBioseq_Handle bsh = scope.GetBioseqHandle(loc); bsh) {
1693  TSeqPos stop = loc.GetStop(eExtreme_Biological);
1694  if (stop < 1 || stop > bsh.GetBioseqLength() - 2) {
1695  return false;
1696  }
1697  CRef<CSeq_loc> search(new CSeq_loc());
1698  search->SetInt().SetId().Assign(*(loc.GetId()));
1699  if (strand == eNa_strand_minus) {
1700  search->SetInt().SetFrom(stop - 1);
1701  search->SetInt().SetTo(stop - 1);
1702  search->SetInt().SetStrand(eNa_strand_minus);
1703  } else {
1704  search->SetInt().SetFrom(stop + 1);
1705  search->SetInt().SetTo(stop + 1);
1706  }
1707  CSeqVector vec(*search, scope);
1708  if (vec.size() && vec.IsInGap(0)) {
1709  return true;
1710  }
1711  }
1712  return false;
1713 }
1716 bool StartAbutsGap(const CSeq_loc& loc, ENa_strand strand, CScope& scope)
1717 {
1718  if (auto bsh = scope.GetBioseqHandle(loc); bsh) {
1719  TSeqPos start = loc.GetStart(eExtreme_Biological);
1720  if (start < 1 || start > bsh.GetBioseqLength() - 2) {
1721  return false;
1722  }
1723  CRef<CSeq_loc> search(new CSeq_loc());
1724  search->SetInt().SetId().Assign(*(loc.GetId()));
1725  if (strand == eNa_strand_minus) {
1726  search->SetInt().SetFrom(start + 1);
1727  search->SetInt().SetTo(start + 1);
1728  search->SetInt().SetStrand(eNa_strand_minus);
1729  } else {
1730  search->SetInt().SetFrom(start - 1);
1731  search->SetInt().SetTo(start - 1);
1732  }
1733  CSeqVector vec(*search, scope);
1734  if (vec.IsInGap(0)) {
1735  return true;
1736  }
1737  }
1738  return false;
1739 }
1742 // location is ok if:
1743 // 1. endpoints match exactly, or
1744 // 2. non-matching 5' endpoint can be extended by an RBS feature to match gene start, or
1745 // 3. if coding region non-matching endpoints are partial and abut a gap
1746 bool IsGeneLocationOk(const CSeq_loc& feat_loc, const CSeq_loc& gene_loc, ENa_strand feat_strand, ENa_strand gene_strand, bool is_coding_region, CScope& scope, const vector<const CSeq_feat*>& features)
1747 {
1748  if (IsMixedStrand(feat_loc) || IsMixedStrand(gene_loc)) {
1749  // special handling for trans-spliced
1750  return IsMixedStrandGeneLocationOk(feat_loc, gene_loc);
1751  } else if (!StrandsMatch(feat_strand, gene_strand)) {
1752  return false;
1753  } else if (gene_loc.GetStop(eExtreme_Biological) != feat_loc.GetStop(eExtreme_Biological)) {
1754  if (is_coding_region && feat_loc.IsPartialStop(eExtreme_Biological) && StopAbutsGap(feat_loc, feat_strand, scope)) {
1755  // ignore for now
1756  } else {
1757  return false;
1758  }
1759  }
1760  TSeqPos gene_start = gene_loc.GetStart(eExtreme_Biological);
1761  TSeqPos feat_start = feat_loc.GetStart(eExtreme_Biological);
1763  if (gene_start == feat_start) {
1764  return true;
1765  }
1767  CRef<CSeq_loc> rbs_search(new CSeq_loc());
1768  const CSeq_id* id = gene_loc.GetId();
1769  if (!id) {
1770  return false;
1771  }
1772  rbs_search->SetInt().SetId().Assign(*id);
1773  if (feat_loc.GetStrand() == eNa_strand_minus) {
1774  if (gene_start < feat_start) {
1775  return false;
1776  }
1777  rbs_search->SetInt().SetFrom(feat_start + 1);
1778  rbs_search->SetInt().SetTo(gene_start);
1779  rbs_search->SetStrand(eNa_strand_minus);
1780  } else {
1781  if (gene_start > feat_start) {
1782  return false;
1783  }
1784  rbs_search->SetInt().SetFrom(gene_start);
1785  rbs_search->SetInt().SetTo(feat_start - 1);
1786  }
1787  TSeqPos rbs_start = rbs_search->GetStart(eExtreme_Biological);
1788  for (const CSeq_feat* feat : features) {
1789  if (feat->GetLocation().GetStart(eExtreme_Biological) == rbs_start && IsRBS(*feat)) {
1790  return true;
1791  }
1792  }
1793  if (is_coding_region && feat_loc.IsPartialStart(eExtreme_Biological) && StartAbutsGap(feat_loc, feat_strand, scope)) {
1794  // check to see if 5' end is partial and abuts gap
1795  return true;
1796  }
1797  return false;
1798 }
1801 bool GeneRefMatch(const CGene_ref& g1, const CGene_ref& g2)
1802 {
1803  return g1.IsSetLocus() == g2.IsSetLocus() && (!g1.IsSetLocus() || g1.GetLocus() == g2.GetLocus())
1804  && g1.IsSetLocus_tag() == g2.IsSetLocus_tag() && (!g1.IsSetLocus_tag() || g1.GetLocus_tag() == g2.GetLocus_tag())
1805  && g1.IsSetAllele() == g2.IsSetAllele() && (!g1.IsSetAllele() || g1.GetAllele() == g2.GetAllele())
1806  && g1.IsSetDesc() == g2.IsSetDesc() && (!g1.IsSetDesc() || g1.GetDesc() == g2.GetDesc())
1807  && g1.IsSetMaploc() == g2.IsSetMaploc() && (!g1.IsSetMaploc() || g1.GetMaploc() == g2.GetMaploc())
1808  && g1.IsSetPseudo() == g2.IsSetPseudo()
1809  ;
1810 }
1813 static string GetNextSubitemId(size_t num)
1814 {
1815  string ret = "[*";
1816  ret += NStr::SizetToString(num);
1817  ret += "*]";
1818  return ret;
1819 }
1822 DISCREPANCY_CASE(FEATURE_LOCATION_CONFLICT, SEQUENCE, eDisc | eSubmitter | eSmart, "Feature Location Conflict")
1823 {
1824  if (context.InGenProdSet()) {
1825  return;
1826  }
1827  const CSeqdesc* biosrc = context.GetBiosource();
1828  bool eukaryotic = context.IsEukaryotic(biosrc ? &biosrc->GetSource() : nullptr);
1829  const auto& all = context.FeatAll();
1830  for (const CSeq_feat* feat : all) {
1831  if (feat->IsSetData() && feat->IsSetLocation() && (feat->GetData().IsRna() || (!eukaryotic && feat->GetData().IsCdregion()))) {
1832  ENa_strand feat_strand = feat->GetLocation().GetStrand();
1833  const CGene_ref* gx = feat->GetGeneXref();
1834  const CSeq_feat* gene = context.GetGeneForFeature(*feat);
1835  if (!gene || (gx && !gx->IsSuppressed() && !GeneRefMatch(*gx, gene->GetData().GetGene()))) {
1836  if (feat->GetGeneXref()) {
1837  string subitem_id = GetNextSubitemId(m_Objs[kFeatureLocationConflictTop].GetMap().size());
1838  if (feat->GetData().IsCdregion()) {
1839  m_Objs[kFeatureLocationConflictTop]["Coding region xref gene does not exist" + subitem_id].Ext().Add(*context.SeqFeatObjRef(*feat), false);
1840  }
1841  else {
1842  m_Objs[kFeatureLocationConflictTop]["RNA feature xref gene does not exist" + subitem_id].Ext().Add(*context.SeqFeatObjRef(*feat), false);
1843  }
1844  m_Objs[kFeatureLocationConflictTop].Incr();
1845  }
1846  }
1847  else if (gene->IsSetLocation()) {
1848  ENa_strand gene_strand = gene->GetLocation().GetStrand();
1849  if (!IsGeneLocationOk(feat->GetLocation(), gene->GetLocation(), feat_strand, gene_strand, feat->GetData().IsCdregion(), context.GetScope(), all)) {
1850  string subitem_id = GetNextSubitemId(m_Objs[kFeatureLocationConflictTop].GetMap().size());
1851  if (feat->GetData().IsCdregion()) {
1852  m_Objs[kFeatureLocationConflictTop][kFeatureLocationCodingRegion + subitem_id].Ext().Add(*context.SeqFeatObjRef(*feat), false).Add(*context.SeqFeatObjRef(*gene), false);
1853  }
1854  else {
1855  m_Objs[kFeatureLocationConflictTop][kFeatureLocationRNA + subitem_id].Ext().Add(*context.SeqFeatObjRef(*feat), false).Add(*context.SeqFeatObjRef(*gene), false);
1856  }
1857  m_Objs[kFeatureLocationConflictTop].Incr();
1858  }
1859  }
1860  }
1861  }
1862 }
1866 {
1867  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1868 }
// Phrases that SUSPECT_PHRASES iterates over and quotes in its per-phrase
// report labels.
1873 const string suspect_phrases[] =
1874 {
1875  "fragment",
1876  "frameshift",
1877  "%",
1878  "E-value",
1879  "E value",
1880  "Evalue",
1881  "..."
1882 };
// SUSPECT_PHRASES: takes the comment of a coding region, or the description of
// a protein feature, and files the feature under the first entry of
// suspect_phrases selected by the inner condition.
1885 DISCREPANCY_CASE(SUSPECT_PHRASES, FEAT, eDisc | eSubmitter | eSmart, "Suspect Phrases")
1886 {
1887  for (const CSeq_feat& feat : context.GetFeat()) {
1888  if (feat.IsSetData()) {
// `check` stays empty for any other feature type or when the field is not set.
1889  string check;
1890  if (feat.GetData().IsCdregion() && feat.IsSetComment()) {
1891  check = feat.GetComment();
1892  }
1893  else if (feat.GetData().IsProt() && feat.GetData().GetProt().IsSetDesc()) {
1894  check = feat.GetData().GetProt().GetDesc();
1895  }
1896  if (!check.empty()) {
1897  for (size_t i = 0; i < ArraySize(suspect_phrases); i++) {
// NOTE(review): the condition that tests `check` against suspect_phrases[i] is
// not visible in this excerpt.
1899  m_Objs["[n] cds comment[s] or protein description[s] contain[S] suspect_phrase[s]"]["[n] cds comment[s] or protein description[s] contain[S] '" + suspect_phrases[i] + "'"].Summ().Add(*context.SeqFeatObjRef(feat));
// Only the first reported phrase is recorded for a feature.
1900  break;
1901  }
1902  }
1903  }
1904  }
1905  }
1906 }
1910 {
    // NOTE(review): the header line of this block is not visible here; it is
    // presumably the summarize step of SUSPECT_PHRASES.
    // Exports the collected report tree (second Export argument is `false`)
    // and publishes the sub-items of the exported node as the report items.
1911  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
1912 }
1917 DISCREPANCY_CASE(UNUSUAL_MISC_RNA, FEAT, eDisc | eSubmitter | eSmart, "Unexpected misc_RNA features")
1918 {
1919  for (const CSeq_feat& feat : context.GetFeat()) {
1920  if (feat.IsSetData() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_otherRNA) {
1921  const CRNA_ref& rna = feat.GetData().GetRna();
1922  string product = rna.GetRnaProductName();
1923  if (NStr::FindCase(product, "ITS", 0) == NPOS && NStr::FindCase(product, "internal transcribed spacer", 0) == NPOS) {
1924  m_Objs["[n] unexpected misc_RNA feature[s] found. misc_RNAs are unusual in a genome, consider using ncRNA, misc_binding, or misc_feature as appropriate"].Add(*context.SeqFeatObjRef(feat));
1925  }
1926  }
1927  }
1928 }
/// Decides whether an mRNA product name and a CDS product name match.
///
/// Two names match when they are identical, or when the mRNA name has the
/// form "<base>, transcript variant <X>" and the CDS name has the form
/// "<base>, isoform <X>" with the same <base> and the same <X>.
///
/// @param rna_product  product name of the mRNA
/// @param cds_product  product name of the coding region
/// @return true on a match; false otherwise, including when either name
///         is empty.
static bool IsProductMatch(const string& rna_product, const string& cds_product)
{
    if (rna_product.empty() || cds_product.empty()) {
        return false;
    }
    if (rna_product == cds_product) {
        return true;
    }

    static const string kmRNAVariant = ", transcript variant ";
    static const string kCDSVariant = ", isoform ";
    const size_t pos_in_rna = rna_product.find(kmRNAVariant);
    const size_t pos_in_cds = cds_product.find(kCDSVariant);
    if (pos_in_rna == string::npos || pos_in_cds == string::npos || pos_in_rna != pos_in_cds) {
        return false;
    }

    // The text before the variant/isoform marker must be the same in both
    // names. Compare prefix against prefix: comparing the mRNA prefix with
    // the whole CDS name can never succeed, because the CDS name still
    // carries its ", isoform ..." tail.
    if (rna_product.compare(0, pos_in_rna, cds_product, 0, pos_in_cds) != 0) {
        return false;
    }

    // The variant designator after the marker must be the same as well.
    return rna_product.compare(pos_in_rna + kmRNAVariant.size(), string::npos,
                               cds_product, pos_in_cds + kCDSVariant.size(), string::npos) == 0;
}
1954 DISCREPANCY_CASE(CDS_WITHOUT_MRNA, SEQUENCE, eDisc | eOncaller | eSmart, "Coding regions on eukaryotic genomic DNA should have mRNAs with matching products")
1955 {
1956  const CBioseq& bioseq = context.CurrentBioseq();
1957  const CSeqdesc* biosrc = context.GetBiosource();
1958  const CBioSource* src = biosrc ? &biosrc->GetSource() : nullptr;
1959  if (!context.IsEukaryotic(src) || context.IsOrganelle(src) || !bioseq.GetInst().IsSetMol() || bioseq.GetInst().GetMol() != CSeq_inst::eMol_dna) {
1960  return;
1961  }
1963  vector<const CSeq_feat*> cds = context.FeatCDS();
1964  vector<const CSeq_feat*> mrnas = context.FeatMRNAs();
1965  auto cds_it = cds.begin();
1966  while (cds_it != cds.end()) {
1967  if (context.IsPseudo(**cds_it)) {
1968  cds_it = cds.erase(cds_it);
1969  continue;
1970  }
1971  const CSeq_feat* mrna = nullptr;
1972  if ((*cds_it)->IsSetXref()) {
1973  auto rna_it = mrnas.cbegin();
1974  while (rna_it != mrnas.end()) {
1975  if ((*rna_it)->IsSetId()) {
1976  auto& rnaid = (*rna_it)->GetId();
1977  if (rnaid.IsLocal()) {
1978  for (auto xref : (*cds_it)->GetXref()) {
1979  if (xref->IsSetId()) {
1980  auto& id = xref->GetId();
1981  if (id.IsLocal()) {
1982  if (!id.GetLocal().Compare(rnaid.GetLocal())) {
1983  mrna = *rna_it;
1984  break;
1985  }
1986  }
1987  }
1988  }
1989  }
1990  if (mrna) {
1991  mrnas.erase(rna_it);
1992  break;
1993  }
1994  }
1995  ++rna_it;
1996  }
1997  }
1998  if (mrna) {
1999  string prod = context.GetProdForFeature(**cds_it);
2000  if (!IsProductMatch(prod, mrna->GetData().GetRna().GetRnaProductName())) {
2001  m_Objs["[n] coding region[s] [has] mismatching mRNA"].Add(*context.SeqFeatObjRef(**cds_it));
2002  }
2003  cds_it = cds.erase(cds_it);
2004  continue;
2005  }
2006  ++cds_it;
2007  }
2009  for (size_t i = 0; i < cds.size(); i++) {
2010  if (context.IsPseudo(*cds[i])) {
2011  continue;
2012  }
2013  bool found = false;
2014  string prod = context.GetProdForFeature(*cds[i]);
2015  const CSeq_loc& loc_i = cds[i]->GetLocation();
2016  for (size_t j = 0; j < mrnas.size(); j++) {
2017  const CSeq_loc& loc_j = mrnas[j]->GetLocation();
2018  sequence::ECompare compare = context.Compare(loc_j, loc_i);
2019  if (compare == sequence::eContains || compare == sequence::eSame) {
2020  if (IsProductMatch(prod, mrnas[j]->GetData().GetRna().GetRnaProductName())) {
2021  found = true;
2022  break;
2023  }
2024  }
2025  }
2026  if (!found) {
2027  m_Objs["[n] coding region[s] [does] not have an mRNA"].Add(*context.SeqFeatObjRef(*cds[i], CDiscrepancyContext::eFixSet));
2028  }
2029  }
2030 }
// Disabled code: everything between #if 0 and #endif is compiled out.
2033 #if 0
// Builds an mRNA for the given CDS with edit::MakemRNAforCDS. When
// sequence::GetmRNAforCDS finds no existing mRNA, the new one is added to the
// annotation that holds the CDS; otherwise the existing mRNA is replaced.
// Always returns true.
2034 static bool AddmRNAForCDS(const CSeq_feat& cds, CScope& scope)
2035 {
2036  CConstRef<CSeq_feat> old_mRNA = sequence::GetmRNAforCDS(cds, scope);
2037  CRef<CSeq_feat> new_mRNA = edit::MakemRNAforCDS(cds, scope);
2039  if (old_mRNA.Empty()) {
2040  CSeq_feat_EditHandle cds_edit_handle(scope.GetSeq_featHandle(cds));
2041  CSeq_annot_EditHandle annot_handle = cds_edit_handle.GetAnnot();
2042  annot_handle.AddFeat(*new_mRNA);
2043  }
2044  else {
2045  CSeq_feat_EditHandle old_mRNA_edit(scope.GetSeq_featHandle(*old_mRNA));
2046  old_mRNA_edit.Replace(*new_mRNA);
2047  }
2048  return true;
2049 }
2050 #endif
2054 {
    // NOTE(review): the function header and the declarations of `annot_ci`,
    // `ftable`, `bsh` and `aeh` are not visible here; this is presumably the
    // GetAnnotHandle helper used by the CDS_WITHOUT_MRNA autofix -- confirm.
    // Take the first annotation that is a feature table.
2057  for (; annot_ci; ++annot_ci) {
2058  if (annot_ci->IsFtable()) {
2059  ftable = *annot_ci;
2060  break;
2061  }
2062  }
    // No feature table was found: attach a new, empty annotation to the
    // bioseq and use that instead.
2063  if (!ftable) {
2064  CBioseq_EditHandle eh = bsh.GetEditHandle();
2065  CRef<CSeq_annot> new_annot(new CSeq_annot());
2066  ftable = eh.AttachAnnot(*new_annot);
2067  }
2069  return aeh;
2070 }
2074 {
    // NOTE(review): the header line of this block is not visible here; judging
    // by the report text below it is the autofix of CDS_WITHOUT_MRNA.
2075  CScope& scope = context.GetScope();
    // NOTE(review): `sf` is dereferenced below without a null check; it is
    // null if FindObject returns null or the object is not a CSeq_feat.
2076  const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
2077  CConstRef<CSeq_feat> old_mRNA = sequence::GetmRNAforCDS(*sf, scope);
2078  CRef<CSeq_feat> new_mRNA = edit::MakemRNAforCDS(*sf, scope);
2079  if (old_mRNA.Empty()) {
    // No existing mRNA: add the new one to the annotation handle obtained
    // for the bioseq that the new mRNA's location resolves to.
2080  CBioseq_Handle bh = scope.GetBioseqHandle(new_mRNA->GetLocation());
2081  CSeq_annot_EditHandle annot_handle = GetAnnotHandle(bh);
2082  annot_handle.AddFeat(*new_mRNA);
2083  }
2084  else {
    // An mRNA already exists for this CDS: replace it with the new one.
2085  CSeq_feat_EditHandle old_mRNA_edit(scope.GetSeq_featHandle(*old_mRNA));
2086  old_mRNA_edit.Replace(*new_mRNA);
2087  }
2088  obj->SetFixed();
    // The report is created with a count of 1.
2089  return CRef<CAutofixReport>(new CAutofixReport("CDS_WITHOUT_MRNA: Add mRNA for [n] CDS feature[s]", 1));
2090 }
2095 DISCREPANCY_CASE(PROTEIN_NAMES, FEAT, eDisc | eSubmitter | eSmart, "Frequently appearing proteins")
2096 {
2097  for (const CSeq_feat& feat : context.GetFeat()) {
2098  if (feat.IsSetData() && feat.GetData().IsProt()) {
2099  const CProt_ref& prot = feat.GetData().GetProt();
2100  if (prot.IsSetName() && !prot.GetName().empty()) {
2101  m_Objs[feat.GetData().GetProt().GetName().front()].Incr();
2102  }
2103  }
2104  }
2105 }
2109 {
    // NOTE(review): the header line of this block is not visible here; it is
    // presumably the summarize step of PROTEIN_NAMES.
    // Minimum count the single name must reach before anything is reported.
2110  static const size_t MIN_REPORTABLE_AMOUNT = 100;
2111  auto& M = m_Objs.GetMap();
    // Report only when exactly one distinct key was collected and its count
    // reaches the threshold; otherwise m_ReportItems is left untouched.
2112  if (M.size() == 1 && M.begin()->second->GetCount() >= MIN_REPORTABLE_AMOUNT) {
2113  CReportNode rep;
    // Creates a node whose label embeds the shared name; nothing is added
    // to that node here.
2114  rep["All proteins have same name [(]\"" + M.begin()->first + "\""];
2115  m_ReportItems = rep.Export(*this)->GetSubitems();
2116  }
2117 }
2122 static bool IsmRnaQualsPresent(const CSeq_feat::TQual& quals)
2123 {
2124  bool protein_id = false,
2125  transcript_id = false;
2127  for (const auto& qual : quals) {
2128  if (qual->IsSetQual()) {
2130  if (qual->GetQual() == "orig_protein_id") {
2131  protein_id = true;
2132  }
2134  if (qual->GetQual() == "orig_transcript_id") {
2135  transcript_id = true;
2136  }
2138  if (protein_id && transcript_id) {
2139  break;
2140  }
2141  }
2142  }
2144  return protein_id && transcript_id;
2145 }
2148 DISCREPANCY_CASE(MRNA_SHOULD_HAVE_PROTEIN_TRANSCRIPT_IDS, SEQUENCE, eDisc | eSubmitter | eSmart | eFatal, "mRNA should have both protein_id and transcript_id")
2149 {
2150  const CBioseq& bioseq = context.CurrentBioseq();
2151  const CSeqdesc* biosrc = context.GetBiosource();
2152  if (biosrc && context.IsEukaryotic(&biosrc->GetSource()) && bioseq.GetInst().IsSetMol() && bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna) {
2153  for (const CSeq_feat& feat : context.GetAllFeat()) {
2154  if (feat.IsSetData() && feat.GetData().IsCdregion() && !context.IsPseudo(feat)) {
2155  CConstRef<CSeq_feat> mRNA = sequence::GetmRNAforCDS(feat, context.GetScope());
2156  if (mRNA && (!mRNA->IsSetQual() || !IsmRnaQualsPresent(mRNA->GetQual()))) {
2157  m_Objs.Add(*context.SeqFeatObjRef(feat)).Fatal();
2158  }
2159  }
2160  }
2161  }
2162 }
2166 {
    // NOTE(review): the header line of this block is not visible here; it is
    // presumably the summarize step of MRNA_SHOULD_HAVE_PROTEIN_TRANSCRIPT_IDS.
    // When anything was collected, the report consists of a single message
    // node; the collected objects themselves are not copied into `out`.
2167  if (!m_Objs.empty()) {
2168  CReportNode out;
2169  out["no protein_id and transcript_id present"];
2170  m_ReportItems = out.Export(*this)->GetSubitems();
2171  }
2172 }
2177 static const string kFeatureList = "Feature List";
2179 DISCREPANCY_CASE(FEATURE_LIST, FEAT, eDisc | eSubmitter, "Feature List")
2180 {
2181  for (const CSeq_feat& feat : context.GetFeat()) {
2182  if (feat.IsSetData() && feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_gap && feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_prot) {
2183  string subitem = "[n] " + feat.GetData().GetKey();
2184  subitem += " feature[s]";
2185  m_Objs[kFeatureList].Info()[subitem].Info().Add(*context.SeqFeatObjRef(feat));
2186  }
2187  }
2188 }
2193 DISCREPANCY_CASE(MULTIPLE_QUALS, FEAT, eDisc | eOncaller, "Multiple qualifiers")
2194 {
2195  for (const CSeq_feat& feat : context.GetFeat()) {
2196  if (feat.IsSetQual()) {
2197  size_t num_of_number_quals = 0;
2198  for (const auto& qual : feat.GetQual()) {
2199  if (qual->IsSetQual() && qual->GetQual() == "number") {
2200  ++num_of_number_quals;
2201  if (num_of_number_quals > 1) {
2202  m_Objs["[n] feature[s] contain[S] multiple /number qualifiers"].Add(*context.SeqFeatObjRef(feat));
2203  break;
2204  }
2205  }
2206  }
2207  }
2208  }
2209 }
2214 DISCREPANCY_CASE(MISC_FEATURE_WITH_PRODUCT_QUAL, FEAT, eDisc | eOncaller | eSubmitter | eSmart, "Misc features containing a product qualifier")
2215 {
2216  for (const CSeq_feat& feat : context.GetFeat()) {
2217  if (feat.IsSetData() && feat.IsSetQual() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_misc_feature) {
2218  for (const auto& qual : feat.GetQual()) {
2219  if (qual->IsSetQual() && qual->GetQual() == "product") {
2220  m_Objs["[n] feature[s] [has] a product qualifier"].Add(*context.SeqFeatObjRef(feat));
2221  }
2222  }
2223  }
2224  }
2225 }
// Report key used by the adjacent-tRNA check for coding regions.
2230 const string kCDShasNoTRNA = "[n] coding region[s] [does] not have adjacent tRNA";
// Returns true when the amino-acid value stored in `aa` equals 25
// (STOP_CODON below); returns false when the value differs or when the
// active choice is not one of the handled encodings.
// NOTE(review): the `case` labels of the switch are not visible here, and a
// line between the first assignment and its `break` is also not visible --
// confirm the full switch against the complete file.
2232 static bool IsStopCodon(const CCode_break::C_Aa& aa)
2233 {
    // -1 means "no recognised encoding"; it never equals STOP_CODON.
2234  int aa_idx = -1;
2235  switch (aa.Which()) {
2237  aa_idx = aa.GetNcbieaa();
2239  break;
2241  aa_idx = aa.GetNcbi8aa();
2242  break;
2244  aa_idx = aa.GetNcbistdaa();
2245  break;
2246  default:
2247  break;
2248  }
2249  static const int STOP_CODON = 25;
2250  return aa_idx == STOP_CODON;
2251 }
2255 {
    // NOTE(review): the header line of this block is not visible here; judging
    // by kCDShasNoTRNA it is the adjacent-tRNA check for coding regions.
    // Only sequences whose biosource genome is mitochondrion are examined.
2256  const CSeqdesc* biosrc = context.GetBiosource();
2257  if (!biosrc || biosrc->GetSource().GetGenome() != CBioSource::eGenome_mitochondrion) {
2258  return;
2259  }
2260  const auto& cds = context.FeatCDS();
2261  const auto& trnas = context.FeatTRNAs();
2262  for (size_t i = 0; i < cds.size(); i++) {
    // Skip coding regions without code breaks.
2263  if (!cds[i]->GetData().GetCdregion().IsSetCode_break()) {
2264  continue;
2265  }
    // Only the first code break is inspected, and it must be a stop codon
    // according to IsStopCodon.
2266  const CCode_break& code_break = *cds[i]->GetData().GetCdregion().GetCode_break().front();
2267  if (!code_break.IsSetAa() || !IsStopCodon(code_break.GetAa())) {
2268  continue;
2269  }
2270  ENa_strand strand = cds[i]->GetLocation().IsSetStrand() ? cds[i]->GetLocation().GetStrand() : eNa_strand_unknown;
    // Find the tRNA with the smallest distance; UINT_MAX means "no
    // candidate yet" and is also the distance given to tRNAs that fail the
    // ordering test below.
2272  const CSeq_feat* nearest_trna = nullptr;
2273  TSeqPos diff = UINT_MAX;
2274  for (const CSeq_feat* trna : trnas) {
2275  if (trna->IsSetLocation()) {
    // NOTE(review): the declarations of `start` and `stop` are not
    // visible here -- confirm which feature each position comes from.
2277  TSeqPos cur_diff = UINT_MAX;
2278  if (strand == eNa_strand_minus) {
2279  if (start <= stop) {
2280  cur_diff = stop - start;
2281  }
2282  }
2283  else {
2284  if (start >= stop) {
2285  cur_diff = start - stop;
2286  }
2287  }
2288  if (cur_diff < diff) {
2289  diff = cur_diff;
2290  nearest_trna = trna;
2291  }
2292  }
2293  }
2294  if (nearest_trna) {
2295  ENa_strand trna_strand = nearest_trna->GetLocation().IsSetStrand() ? nearest_trna->GetLocation().GetStrand() : eNa_strand_unknown;
    // Report the pair when the nearest tRNA is on the same strand and
    // the distance is greater than 1. Incr() is called after adding the
    // CDS only, not after adding the tRNA.
2296  if (trna_strand == strand && diff > 1) {
2297  m_Objs[kCDShasNoTRNA].Add(*context.SeqFeatObjRef(*cds[i]), false).Incr();
2298  m_Objs[kCDShasNoTRNA].Add(*context.SeqFeatObjRef(*nearest_trna), false);
2299  }
2300  }
2301  }
2302 }
2305 // MITO_RRNA
2307 DISCREPANCY_CASE(MITO_RRNA, SEQUENCE, eOncaller, "Non-mitochondrial rRNAs with 12S/16S")
2308 {
2309  const CSeqdesc* biosrc = context.GetBiosource();
2310  if (context.IsEukaryotic(biosrc ? &biosrc->GetSource() : nullptr)) {
2311  const auto& rnas = context.Feat_RNAs();
2312  for (size_t i = 0; i < rnas.size(); i++) {
2313  if (rnas[i]->GetData().GetSubtype() == CSeqFeatData::eSubtype_rRNA && rnas[i]->GetData().GetRna().IsSetExt() && rnas[i]->GetData().GetRna().GetExt().IsName()) {
2314  const string& name = rnas[i]->GetData().GetRna().GetExt().GetName();
2315  if (name.find("16S") != string::npos || name.find("12S") != string::npos) {
2316  m_Objs["[n] non mitochondrial rRNA name[s] contain[S] 12S/16S"].Add(*context.SeqFeatObjRef(*rnas[i]));
2317  }
2318  }
2319  }
2320  }
2321 }
