NCBI C++ ToolKit
overlapping_features.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: overlapping_features.cpp 99182 2023-02-23 12:39:54Z gotvyans $
2  * =========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * =========================================================================
25  *
26  * Authors: Sema Kachalo
27  *
28  */
29 
30 #include <ncbi_pch.hpp>
31 #include "discrepancy_core.hpp"
33 
37 
38 // CDS_TRNA_OVERLAP
39 
40 static const string kCDSoverlapTRNA = "[n] Bioseq[s] [has] coding regions that overlap tRNAs";
41 
42 DISCREPANCY_CASE(CDS_TRNA_OVERLAP, SEQUENCE, eDisc | eSubmitter | eSmart, "CDS tRNA Overlap")
43 {
44  const auto& cds = context.FeatCDS();
45  const auto& trnas = context.FeatTRNAs();
46 
47  bool increase_count = false;
48  static size_t bs_count = 0;
49  string report_item_str = "[n] coding region[s] [has] overlapping tRNAs[*" + to_string(++bs_count) + "*]";
50 
51  for (size_t i = 0; i < cds.size(); i++) {
52  const CSeq_loc& loc_i = cds[i]->GetLocation();
53  bool has_overlap = false;
54  string cur_report_cds_trna_pair_str = "Coding region overlaps tRNAs[*" + to_string(i) + "*]";
55  ENa_strand cds_strand = loc_i.IsSetStrand() ? loc_i.GetStrand() : eNa_strand_unknown;
56  for (size_t j = 0; j < trnas.size(); j++) {
57  const CSeq_loc& loc_j = trnas[j]->GetLocation();
59  ENa_strand trna_strand = loc_j.IsSetStrand() ? loc_j.GetStrand() : eNa_strand_unknown;
60  bool need_to_compare = (cds_strand == eNa_strand_minus) == (trna_strand == eNa_strand_minus);
61  if (need_to_compare) {
62  ovlp = context.Compare(loc_i, loc_j);
63  }
64  if (ovlp != sequence::eNoOverlap) {
65  increase_count = true;
66  CReportNode& out = m_Objs[kCDSoverlapTRNA][report_item_str].Ext();
67  CReportNode& subitem = out[cur_report_cds_trna_pair_str];
68  if (!has_overlap) {
69  out.Incr();
70  has_overlap = true;
71  subitem.Ext().Add(*context.SeqFeatObjRef(*cds[i]));
72  }
73  subitem.Ext().Add(*context.SeqFeatObjRef(*trnas[j]));
74  }
75  }
76  }
77  if (increase_count) {
78  m_Objs[kCDSoverlapTRNA].Incr();
79  }
80 }
81 
82 
83 static const string kCdsTrnaOverlapComment = "TAA stop codon is completed by the addition of 3' A residues to the mRNA";
84 
85 DISCREPANCY_CASE(_CDS_TRNA_OVERLAP, SEQUENCE, 0, "CDS tRNA Overlap - autofix") {}
86 DISCREPANCY_SUMMARIZE(_CDS_TRNA_OVERLAP) {}
87 
88 DISCREPANCY_AUTOFIX(_CDS_TRNA_OVERLAP)
89 {
90  const CSeq_feat& cds = dynamic_cast<const CSeq_feat&>(*context.FindObject(*obj));
91  const CSeq_loc& loc = cds.GetLocation();
92  CBioseq_Handle bsh = context.GetScope().GetBioseqHandle(loc);
94  ENa_strand cds_strand = loc.IsSetStrand() ? loc.GetStrand() : eNa_strand_unknown;
96  CConstRef<CSeq_loc> other;
97  int ovlp_len = 0;
98  while (f) {
99  if (f->GetData().GetSubtype() == CSeqFeatData::eSubtype_tRNA) {
100  const CSeq_loc& loc_t = f->GetLocation();
101  ENa_strand trna_strand = loc_t.IsSetStrand() ? loc_t.GetStrand() : eNa_strand_unknown;
102  if ((cds_strand == eNa_strand_minus) == (trna_strand == eNa_strand_minus)) {
103  CSeq_loc::TRange r2 = loc_t.GetTotalRange();
104  if (!(r1.GetFrom() >= r2.GetToOpen() || r2.GetFrom() >= r1.GetToOpen())) {
105  ovlp_len = r1.GetToOpen() - r2.GetFrom();
106  if (ovlp_len > 0 && ovlp_len < 3 && cds_strand != eNa_strand_minus) {
107  other.Reset(&loc_t);
108  break;
109  }
110  else {
111  ovlp_len = r2.GetToOpen() - r1.GetFrom();
112  if (ovlp_len > 0 && ovlp_len < 3 && cds_strand == eNa_strand_minus) {
113  other.Reset(&loc_t);
114  break;
115  }
116  else {
117  ovlp_len = 0;
118  }
119  }
120  }
121  }
122  }
123  ++f;
124  }
125  if (ovlp_len) {
126  CConstRef<CSeq_feat> gene = sequence::GetGeneForFeature(cds, context.GetScope());
127  CRef<CSeq_feat> new_cds(new CSeq_feat());
128  new_cds->Assign(cds);
129  new_cds->SetLocation().Assign(*sequence::Seq_loc_Subtract(new_cds->GetLocation(), *other, CSeq_loc::fStrand_Ignore, &context.GetScope()));
130  CRef<CCode_break> code_break(new CCode_break);
131  CRef<CSeq_loc> br_loc(new CSeq_loc);
132  br_loc->Assign(new_cds->GetLocation());
133  if (!br_loc->GetId()) {
134  return CRef<CAutofixReport>(new CAutofixReport("CDS_TRNA_OVERLAP: cannot trim [n] CDS", 1));
135  }
136  CSeq_loc::TRange rr = br_loc->GetTotalRange();
137  CRef<CSeq_id> seq_id(new CSeq_id);
138  seq_id->Assign(*br_loc->GetId());
139  if (br_loc->GetStrand() == eNa_strand_minus) {
140  br_loc->SetInt().SetId(*seq_id);
141  br_loc->SetInt().SetFrom(rr.GetFrom());
142  br_loc->SetInt().SetTo(rr.GetFrom() + 2 - ovlp_len);
143  }
144  else {
145  br_loc->SetInt().SetId(*seq_id);
146  br_loc->SetInt().SetFrom(rr.GetTo() - 2 + ovlp_len);
147  br_loc->SetInt().SetTo(rr.GetTo());
148  }
149  br_loc->SetPartialStart(false, eExtreme_Positional);
150  br_loc->SetPartialStop(false, eExtreme_Positional);
151  code_break->SetLoc().Assign(*br_loc);
152  code_break->SetAa().SetNcbieaa('*');
153  string comment;
154  if (new_cds->CanGetComment()) {
155  comment = new_cds->GetComment();
156  }
157  if (comment.find(kCdsTrnaOverlapComment) == string::npos) {
158  if (comment.length()) {
159  comment += "; ";
160  }
161  comment += kCdsTrnaOverlapComment;
162  }
163  new_cds->SetComment(comment);
164  new_cds->SetData().SetCdregion().SetCode_break().push_back(code_break);
165  context.ReplaceSeq_feat(*obj, cds, *new_cds);
166  //CSeq_feat_EditHandle feh(context.GetScope().GetSeq_featHandle(cds));
167  //feh.Replace(*new_cds);
168 
169  if (gene) {
170  CRef<CSeq_feat> new_gene(new CSeq_feat());
171  new_gene->Assign(*gene);
172  new_gene->SetLocation().Assign(*sequence::Seq_loc_Subtract(new_gene->GetLocation(), *other, CSeq_loc::fStrand_Ignore, &context.GetScope()));
173  CSeq_feat_EditHandle feh(context.GetScope().GetSeq_featHandle(*gene));
174  feh.Replace(*new_gene);
175  }
176  return CRef<CAutofixReport>(new CAutofixReport("CDS_TRNA_OVERLAP: [n] CDS trimmed", 1));
177  }
178  return CRef<CAutofixReport>();
179 }
180 
181 
182 // RNA_CDS_OVERLAP
183 
/// One rRNA length rule: a length threshold plus a flag that the callers
/// combine with the feature's partial status.
typedef pair<size_t, bool> TRNALength;
/// Maps a fragment of an rRNA product name to its length rule.
typedef map<string, TRNALength> TRNALengthMap;

/// Length rules, matched by case-insensitive substring search of the rRNA
/// product name (see IsShortrRNA and RNA_CDS_OVERLAP).
static const TRNALengthMap kTrnaLengthMap{
    { "16S", { 1000, false } },
    { "18S", { 1000, false } },
    { "23S", { 2000, false } },
    { "25S", { 1000, false } },
    { "26S", { 1000, false } },
    { "28S", { 3300, false } },
    { "small", { 1000, false } },
    { "large", { 1000, false } },
    { "5.8S", { 130, true } },
    { "5S", { 90, true } }
    // possible problem: if it matches /25S/ it would also match /5S/
    // luckily, if it fails the /5S/ rule it would fail the /25S/ rule
};
201 
202 
203 bool IsShortrRNA(const CSeq_feat& f, CScope* scope) // used in feature_tests.cpp
204 {
205  if (f.GetData().GetSubtype() != CSeqFeatData::eSubtype_rRNA) {
206  return false;
207  }
208  bool is_bad = false;
209  size_t len = sequence::GetLength(f.GetLocation(), scope);
210  string rrna_name = f.GetData().GetRna().GetRnaProductName();
211  for (const auto& it : kTrnaLengthMap) {
212  SIZE_TYPE pos = NStr::FindNoCase(rrna_name, it.first);
213  if (pos != NPOS && len < it.second.first && !(it.second.second && f.IsSetPartial() && f.GetPartial()) ) {
214  is_bad = true;
215  break;
216  }
217  }
218  return is_bad;
219 }
220 
221 
222 const string kCDSRNAAnyOverlap = "[n/2] coding region[s] overlap RNA feature[s]";
223 const string kCDSRNAExactMatch = "[n/2] coding region location[s] exactly match an RNA location";
224 const string kCDSRNAContainedIn = "[n/2] coding region[s] [is] completely contained in RNA[s]";
225 const string kCDSRNAContains = "[n/2] coding region[s] completely contain RNA[s]";
226 const string kCDSRNAContainstRNA = "[n/2] coding region[s] completely contain tRNA[s]";
227 const string kCDSRNAOverlapNoContain = "[n/2] coding regions overlap RNA[s] (no containment)";
228 const string kCDSRNAOverlapNoContainSameStrand = "[n/2] coding region[s] overlap RNA[s] on the same strand (no containment)";
229 const string kCDSRNAOverlapNoContainOppStrand = "[n/2] coding region[s] overlap RNA[s] on the opposite strand (no containment)";
230 
231 
232 DISCREPANCY_CASE(RNA_CDS_OVERLAP, SEQUENCE, eDisc | eSubmitter | eSmart | eFatal, "CDS RNA Overlap")
233 {
234  const CSeqdesc* biosrc = context.GetBiosource();
235  bool is_eukariotic = context.IsEukaryotic(biosrc ? &biosrc->GetSource() : nullptr);
236 
237  const auto& cds = context.FeatCDS();
238  const auto& rnas = context.Feat_RNAs();
239  for (size_t i = 0; i < rnas.size(); i++) {
240  const CSeq_loc& loc_i = rnas[i]->GetLocation();
241  CSeqFeatData::ESubtype subtype = rnas[i]->GetData().GetSubtype();
242  if ((subtype == CSeqFeatData::eSubtype_tRNA && is_eukariotic) || subtype == CSeqFeatData::eSubtype_mRNA || subtype == CSeqFeatData::eSubtype_ncRNA) {
243  continue;
244  }
245  else if (subtype == CSeqFeatData::eSubtype_rRNA) {
246  size_t len = sequence::GetLength(loc_i, &context.GetScope());
247  string rrna_name = rnas[i]->GetData().GetRna().GetRnaProductName();
248  bool is_bad = false;
249  for (const auto& it : kTrnaLengthMap) {
250  if (NStr::FindNoCase(rrna_name, it.first) != NPOS && len < it.second.first && (!it.second.second || (rnas[i]->IsSetPartial() && rnas[i]->GetPartial())) ) {
251  is_bad = true;
252  break;
253  }
254  }
255  if (is_bad) {
256  continue;
257  }
258  }
259  for (size_t j = 0; j < cds.size(); j++) {
260  const CSeq_loc& loc_j = cds[j]->GetLocation();
261  sequence::ECompare compare = context.Compare(loc_j, loc_i);
262  if (compare == sequence::eSame) {
263  m_Objs[kCDSRNAAnyOverlap][kCDSRNAExactMatch].Add(*context.SeqFeatObjRef(*rnas[i]), false).Add(*context.SeqFeatObjRef(*cds[j]), false).Fatal();
264  }
265  else if (compare == sequence::eContained) {
266  m_Objs[kCDSRNAAnyOverlap][kCDSRNAContainedIn].Add(*context.SeqFeatObjRef(*rnas[i]), false).Add(*context.SeqFeatObjRef(*cds[j]), false); // no Fatal();
267  }
268  else if (compare == sequence::eContains) {
269  if (rnas[i]->GetData().GetSubtype() == CSeqFeatData::eSubtype_tRNA) {
270  m_Objs[kCDSRNAAnyOverlap][kCDSRNAContainstRNA].Add(*context.SeqFeatObjRef(*rnas[i]), false).Add(*context.SeqFeatObjRef(*cds[j]), false).Fatal();
271  }
272  else {
273  m_Objs[kCDSRNAAnyOverlap][kCDSRNAContains].Add(*context.SeqFeatObjRef(*rnas[i]), false).Add(*context.SeqFeatObjRef(*cds[j]), false).Fatal();
274  }
275  }
276  else if (compare != sequence::eNoOverlap) {
277  ENa_strand cds_strand = cds[j]->GetLocation().GetStrand();
278  ENa_strand rna_strand = rnas[i]->GetLocation().GetStrand();
279  if (cds_strand == eNa_strand_minus && rna_strand != eNa_strand_minus) {
280  m_Objs[kCDSRNAAnyOverlap][kCDSRNAOverlapNoContain][kCDSRNAOverlapNoContainOppStrand].Add(*context.SeqFeatObjRef(*rnas[i]), false).Add(*context.SeqFeatObjRef(*cds[j]), false); // no Fatal();
281  }
282  else if (cds_strand != eNa_strand_minus && rna_strand == eNa_strand_minus) {
283  m_Objs[kCDSRNAAnyOverlap][kCDSRNAOverlapNoContain][kCDSRNAOverlapNoContainOppStrand].Add(*context.SeqFeatObjRef(*rnas[i]), false).Add(*context.SeqFeatObjRef(*cds[j]), false); // no Fatal();
284  }
285  else {
286  m_Objs[kCDSRNAAnyOverlap][kCDSRNAOverlapNoContain][kCDSRNAOverlapNoContainSameStrand].Add(*context.SeqFeatObjRef(*rnas[i]), false).Add(*context.SeqFeatObjRef(*cds[j]), false); // no Fatal();
287  }
288  }
289  }
290  }
291 }
292 
293 
294 DISCREPANCY_SUMMARIZE(RNA_CDS_OVERLAP)
295 {
296  m_ReportItems = m_Objs.Export(*this, false)->GetSubitems();
297 }
298 
299 
300 DISCREPANCY_CASE(OVERLAPPING_RRNAS, SEQUENCE, eDisc | eSubmitter | eSmart | eFatal, "Overlapping rRNAs")
301 {
302  const auto& rrnas = context.FeatRRNAs();
303  for (size_t i = 0; i < rrnas.size(); i++) {
304  const CSeq_loc& loc_i = rrnas[i]->GetLocation();
305  for (size_t j = i + 1; j < rrnas.size(); j++) {
306  const CSeq_loc& loc_j = rrnas[j]->GetLocation();
307  if (context.Compare(loc_j, loc_i) != sequence::eNoOverlap) {
308  m_Objs["[n] rRNA feature[s] overlap[S] another rRNA feature."].Add(*context.SeqFeatObjRef(*rrnas[i])).Add(*context.SeqFeatObjRef(*rrnas[j])).Fatal();
309  }
310  }
311  }
312 }
313 
314 
315 // OVERLAPPING_GENES
316 
317 DISCREPANCY_CASE(OVERLAPPING_GENES, SEQUENCE, eDisc, "Overlapping Genes")
318 {
319  const auto& genes = context.FeatGenes();
320  for (size_t i = 0; i < genes.size(); i++) {
321  const CSeq_loc& loc_i = genes[i]->GetLocation();
322  ENa_strand strand_i = loc_i.GetStrand();
323  for (size_t j = i + 1; j < genes.size(); j++) {
324  const CSeq_loc& loc_j = genes[j]->GetLocation();
325  if (loc_j.GetStrand() == strand_i && context.Compare(loc_j, loc_i) != sequence::eNoOverlap) {
326  m_Objs["[n] gene[s] overlap[S] another gene on the same strand."].Add(*context.SeqFeatObjRef(*genes[i])).Add(*context.SeqFeatObjRef(*genes[j]));
327  }
328  }
329  }
330 }
331 
332 
333 // FIND_OVERLAPPED_GENES
334 
335 DISCREPANCY_CASE(FIND_OVERLAPPED_GENES, SEQUENCE, eDisc | eSmart, "Genes completely contained by another gene on the same strand")
336 {
337  const auto& genes = context.FeatGenes();
338  for (size_t i = 0; i < genes.size(); i++) {
339  const CSeq_loc& loc_i = genes[i]->GetLocation();
340  ENa_strand strand_i = loc_i.IsSetStrand() ? loc_i.GetStrand() : eNa_strand_unknown;
341 
342  for (size_t j = i + 1; j < genes.size(); j++) {
343  const CSeq_loc& loc_j = genes[j]->GetLocation();
344  ENa_strand strand_j = loc_j.IsSetStrand() ? loc_j.GetStrand() : eNa_strand_unknown;
345 
346  if (strand_i == strand_j) {
347 
348  sequence::ECompare ovlp = context.Compare(loc_i, loc_j);
349  if (ovlp == sequence::eContained || ovlp == sequence::eSame) {
350  m_Objs["[n] gene[s] completely overlapped by other genes"].Add(*context.SeqFeatObjRef(*genes[i]));
351  }
352  else if (ovlp == sequence::eContains) {
353  m_Objs["[n] gene[s] completely overlapped by other genes"].Add(*context.SeqFeatObjRef(*genes[j]));
354  }
355  }
356  }
357  }
358 }
359 
360 
361 // DUP_GENES_OPPOSITE_STRANDS
362 
363 DISCREPANCY_CASE(DUP_GENES_OPPOSITE_STRANDS, SEQUENCE, eDisc | eOncaller | eSubmitter | eSmart, "Genes match other genes in the same location, but on the opposite strand")
364 {
365  const auto& genes = context.FeatGenes();
366  for (size_t i = 0; i < genes.size(); i++) {
367  const CSeq_loc& loc_i = genes[i]->GetLocation();
368  ENa_strand strand_i = loc_i.GetStrand();
369  for (size_t j = i + 1; j < genes.size(); j++) {
370  const CSeq_loc& loc_j = genes[j]->GetLocation();
371  if ((loc_j.GetStrand() == eNa_strand_minus) == (strand_i == eNa_strand_minus)) {
372  continue;
373  }
374  sequence::ECompare ovlp = context.Compare(loc_i, loc_j);
375  if (ovlp == sequence::eSame) {
376  m_Objs["[n] genes match other genes in the same location, but on the opposite strand"].Add(*context.SeqFeatObjRef(*genes[i])).Add(*context.SeqFeatObjRef(*genes[j]));
377  }
378  }
379  }
380 }
381 
382 
383 // MRNA_OVERLAPPING_PSEUDO_GENE
384 
385 DISCREPANCY_CASE(MRNA_OVERLAPPING_PSEUDO_GENE, SEQUENCE, eOncaller, "mRNA overlapping pseudo gene")
386 {
387  const auto& pseudo = context.FeatPseudo();
388  const auto& mrnas = context.FeatMRNAs();
389  for (size_t i = 0; i < mrnas.size(); i++) {
390  const CSeq_loc& loc_i = mrnas[i]->GetLocation();
391  for (size_t j = 0; j < pseudo.size(); j++) {
392  const CSeq_loc& loc_j = pseudo[j]->GetLocation();
393  sequence::ECompare ovlp = context.Compare(loc_i, loc_j);
394  if (ovlp != sequence::eNoOverlap) {
395  m_Objs["[n] Pseudogene[s] [has] overlapping mRNA[s]."].Add(*context.SeqFeatObjRef(*mrnas[i], CDiscrepancyContext::eFixSet)); // should say "n mRNAs overlapping pseudogenes", but C Toolkit reports this way.
396  break;
397  }
398  }
399  }
400 }
401 
402 
403 DISCREPANCY_AUTOFIX(MRNA_OVERLAPPING_PSEUDO_GENE)
404 {
405  const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
406  CSeq_feat_EditHandle eh = CSeq_feat_EditHandle(context.GetScope().GetSeq_featHandle(*sf));
407  eh.Remove();
408  obj->SetFixed();
409  return CRef<CAutofixReport>(new CAutofixReport("MRNA_OVERLAPPING_PSEUDO_GENE: [n] mRNA[s] removed", 1));
410 }
411 
412 
413 // EXON_INTRON_CONFLICT
414 
415 static bool less(const CSeq_feat* A, const CSeq_feat* B)
416 {
417  unsigned int a = A->GetLocation().GetStart(eExtreme_Positional);
418  unsigned int b = B->GetLocation().GetStart(eExtreme_Positional);
419  if (a != b) {
420  return a < b;
421  }
422  a = A->GetLocation().GetStop(eExtreme_Positional);
424  return a < b;
425 }
426 
427 static const string kIntronExon = "[n] introns and exons are incorrectly positioned";
428 
429 static void CollectExonsIntrons(CReportNode& out, CDiscrepancyContext& context, vector<const CSeq_feat*>& vex, vector<const CSeq_feat*>& vint)
430 {
431  sort(vex.begin(), vex.end(), less);
432  sort(vint.begin(), vint.end(), less);
433  auto Iex = vex.cbegin();
434  auto Iint = vint.cbegin();
435  while (Iex != vex.cend() && Iint != vint.cend()) {
436  const unsigned int e0 = (*Iex)->GetLocation().GetStart(eExtreme_Positional);
437  const unsigned int e1 = (*Iex)->GetLocation().GetStop(eExtreme_Positional);
438  const unsigned int i0 = (*Iint)->GetLocation().GetStart(eExtreme_Positional);
439  const unsigned int i1 = (*Iint)->GetLocation().GetStop(eExtreme_Positional);
440  if (i0 <= e0) {
441  if (i1 != e0 - 1) {
442  out[kIntronExon].Add(*context.SeqFeatObjRef(**Iint)).Add(*context.SeqFeatObjRef(**Iex));
443  }
444  ++Iint;
445  }
446  else /*if (e0 < i0)*/ {
447  if (e1 != i0 - 1) {
448  out[kIntronExon].Add(*context.SeqFeatObjRef(**Iex)).Add(*context.SeqFeatObjRef(**Iint));
449  }
450  ++Iex;
451  }
452  }
453 }
454 
455 
456 DISCREPANCY_CASE(EXON_INTRON_CONFLICT, SEQUENCE, eOncaller | eSubmitter | eSmart, "Exon and intron locations should abut (unless gene is trans-spliced)")
457 {
458  const auto& genes = context.FeatGenes();
459  const auto& exons = context.FeatExons();
460  const auto& introns = context.FeatIntrons();
461  if (exons.empty() || introns.empty()) {
462  return;
463  }
464  if (genes.empty()) {
465  vector<const CSeq_feat*> vex;
466  vector<const CSeq_feat*> vint;
467  vex.insert(vex.end(), exons.cbegin(), exons.cend());
468  vint.insert(vint.end(), introns.cbegin(), introns.cend());
469  CollectExonsIntrons(m_Objs, context, vex, vint);
470  }
471  else {
472  for (const CSeq_feat* gg : genes) {
473  if (gg->CanGetExcept_text() && gg->GetExcept_text() == "trans-splicing") {
474  continue;
475  }
476  const unsigned int g0 = gg->GetLocation().GetStart(eExtreme_Positional);
477  const unsigned int g1 = gg->GetLocation().GetStop(eExtreme_Positional);
478  vector<const CSeq_feat*> vex;
479  vector<const CSeq_feat*> vint;
480  for (const CSeq_feat* ff : exons) {
481  if (ff->GetLocation().GetStart(eExtreme_Positional) <= g1 && ff->GetLocation().GetStop(eExtreme_Positional) >= g0) {
482  vex.push_back(ff);
483  }
484  }
485  for (const CSeq_feat* ff : introns) {
486  if (ff->GetLocation().GetStart(eExtreme_Positional) <= g1 && ff->GetLocation().GetStop(eExtreme_Positional) >= g0) {
487  vint.push_back(ff);
488  }
489  }
490  CollectExonsIntrons(m_Objs, context, vex, vint);
491  }
492  }
493 }
494 
495 
496 // GENE_MISC_IGS_OVERLAP
497 
498 static const string kGeneMisc = "[n] gene[s] overlap[S] with IGS misc features";
499 
500 DISCREPANCY_CASE(GENE_MISC_IGS_OVERLAP, SEQUENCE, eOncaller, "Gene with misc feature overlap")
501 {
502  for (const CSeq_feat* gene : context.FeatGenes()) {
503  if (gene->IsSetLocation() && gene->IsSetData() && gene->GetData().GetGene().IsSetLocus() &&
504  NStr::StartsWith(gene->GetData().GetGene().GetLocus(), "trn")) {
505 
506  const CSeq_loc& loc_gene = gene->GetLocation();
507  bool gene_added = false;
508 
509  for (const CSeq_feat* misc : context.FeatMisc()) {
510  if (misc->IsSetLocation() && misc->IsSetComment() && NStr::FindNoCase(misc->GetComment(), "intergenic spacer") != NPOS) {
511  const CSeq_loc& loc_misc = misc->GetLocation();
512  if (context.Compare(loc_gene, loc_misc) != sequence::eNoOverlap) {
513  if (!gene_added) {
514  m_Objs[kGeneMisc].Add(*context.SeqFeatObjRef(*gene)).Incr();
515  gene_added = true;
516  }
517  m_Objs[kGeneMisc].Add(*context.SeqFeatObjRef(*misc));
518  }
519  }
520  }
521  }
522  }
523 }
524 
525 
526 // GENE_LOCUS_MISSING
527 
528 DISCREPANCY_CASE(GENE_LOCUS_MISSING, SEQUENCE, eOncaller, "Gene locus missing")
529 {
530  const auto& genes = context.FeatGenes();
531  const auto& cds = context.FeatCDS();
532  const auto& mrnas = context.FeatMRNAs();
533  for (const CSeq_feat* gene : genes) {
534  const CGene_ref& gref = gene->GetData().GetGene();
535  if (context.IsPseudo(*gene) || !gref.CanGetDesc() || gref.GetDesc().empty() || (gref.CanGetLocus() && !gref.GetLocus().empty())) {
536  continue;
537  }
538  bool found = false;
539  for (const CSeq_feat* feat : cds) {
540  if (context.GetGeneForFeature(*feat) == &*gene) {
541  found = true;
542  break;
543  }
544  }
545  if (!found) {
546  for (const CSeq_feat* feat : mrnas) {
547  if (context.GetGeneForFeature(*feat) == &*gene) {
548  found = true;
549  break;
550  }
551  }
552  }
553  if (found) {
554  m_Objs["[n] gene[s] missing locus"].Add(*context.SeqFeatObjRef(*gene, gene));
555  }
556  }
557 }
558 
559 
560 DISCREPANCY_AUTOFIX(GENE_LOCUS_MISSING)
561 {
562  const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
563  CRef<CSeq_feat> new_feat(new CSeq_feat());
564  new_feat->Assign(*sf);
565  new_feat->SetData().SetGene().SetLocus(new_feat->GetData().GetGene().GetDesc());
566  new_feat->SetData().SetGene().ResetDesc();
567  context.ReplaceSeq_feat(*obj, *sf, *new_feat);
568  obj->SetFixed();
569  return CRef<CAutofixReport>(new CAutofixReport("GENE_LOCUS_MISSING: [n] gene[s] fixed", 1));
570 }
571 
572 
User-defined methods of the data storage class.
#define static
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
CBioseq_Handle –.
CCode_break –.
Definition: Code_break.hpp:66
CRef< CDiscrepancyObject > SeqFeatObjRef(const CSeq_feat &feat, EFixType fix=eFixNone, const CObject *more=nullptr)
CFeat_CI –.
Definition: feat_ci.hpp:64
static void Add(TReportObjectList &list, TReportObjectSet &hash, CReportObj &obj, bool unique=true)
CReportNode & Ext(bool b=true)
CScope –.
Definition: scope.hpp:92
CSeq_feat_EditHandle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
Definition: map.hpp:338
@ eFatal
@ eDisc
@ eOncaller
@ eSubmitter
@ eSmart
#define DISCREPANCY_AUTOFIX(name)
#define DISCREPANCY_CASE(name, type, group, descr)
#define DISCREPANCY_SUMMARIZE(name)
#define A(i)
Definition: ecp_curves.c:948
int GetSubtype(CFieldNamePanel *field_name_panel, string &ncRNA_class)
std::ofstream out("events_result.xml")
main entry point for tests
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
TRange GetTotalRange(void) const
Definition: Seq_loc.hpp:913
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
void Add(const CSeq_loc &other)
Simple adding of seq-locs.
Definition: Seq_loc.cpp:3875
bool IsSetStrand(EIsSetStrand flag=eIsSetStrand_Any) const
Check if strand is set for any/all part(s) of the seq-loc depending on the flag.
Definition: Seq_loc.cpp:858
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
void SetPartialStart(bool val, ESeqLocExtremes ext)
set / remove e_Lim fuzz on start or stop (lt/gt - indicating partial interval)
Definition: Seq_loc.cpp:3280
void SetPartialStop(bool val, ESeqLocExtremes ext)
Definition: Seq_loc.cpp:3313
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
@ fStrand_Ignore
Definition: Seq_loc.hpp:325
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
CRef< CSeq_loc > Seq_loc_Subtract(const CSeq_loc &loc1, const CSeq_loc &loc2, CSeq_loc::TOpFlags flags, CScope *scope)
Subtract the second seq-loc from the first one.
ECompare
@ eContains
First CSeq_loc contains second.
@ eSame
CSeq_locs contain each other.
@ eContained
First CSeq_loc contained by second.
@ eNoOverlap
CSeq_locs do not overlap or abut.
CConstRef< CSeq_feat > GetGeneForFeature(const CSeq_feat &feat, CScope &scope)
Finds gene for feature, but obeys SeqFeatXref directives.
Definition: sequence.cpp:1529
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void Remove(void) const
Remove the feature from Seq-annot.
CScope & GetScope(void) const
Get scope this handle belongs to.
void Replace(const CSeq_feat &new_feat) const
Replace the feature with new Seq-feat object.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
position_type GetToOpen(void) const
Definition: range.hpp:138
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2989
#define NPOS
Definition: ncbistr.hpp:133
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5411
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
const TDesc & GetDesc(void) const
Get the Desc member data.
Definition: Gene_ref_.hpp:599
bool CanGetLocus(void) const
Check if it is safe to call GetLocus method.
Definition: Gene_ref_.hpp:499
bool CanGetDesc(void) const
Check if it is safe to call GetDesc method.
Definition: Gene_ref_.hpp:593
const TLocus & GetLocus(void) const
Get the Locus member data.
Definition: Gene_ref_.hpp:505
void SetAa(TAa &value)
Assign a value to Aa data member.
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
void SetComment(const TComment &value)
Assign a value to Comment data member.
Definition: Seq_feat_.hpp:1058
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
void SetLoc(TLoc &value)
Assign a value to Loc data member.
const TComment & GetComment(void) const
Get the Comment member data.
Definition: Seq_feat_.hpp:1049
const TGene & GetGene(void) const
Get the variant data.
bool CanGetComment(void) const
Check if it is safe to call GetComment method.
Definition: Seq_feat_.hpp:1043
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
int i
yy_size_t n
int len
constexpr auto sort(_Init &&init)
unsigned int a
Definition: ncbi_localip.c:102
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
const string kCDSRNAContainstRNA
USING_SCOPE(objects)
const string kCDSRNAOverlapNoContainSameStrand
const string kCDSRNAOverlapNoContain
const string kCDSRNAExactMatch
static const string kGeneMisc
static const string kCdsTrnaOverlapComment
static const TRNALengthMap kTrnaLengthMap
static void CollectExonsIntrons(CReportNode &out, CDiscrepancyContext &context, vector< const CSeq_feat * > &vex, vector< const CSeq_feat * > &vint)
static bool less(const CSeq_feat *A, const CSeq_feat *B)
const string kCDSRNAOverlapNoContainOppStrand
const string kCDSRNAContains
bool IsShortrRNA(const CSeq_feat &f, CScope *scope)
const string kCDSRNAContainedIn
static const string kIntronExon
map< string, TRNALength > TRNALengthMap
pair< size_t, bool > TRNALength
static const string kCDSoverlapTRNA
const string kCDSRNAAnyOverlap
float g0(Seg_Nsm *spn, Thd_Cxe *cxe)
Definition: thrddgri.c:90
float Overlap(iterator1 iter1, iterator1 end1, iterator2 iter2, iterator2 end2)
Overlap measure.
#define const
Definition: zconf.h:230
Modified on Fri Dec 08 08:21:15 2023 by modify_doxy.py rev. 669887