NCBI C++ ToolKit
splice_problems.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: splice_problems.cpp 98416 2022-11-09 17:44:49Z gotvyans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Colleen Bollin
27  *
28  * File Description:
29  * validation of Seq_feat splice sites
30  * .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
38 #include <objmgr/bioseq_handle.hpp>
39 #include <objmgr/seqdesc_ci.hpp>
40 #include <objmgr/seq_vector.hpp>
41 #include <objmgr/util/sequence.hpp>
43 
44 
47 BEGIN_SCOPE(validator)
48 using namespace sequence;
49 
50 
51 
53 (ENa_strand strand,
54  TSeqPos stop,
55  const CSeqVector& vec_donor,
56  TSeqPos seq_len_donor,
57  TSeqPos start,
58  const CSeqVector& vec_acceptor,
59  TSeqPos seq_len_acceptor)
60 {
61  char donor[2];
62  char acceptor[2];
63 
64  ESpliceSiteRead good_donor = ReadDonorSpliceSite(strand, stop, vec_donor, seq_len_donor, donor);
65  ESpliceSiteRead good_acceptor = ReadAcceptorSpliceSite(strand, start, vec_acceptor, seq_len_acceptor, acceptor);
66  bool donor_ok = (good_donor == eSpliceSiteRead_OK || good_donor == eSpliceSiteRead_WrongNT);
67  bool acceptor_ok = (good_acceptor == eSpliceSiteRead_OK || good_acceptor == eSpliceSiteRead_WrongNT);
68 
69  if (donor_ok && acceptor_ok) {
70  // Check canonical adjacent splice sites: "GT-AG"
71  // Check non-canonical adjacent splice sites: "GC-AG"
72  // Check non-canonical adjacent splice sites: "AT-AC"
73  if (CheckAdjacentSpliceSites(kSpliceSiteGTAG, strand, donor, acceptor) ||
74  CheckAdjacentSpliceSites(kSpliceSiteGCAG, strand, donor, acceptor) ||
75  CheckAdjacentSpliceSites(kSpliceSiteATAC, strand, donor, acceptor)) {
76  return; // canonical splice site found
77  }
78  m_DonorProblems.push_back(TSpliceProblem(good_donor, stop));
79  m_AcceptorProblems.push_back(TSpliceProblem(good_acceptor, start));
80  } else {
81  m_DonorProblems.push_back(TSpliceProblem(good_donor, stop));
82  m_AcceptorProblems.push_back(TSpliceProblem(good_acceptor, start));
83  }
84 }
85 
86 
89 {
90  try {
91  bool in_gap;
92  bool bad_seq = false;
93 
94  if (strand == eNa_strand_minus) {
95  // check donor and acceptor on minus strand
96  if (stop > 1 && stop <= seq_len) {
97  in_gap = (vec.IsInGap(stop - 2) && vec.IsInGap(stop - 1));
98  if (!in_gap) {
99  bad_seq = (vec[stop - 1] > 250 || vec[stop - 2] > 250);
100  }
101 
102  if (in_gap) {
103  return eSpliceSiteRead_Gap;
104  } else if (bad_seq) {
105  return eSpliceSiteRead_BadSeq;
106  }
107 
108  // Read splice site seq
109  site[0] = vec[stop - 2];
110  site[1] = vec[stop - 1];
111  } else {
112  return eSpliceSiteRead_OutOfRange;
113  }
114  }
115  // Read donor splice site from plus strand
116  else {
117  if (stop < seq_len - 2) {
118  in_gap = (vec.IsInGap(stop + 1) && vec.IsInGap(stop + 2));
119  if (!in_gap) {
120  bad_seq = (vec[stop + 1] > 250 || vec[stop + 2] > 250);
121  }
122  if (in_gap) {
123  return eSpliceSiteRead_Gap;
124  } else if (bad_seq) {
125  return eSpliceSiteRead_BadSeq;
126  }
127  site[0] = vec[stop + 1];
128  site[1] = vec[stop + 2];
129  } else {
130  return eSpliceSiteRead_OutOfRange;
131  }
132  }
133 
134  // Check canonical donor site: "GT" and non-canonical donor site: "GC"
136  return eSpliceSiteRead_OK;
137  } else {
138  return eSpliceSiteRead_WrongNT;
139  }
140  } catch (CException&) {
141  return eSpliceSiteRead_OK;
142  }
143 }
144 
145 
148 {
149  char site[2];
150 
151  return ReadDonorSpliceSite(strand, stop, vec, seq_len, site);
152 }
153 
154 
155 
158 (ENa_strand strand,
159  TSeqPos start,
160  const CSeqVector& vec,
161  TSeqPos seq_len,
162  TSpliceSite& site)
163 {
164  try {
165  bool in_gap;
166  bool bad_seq = false;
167 
168  if (strand == eNa_strand_minus) {
169  // check donor and acceptor on minus strand
170  if (start < seq_len - 2) {
171  in_gap = (vec.IsInGap(start + 1) && vec.IsInGap(start + 2));
172  if (!in_gap) {
173  bad_seq = (vec[start + 1] > 250 || vec[start + 2] > 250);
174  }
175 
176  if (in_gap) {
177  return eSpliceSiteRead_Gap;
178  } else if (bad_seq) {
179  return eSpliceSiteRead_BadSeq;
180  }
181  site[0] = vec[start + 1];
182  site[1] = vec[start + 2];
183  } else {
184  return eSpliceSiteRead_OutOfRange;
185  }
186  }
187  // read acceptor splice site from plus strand
188  else {
189  if (start > 1 && start <= seq_len) {
190  in_gap = (vec.IsInGap(start - 2) && vec.IsInGap(start - 1));
191  if (!in_gap) {
192  bad_seq = (vec[start - 2] > 250 || vec[start - 1] > 250);
193  }
194 
195  if (in_gap) {
196  return eSpliceSiteRead_Gap;
197  } else if (bad_seq) {
198  return eSpliceSiteRead_BadSeq;
199  }
200  site[0] = vec[start - 2];
201  site[1] = vec[start - 1];
202  } else {
203  return eSpliceSiteRead_OutOfRange;
204  }
205  }
206  // Check canonical acceptor site: "AG"
207  if (CheckSpliceSite(kSpliceSiteAG, strand, site)) {
208  return eSpliceSiteRead_OK;
209  } else {
210  return eSpliceSiteRead_WrongNT;
211  }
212  } catch (CException&) {
213  return eSpliceSiteRead_BadSeq;
214  }
215 }
216 
217 
220 (ENa_strand strand,
221 TSeqPos start,
222 const CSeqVector& vec,
223 TSeqPos seq_len)
224 {
225  char site[2];
226  return ReadAcceptorSpliceSite(strand, start, vec, seq_len, site);
227 }
228 
229 
231 {
232  bool has_errors = false;
233  // donors
234  for (auto it = m_DonorProblems.begin(); it != m_DonorProblems.end() && !has_errors; it++) {
235  if (it->first == eSpliceSiteRead_BadSeq || it->first == eSpliceSiteRead_Gap ||
236  it->first == eSpliceSiteRead_WrongNT) {
237  has_errors = true;
238  }
239  }
240  // acceptors
241  for (auto it = m_AcceptorProblems.begin(); it != m_AcceptorProblems.end() && !has_errors; it++) {
242  if (it->first == eSpliceSiteRead_BadSeq || it->first == eSpliceSiteRead_Gap ||
243  it->first == eSpliceSiteRead_WrongNT) {
244  has_errors = true;
245  }
246  }
247 
248  return has_errors;
249 }
250 
251 
252 void CSpliceProblems::CalculateSpliceProblems(const CSeq_feat& feat, bool check_all, bool pseudo, CBioseq_Handle loc_handle)
253 {
254  m_DonorProblems.clear();
255  m_AcceptorProblems.clear();
256  m_ExceptionUnnecessary = false;
257  m_ErrorsNotExpected = true;
258 
259  bool has_errors = false, ribo_slip = false;
260 
261  const CSeq_loc& loc = feat.GetLocation();
262 
263  // skip if organelle
264  if (!loc_handle || IsOrganelle(loc_handle)) {
265  return;
266  }
267 
268  // suppress for specific biological exceptions
269  if (feat.IsSetExcept() && feat.IsSetExcept_text()
270  && (NStr::FindNoCase(feat.GetExcept_text(), "low-quality sequence region") != string::npos)) {
271  return;
272  }
273  if (feat.IsSetExcept() && feat.IsSetExcept_text()
274  && (NStr::FindNoCase(feat.GetExcept_text(), "ribosomal slippage") != string::npos)) {
275  m_ErrorsNotExpected = false;
276  ribo_slip = true;
277  }
278  if (feat.IsSetExcept() && feat.IsSetExcept_text()
279  && (NStr::FindNoCase(feat.GetExcept_text(), "artificial frameshift") != string::npos
280  || NStr::FindNoCase(feat.GetExcept_text(), "nonconsensus splice site") != string::npos
281  || NStr::FindNoCase(feat.GetExcept_text(), "adjusted for low-quality genome") != string::npos
282  || NStr::FindNoCase(feat.GetExcept_text(), "heterogeneous population sequenced") != string::npos
283  || NStr::FindNoCase(feat.GetExcept_text(), "low-quality sequence region") != string::npos
284  || NStr::FindNoCase(feat.GetExcept_text(), "artificial location") != string::npos)) {
285  m_ErrorsNotExpected = false;
286  }
287 
288 
289  // look for mixed strands, skip if found
291 
292  int num_parts = 0;
293  for (CSeq_loc_CI si(loc); si; ++si) {
294  if (si.IsSetStrand()) {
295  ENa_strand tmp = si.GetStrand();
296  if (tmp == eNa_strand_plus || tmp == eNa_strand_minus) {
297  if (strand == eNa_strand_unknown) {
298  strand = si.GetStrand();
299  } else if (strand != tmp) {
300  return;
301  }
302  }
303  }
304  num_parts++;
305  }
306 
307  if (!check_all && num_parts < 2) {
308  return;
309  }
310 
311  // Default value for a strand is '+'
312  if (eNa_strand_unknown == strand) {
313  strand = eNa_strand_plus;
314  }
315 
316  // only check for errors if overlapping gene is not pseudo
317  if (!pseudo) {
318  CSeqFeatData::ESubtype subtype = feat.GetData().GetSubtype();
319  switch (subtype) {
321  ValidateSpliceExon(feat, loc_handle, strand);
322  break;
324  ValidateSpliceMrna(feat, loc_handle, strand);
325  break;
327  ValidateSpliceCdregion(feat, loc_handle, strand);
328  break;
329  default:
330  break;
331  }
332  }
333  has_errors = SpliceSitesHaveErrors();
334 
335  if (!m_ErrorsNotExpected && !has_errors && !ribo_slip) {
336  m_ExceptionUnnecessary = true;
337  }
338 }
339 
340 
342 {
343  const CSeq_loc& loc = feat.GetLocation();
344 
345  // Find overlapping feature - mRNA or gene - to identify start / stop exon
346  bool overlap_feat_partial_5 = false; // set to true if 5'- most start of overlapping feature is partial
347  bool overlap_feat_partial_3 = false; // set to true if 3'- most end of overlapping feature is partial
348  TSeqPos overlap_feat_start = 0; // start position of overlapping feature
349  TSeqPos overlap_feat_stop = 0; // stop position of overlapping feature
350 
351  bool overlap_feat_exists = false;
352  // Locate overlapping mRNA feature
354  loc,
357  bsh.GetScope());
358  if (mrna) {
359  overlap_feat_exists = true;
360  overlap_feat_partial_5 = mrna->GetLocation().IsPartialStart(eExtreme_Biological);
361  overlap_feat_start = mrna->GetLocation().GetStart(eExtreme_Biological);
362 
363  overlap_feat_partial_3 = mrna->GetLocation().IsPartialStop(eExtreme_Biological);
364  overlap_feat_stop = mrna->GetLocation().GetStop(eExtreme_Biological);
365  }
366  else {
367  // Locate overlapping gene feature.
369  loc,
372  bsh.GetScope());
373  if (gene) {
374  overlap_feat_exists = true;
375  overlap_feat_partial_5 = gene->GetLocation().IsPartialStart(eExtreme_Biological);
376  overlap_feat_start = gene->GetLocation().GetStart(eExtreme_Biological);
377 
378  overlap_feat_partial_3 = gene->GetLocation().IsPartialStop(eExtreme_Biological);
379  overlap_feat_stop = gene->GetLocation().GetStop(eExtreme_Biological);
380  }
381  }
382 
383  CSeq_loc_CI si(loc);
384  try{
385  CSeq_loc::TRange range = si.GetRange();
386  CConstRef<CSeq_loc> cur_int = si.GetRangeAsSeq_loc();
387  if (cur_int) {
388  CBioseq_Handle bsh_si = bsh.GetScope().GetBioseqHandle(*cur_int);
389 
390  if (bsh_si) {
392 
393  TSeqPos start, stop;
394  if (eNa_strand_minus == strand) {
395  start = range.GetTo();
396  stop = range.GetFrom();
397  } else {
398  start = range.GetFrom();
399  stop = range.GetTo();
400  }
401 
402  if (overlap_feat_exists) {
403  if (!cur_int->IsPartialStop(eExtreme_Biological)) {
404  if (stop == overlap_feat_stop) {
405  if (overlap_feat_partial_3) {
406  ESpliceSiteRead good_donor = ReadDonorSpliceSite(strand, stop, vec, bsh_si.GetInst_Length());
407  m_DonorProblems.push_back(TSpliceProblem(good_donor, stop));
408  }
409  } else {
410  ESpliceSiteRead good_donor = ReadDonorSpliceSite(strand, stop, vec, bsh_si.GetInst_Length());
411  m_DonorProblems.push_back(TSpliceProblem(good_donor, stop));
412  }
413  }
414 
415  if (!cur_int->IsPartialStart(eExtreme_Biological)) {
416  if (start == overlap_feat_start) {
417  if (overlap_feat_partial_5) {
418  ESpliceSiteRead good_acceptor = ReadAcceptorSpliceSite(strand, start, vec, bsh_si.GetInst_Length());
419  m_AcceptorProblems.push_back(TSpliceProblem(good_acceptor, start));
420  }
421  } else {
422  ESpliceSiteRead good_acceptor = ReadAcceptorSpliceSite(strand, start, vec, bsh_si.GetInst_Length());
423  m_AcceptorProblems.push_back(TSpliceProblem(good_acceptor, start));
424  }
425  }
426  } else {
427  // Overlapping feature - mRNA or gene - not found.
428  if (!cur_int->IsPartialStop(eExtreme_Biological)) {
429  ESpliceSiteRead good_donor = ReadDonorSpliceSite(strand, stop, vec, bsh_si.GetInst_Length());
430  m_DonorProblems.push_back(TSpliceProblem(good_donor, stop));
431  }
432  if (!cur_int->IsPartialStart(eExtreme_Biological)) {
433  ESpliceSiteRead good_acceptor = ReadAcceptorSpliceSite(strand, start, vec, bsh_si.GetInst_Length());
434  m_AcceptorProblems.push_back(TSpliceProblem(good_acceptor, start));
435  }
436  }
437  }
438  }
439  } catch (const CException& ) {
440  ;
441  } catch (const std::exception& ) {
442  ;// could get errors from CSeqVector
443  }
444 }
445 
447 {
448  const CSeq_loc& loc = feat.GetLocation();
449 
450  bool ignore_mrna_partial5 = false;
451  bool ignore_mrna_partial3 = false;
452 
453  // Retrieve overlapping cdregion
455  loc,
458  bsh.GetScope());
459  if (cds) {
460  // If there is no UTR information, then the mRNA location should agree with its CDS location,
461  // but the mRNA should be marked partial at its 5' and 3' ends
462  // Do not check splice site (either donor or acceptor) if CDS location's start / stop is complete.
465  ignore_mrna_partial5 = true;
466  }
469  ignore_mrna_partial3 = true;
470  }
471  }
472 
473  TSeqPos start;
474  TSeqPos stop;
475 
476  CSeq_loc_CI head(loc);
477  if (head) {
478  // Validate acceptor site of 5'- most feature
479  const CSeq_loc& part = head.GetEmbeddingSeq_loc();
480  CSeq_loc::TRange range = head.GetRange();
481  CBioseq_Handle bsh_head = bsh.GetScope().GetBioseqHandle(*head.GetRangeAsSeq_loc());
482  if (bsh_head) {
484 
485  if (strand == eNa_strand_minus) {
486  start = range.GetTo();
487  } else {
488  start = range.GetFrom();
489  }
490  if (part.IsPartialStart(eExtreme_Biological) && !ignore_mrna_partial5) {
491  ESpliceSiteRead good_acceptor = ReadAcceptorSpliceSite(strand, start, vec, bsh_head.GetInst_Length());
492  m_AcceptorProblems.push_back(TSpliceProblem(good_acceptor, start));
493  }
494  }
495 
496  CSeq_loc_CI tail(loc);
497  ++tail;
498 
499  // Validate adjacent (donor...acceptor) splice sites.
500  // @head is a location of exon that contributes `donor site`
501  // @tail is a location of exon that contributes `acceptor site`
502  for(; tail; ++head, ++tail) {
503  CSeq_loc::TRange range_head = head.GetRange();
504  CSeq_loc::TRange range_tail = tail.GetRange();
505  CBioseq_Handle bsh_head = bsh.GetScope().GetBioseqHandle(*head.GetRangeAsSeq_loc());
506  CBioseq_Handle bsh_tail = bsh.GetScope().GetBioseqHandle(*tail.GetRangeAsSeq_loc());
507  if (bsh_head && bsh_tail) {
508  try {
511 
512  if (strand == eNa_strand_minus) {
513  start = range_tail.GetTo();
514  stop = range_head.GetFrom();
515  } else {
516  start = range_tail.GetFrom();
517  stop = range_head.GetTo();
518  }
519  ValidateDonorAcceptorPair(strand,
520  stop, vec_head, bsh_head.GetInst_Length(),
521  start, vec_tail, bsh_tail.GetInst_Length());
522  } catch (CSeqVectorException& ) {
523  }
524  }
525  }
526  }
527 
528  // Validate donor site of 3'most feature
529  if(head) {
530  const CSeq_loc& part = head.GetEmbeddingSeq_loc();
531  CSeq_loc::TRange range = head.GetRange();
532  CBioseq_Handle bsh_head = bsh.GetScope().GetBioseqHandle(*head.GetRangeAsSeq_loc());
533  if (bsh_head) {
535 
536  if (strand == eNa_strand_minus) {
537  stop = range.GetFrom();
538  } else {
539  stop = range.GetTo();
540  }
541  if (part.IsPartialStop(eExtreme_Biological) && !ignore_mrna_partial3) {
542  ESpliceSiteRead good_donor = ReadDonorSpliceSite(strand, stop, vec, bsh_head.GetInst_Length());
543  m_DonorProblems.push_back(TSpliceProblem(good_donor, stop));
544  }
545  }
546  }
547 }
548 
550 {
551  const CSeq_loc& loc = feat.GetLocation();
552 
553  TSeqPos start;
554  TSeqPos stop;
555 
556  CSeq_loc_CI head(loc);
557  if (head) {
558  // Validate acceptor site of 5'- most feature
559  const CSeq_loc& part = head.GetEmbeddingSeq_loc();
560  CSeq_loc::TRange range = head.GetRange();
561  CBioseq_Handle bsh_head = bsh.GetScope().GetBioseqHandle(*head.GetRangeAsSeq_loc());
562  if (bsh_head) {
563  try {
565  if (part.IsPartialStart(eExtreme_Biological)) {
566  if (strand == eNa_strand_minus) {
567  start = range.GetTo();
568  } else {
569  start = range.GetFrom();
570  }
571  ESpliceSiteRead good_acceptor = ReadAcceptorSpliceSite(strand, start, vec, bsh_head.GetInst_Length());
572  m_AcceptorProblems.push_back(TSpliceProblem(good_acceptor, start));
573  }
574  } catch (CSeqVectorException&) {
575  }
576  }
577 
578  CSeq_loc_CI tail(loc);
579  ++tail;
580 
581  // Validate adjacent (donor...acceptor) splice sites.
582  // @head is a location of exon that contributes `donor site`
583  // @tail is a location of exon that contributes `acceptor site`
584  for(; tail; ++head, ++tail) {
585  CSeq_loc::TRange range_head = head.GetRange();
586  CSeq_loc::TRange range_tail = tail.GetRange();
587  CBioseq_Handle bsh_head = bsh.GetScope().GetBioseqHandle(*head.GetRangeAsSeq_loc());
588  CBioseq_Handle bsh_tail = bsh.GetScope().GetBioseqHandle(*tail.GetRangeAsSeq_loc());
589  if (bsh_head && bsh_tail) {
590  try {
593 
594 
595  if (strand == eNa_strand_minus) {
596  start = range_tail.GetTo();
597  stop = range_head.GetFrom();
598  } else {
599  start = range_tail.GetFrom();
600  stop = range_head.GetTo();
601  }
602  ValidateDonorAcceptorPair(strand,
603  stop, vec_head, bsh_head.GetInst_Length(),
604  start, vec_tail, bsh_tail.GetInst_Length());
605  } catch (CSeqVectorException&) {
606  }
607  }
608  }
609  }
610 
611  // Validate donor site of 3'most feature
612  if(head) {
613  const CSeq_loc& part = head.GetEmbeddingSeq_loc();
614  CSeq_loc::TRange range = head.GetRange();
615  CBioseq_Handle bsh_head = bsh.GetScope().GetBioseqHandle(*head.GetRangeAsSeq_loc());
616  if (bsh_head) {
617  try {
619 
620  if (strand == eNa_strand_minus) {
621  stop = range.GetFrom();
622  } else {
623  stop = range.GetTo();
624  }
625  if (part.IsPartialStop(eExtreme_Biological)) {
626  ESpliceSiteRead good_donor = ReadDonorSpliceSite(strand, stop, vec, bsh_head.GetInst_Length());
627  m_DonorProblems.push_back(TSpliceProblem(good_donor, stop));
628  }
629  } catch (CSeqVectorException&) {
630  }
631  }
632  }
633 
634 }
635 
636 
638 {
639  return c == 'G';
640 }
641 
643 {
644  return c == 'C';
645 }
646 
648 {
649  return c == 'A';
650 }
651 
653 {
654  return c == 'T';
655 }
656 
657 bool CheckAdjacentSpliceSites(const string& signature, ENa_strand strand, TConstSpliceSite donor, TConstSpliceSite acceptor)
658 {
659  static
660  struct tagSpliceSiteInfo
661  {
662  const string& id;
663  ENa_strand strand;
664  bool(*check_donor0)(Char);
665  bool(*check_donor1)(Char);
666  bool(*check_acceptor0)(Char);
667  bool(*check_acceptor1)(Char);
668  }
669  SpliceSiteInfo[] = {
670  // 5' << GT...AG <<
672  // >> CT...AC >>, reverse complement
674  // 5' << GC...AG <<
676  // >> CT...GC >>, reverse complement
678  // 5' << AT...AC <<
680  // >> GT...AT >>, reverse complement
682  };
683  static int size = sizeof(SpliceSiteInfo) / sizeof(struct tagSpliceSiteInfo);
684 
685  for (int i = 0; i < size; ++i) {
686  struct tagSpliceSiteInfo* entry = &SpliceSiteInfo[i];
687  if (strand == entry->strand && entry->id == signature) {
688  return (entry->check_donor0(donor[0]) && entry->check_donor1(donor[1]) &&
689  entry->check_acceptor0(acceptor[0]) && entry->check_acceptor1(acceptor[1]));
690  }
691  }
692 
693  NCBI_THROW(CCoreException, eCore, "Unknown splice site signature.");
694 }
695 
696 
697 bool CheckSpliceSite(const string& signature, ENa_strand strand, TConstSpliceSite site)
698 {
699  static
700  struct tagSpliceSiteInfo
701  {
702  const string& id;
703  ENa_strand strand;
704  bool(*check_site0)(Char);
705  bool(*check_site1)(Char);
706  }
707  SpliceSiteInfo[] = {
708  // 5' << GT... <<
710  // >> ...AC >>, reverse complement
712  // 5' << ...AG <<
714  // >> CT...>>, reverse complement
716  // 5' << GC... <<
718  // >> ...GC >>, reverse complement
720  };
721  static int size = sizeof(SpliceSiteInfo) / sizeof(struct tagSpliceSiteInfo);
722 
723  for (int i = 0; i < size; ++i) {
724  struct tagSpliceSiteInfo* entry = &SpliceSiteInfo[i];
725  if (strand == entry->strand && entry->id == signature) {
726  return (entry->check_site0(site[0]) && entry->check_site1(site[1]));
727  }
728  }
729 
730  NCBI_THROW(CCoreException, eCore, "Unknown splice site signature.");
731 }
732 
733 
735 {
736  return (CheckAdjacentSpliceSites(kSpliceSiteGTAG, strand, donor, acceptor) ||
737  CheckAdjacentSpliceSites(kSpliceSiteGCAG, strand, donor, acceptor) ||
738  CheckAdjacentSpliceSites(kSpliceSiteATAC, strand, donor, acceptor));
739 }
740 
742 {
743  return (CheckSpliceSite(kSpliceSiteGT, strand, donor) ||
744  CheckSpliceSite(kSpliceSiteGC, strand, donor));
745 }
746 
748 {
749  return CheckSpliceSite(kSpliceSiteAG, strand, acceptor);
750 }
751 
752 END_SCOPE(validator)
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
#define bool
Definition: bool.h:34
CBioseq_Handle –.
CCoreException –.
Definition: ncbiexpt.hpp:1476
ESubtype GetSubtype(void) const
SeqVector related exceptions.
CSeqVector –.
Definition: seq_vector.hpp:65
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
void ValidateSpliceCdregion(const CSeq_feat &feat, const CBioseq_Handle &bsh, ENa_strand strand)
void CalculateSpliceProblems(const CSeq_feat &feat, bool check_all, bool pseudo, CBioseq_Handle loc_handle)
void ValidateSpliceExon(const CSeq_feat &feat, const CBioseq_Handle &bsh, ENa_strand strand)
ESpliceSiteRead ReadAcceptorSpliceSite(ENa_strand strand, TSeqPos start, const CSeqVector &vec, TSeqPos seq_len, TSpliceSite &site)
ESpliceSiteRead ReadDonorSpliceSite(ENa_strand strand, TSeqPos stop, const CSeqVector &vec, TSeqPos seq_len, TSpliceSite &site)
void ValidateSpliceMrna(const CSeq_feat &feat, const CBioseq_Handle &bsh, ENa_strand strand)
pair< size_t, TSeqPos > TSpliceProblem
void ValidateDonorAcceptorPair(ENa_strand strand, TSeqPos stop, const CSeqVector &vec_donor, TSeqPos seq_len_donor, TSeqPos start, const CSeqVector &vec_acceptor, TSeqPos seq_len_acceptor)
Include a standard set of the NCBI C++ Toolkit most basic headers.
The NCBI C++ standard methods for dealing with std::string.
#define head
Definition: ct_nlmzip_i.h:138
static const char si[8][64]
Definition: des.c:146
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define Char
Definition: ncbistd.hpp:124
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
bool IsPartialStart(ESeqLocExtremes ext) const
check start or stop of location for e_Lim fuzz
Definition: Seq_loc.cpp:3222
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
bool IsPartialStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:3251
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
CMappedFeat GetBestOverlappingFeat(const CMappedFeat &feat, CSeqFeatData::ESubtype need_subtype, sequence::EOverlapType overlap_type, CFeatTree *feat_tree=0, const SAnnotSelector *base_sel=0)
Definition: feature.cpp:3653
@ eOverlap_Contains
2nd contains 1st extremes
@ eOverlap_Contained
2nd contained within 1st extremes
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
TInst_Length GetInst_Length(void) const
CScope & GetScope(void) const
Get scope this handle belongs to.
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
bool IsInGap(TSeqPos pos) const
true if sequence at 0-based position 'pos' has gap Note: this method is not MT-safe,...
Definition: seq_vector.hpp:277
char Char
Alias for char.
Definition: ncbitype.h:93
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2989
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
bool IsSetExcept(void) const
something funny about this? Check if a value has been assigned to Except data member.
Definition: Seq_feat_.hpp:990
const TExcept_text & GetExcept_text(void) const
Get the Except_text member data.
Definition: Seq_feat_.hpp:1405
bool IsSetExcept_text(void) const
explain if except=TRUE Check if a value has been assigned to Except_text data member.
Definition: Seq_feat_.hpp:1393
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
int i
range(_Ty, _Ty) -> range< _Ty >
const struct ncbi::grid::netcache::search::fields::SIZE size
ESERV_Site site
bool ConsistentWithT(Char ch)
Definition: utilities.cpp:2896
bool IsOrganelle(int genome)
Definition: utilities.cpp:2838
bool ConsistentWithA(Char ch)
Definition: utilities.cpp:2878
bool ConsistentWithC(Char ch)
Definition: utilities.cpp:2884
bool ConsistentWithG(Char ch)
Definition: utilities.cpp:2890
static char tmp[2048]
Definition: utf8.c:42
bool s_EqualsT(Char c)
bool s_EqualsG(Char c)
bool CheckIntronAcceptor(ENa_strand strand, TConstSpliceSite acceptor)
bool CheckIntronSpliceSites(ENa_strand strand, TConstSpliceSite donor, TConstSpliceSite acceptor)
bool s_EqualsC(Char c)
bool s_EqualsA(Char c)
bool CheckAdjacentSpliceSites(const string &signature, ENa_strand strand, TConstSpliceSite donor, TConstSpliceSite acceptor)
bool CheckIntronDonor(ENa_strand strand, TConstSpliceSite donor)
bool CheckSpliceSite(const string &signature, ENa_strand strand, TConstSpliceSite site)
const string kSpliceSiteGT
const string kSpliceSiteGTAG
Char const (& TConstSpliceSite)[2]
const string kSpliceSiteGC
const string kSpliceSiteAG
Char(& TSpliceSite)[2]
const string kSpliceSiteATAC
const string kSpliceSiteGCAG
Modified on Fri Sep 29 07:32:11 2023 by modify_doxy.py rev. 669887