NCBI C++ ToolKit
splice_problems.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: splice_problems.cpp 101247 2023-11-20 15:20:49Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Colleen Bollin
27  *
28  * File Description:
29  * validation of Seq_feat splice sites
30  * .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
38 #include <objmgr/bioseq_handle.hpp>
39 #include <objmgr/seqdesc_ci.hpp>
40 #include <objmgr/seq_vector.hpp>
41 #include <objmgr/util/sequence.hpp>
43 
44 
47 BEGIN_SCOPE(validator)
48 using namespace sequence;
49 
50 
51 
53  ENa_strand strand,
54  TSeqPos stop,
55  const CSeqVector& vec_donor,
56  TSeqPos seq_len_donor,
57  TSeqPos start,
58  const CSeqVector& vec_acceptor,
59  TSeqPos seq_len_acceptor)
60 {
    // Reads the donor site next to `stop` (from vec_donor) and the acceptor
    // site next to `start` (from vec_acceptor) and checks them together as one
    // donor...acceptor pair. Nothing is recorded when the pair matches GT-AG,
    // GC-AG or AT-AC; in every other case both read results are appended to
    // m_DonorProblems and m_AcceptorProblems.
61  char donor[2];
62  char acceptor[2];
63 
64  ESpliceSiteRead good_donor = ReadDonorSpliceSite(strand, stop, vec_donor, seq_len_donor, donor);
65  ESpliceSiteRead good_acceptor = ReadAcceptorSpliceSite(strand, start, vec_acceptor, seq_len_acceptor, acceptor);
    // Only OK and WrongNT are accepted for the pair check below; any other
    // status sends both results straight to the problem lists.
66  bool donor_ok = (good_donor == eSpliceSiteRead_OK || good_donor == eSpliceSiteRead_WrongNT);
67  bool acceptor_ok = (good_acceptor == eSpliceSiteRead_OK || good_acceptor == eSpliceSiteRead_WrongNT);
68 
69  if (donor_ok && acceptor_ok) {
70  // Check canonical adjacent splice sites: "GT-AG"
71  // Check non-canonical adjacent splice sites: "GC-AG"
72  // Check non-canonical adjacent splice sites: "AT-AC"
73  if (CheckAdjacentSpliceSites(kSpliceSiteGTAG, strand, donor, acceptor) ||
74  CheckAdjacentSpliceSites(kSpliceSiteGCAG, strand, donor, acceptor) ||
75  CheckAdjacentSpliceSites(kSpliceSiteATAC, strand, donor, acceptor)) {
76  return; // canonical splice site found
77  }
    // The pair did not match any accepted combination. Each side is stored
    // with its own single-site status, so one of the two entries may carry
    // eSpliceSiteRead_OK even though the pair as a whole was rejected.
78  m_DonorProblems.push_back(TSpliceProblem(good_donor, stop));
79  m_AcceptorProblems.push_back(TSpliceProblem(good_acceptor, start));
80  } else {
    // NOTE(review): this branch records exactly the same two entries as the
    // branch above; the if/else could be collapsed without changing behavior.
81  m_DonorProblems.push_back(TSpliceProblem(good_donor, stop));
82  m_AcceptorProblems.push_back(TSpliceProblem(good_acceptor, start));
83  }
84 }
85 
86 
89 {
    // Reads the two bases of the donor splice site adjacent to `stop` into
    // `site` and classifies the result:
    //   eSpliceSiteRead_OutOfRange - the two positions do not fit in seq_len
    //   eSpliceSiteRead_Gap        - both positions are inside a gap
    //   eSpliceSiteRead_BadSeq     - a residue value is > 250, or reading the
    //                                sequence raised a CException
    //   eSpliceSiteRead_OK / eSpliceSiteRead_WrongNT - result of the donor
    //                                consensus check at the end
    // `site` is only filled in for the last case.
90  try {
91  bool in_gap;
92  bool bad_seq = false;
93 
94  if (strand == eNa_strand_minus) {
95  // Read donor splice site from minus strand: bases at stop - 2 and stop - 1
96  if (stop > 1 && stop <= seq_len) {
97  in_gap = (vec.IsInGap(stop - 2) && vec.IsInGap(stop - 1));
98  if (!in_gap) {
99  bad_seq = (vec[stop - 1] > 250 || vec[stop - 2] > 250);
100  }
101 
102  if (in_gap) {
103  return eSpliceSiteRead_Gap;
104  } else if (bad_seq) {
105  return eSpliceSiteRead_BadSeq;
106  }
107 
108  // Read splice site seq
109  site[0] = vec[stop - 2];
110  site[1] = vec[stop - 1];
111  } else {
112  return eSpliceSiteRead_OutOfRange;
113  }
114  }
115  // Read donor splice site from plus strand: bases at stop + 1 and stop + 2
116  else {
    // TSeqPos is unsigned, so `seq_len - 2` wraps to a huge value when
    // seq_len < 2; test seq_len first so short sequences are reported as
    // out of range instead of being indexed past their end.
117  if (seq_len > 2 && stop < seq_len - 2) {
118  in_gap = (vec.IsInGap(stop + 1) && vec.IsInGap(stop + 2));
119  if (!in_gap) {
120  bad_seq = (vec[stop + 1] > 250 || vec[stop + 2] > 250);
121  }
122  if (in_gap) {
123  return eSpliceSiteRead_Gap;
124  } else if (bad_seq) {
125  return eSpliceSiteRead_BadSeq;
126  }
127  site[0] = vec[stop + 1];
128  site[1] = vec[stop + 2];
129  } else {
130  return eSpliceSiteRead_OutOfRange;
131  }
132  }
133 
134  // Check canonical donor site: "GT" and non-canonical donor site: "GC"
136  return eSpliceSiteRead_OK;
137  } else {
138  return eSpliceSiteRead_WrongNT;
139  }
140  } catch (CException&) {
    // A failed sequence read must not be reported as a valid donor site.
    // Report it as unreadable sequence, as ReadAcceptorSpliceSite does.
141  return eSpliceSiteRead_BadSeq;
142  }
143 }
144 
145 
148 {
149  char site[2];
150 
151  return ReadDonorSpliceSite(strand, stop, vec, seq_len, site);
152 }
153 
154 
157  ENa_strand strand,
158  TSeqPos start,
159  const CSeqVector& vec,
160  TSeqPos seq_len,
161  TSpliceSite& site)
162 {
    // Reads the two bases of the acceptor splice site adjacent to `start` into
    // `site` and classifies the result:
    //   eSpliceSiteRead_OutOfRange - the two positions do not fit in seq_len
    //   eSpliceSiteRead_Gap        - both positions are inside a gap
    //   eSpliceSiteRead_BadSeq     - a residue value is > 250, or reading the
    //                                sequence raised a CException
    //   eSpliceSiteRead_OK / eSpliceSiteRead_WrongNT - result of the "AG"
    //                                acceptor check at the end
    // `site` is only filled in for the last case.
163  try {
164  bool in_gap;
165  bool bad_seq = false;
166 
167  if (strand == eNa_strand_minus) {
168  // read acceptor splice site from minus strand: bases at start + 1 and start + 2
    // TSeqPos is unsigned, so `seq_len - 2` wraps to a huge value when
    // seq_len < 2; test seq_len first so short sequences are reported as
    // out of range instead of being indexed past their end.
169  if (seq_len > 2 && start < seq_len - 2) {
170  in_gap = (vec.IsInGap(start + 1) && vec.IsInGap(start + 2));
171  if (!in_gap) {
172  bad_seq = (vec[start + 1] > 250 || vec[start + 2] > 250);
173  }
174 
175  if (in_gap) {
176  return eSpliceSiteRead_Gap;
177  } else if (bad_seq) {
178  return eSpliceSiteRead_BadSeq;
179  }
180  site[0] = vec[start + 1];
181  site[1] = vec[start + 2];
182  } else {
183  return eSpliceSiteRead_OutOfRange;
184  }
185  }
186  // read acceptor splice site from plus strand: bases at start - 2 and start - 1
187  else {
188  if (start > 1 && start <= seq_len) {
189  in_gap = (vec.IsInGap(start - 2) && vec.IsInGap(start - 1));
190  if (!in_gap) {
191  bad_seq = (vec[start - 2] > 250 || vec[start - 1] > 250);
192  }
193 
194  if (in_gap) {
195  return eSpliceSiteRead_Gap;
196  } else if (bad_seq) {
197  return eSpliceSiteRead_BadSeq;
198  }
199  site[0] = vec[start - 2];
200  site[1] = vec[start - 1];
201  } else {
202  return eSpliceSiteRead_OutOfRange;
203  }
204  }
205  // Check canonical acceptor site: "AG"
206  if (CheckSpliceSite(kSpliceSiteAG, strand, site)) {
207  return eSpliceSiteRead_OK;
208  } else {
209  return eSpliceSiteRead_WrongNT;
210  }
211  } catch (CException&) {
    // Any toolkit exception while reading the sequence counts as unreadable sequence.
212  return eSpliceSiteRead_BadSeq;
213  }
214 }
215 
216 
219  ENa_strand strand,
220  TSeqPos start,
221  const CSeqVector& vec,
222  TSeqPos seq_len)
223 {
224  char site[2];
225  return ReadAcceptorSpliceSite(strand, start, vec, seq_len, site);
226 }
227 
228 
230 {
    // Returns true when at least one recorded donor or acceptor entry has
    // status BadSeq, Gap or WrongNT. Entries with any other status (for
    // example OK or OutOfRange) do not count as errors here.
231  bool has_errors = false;
232  // donors
    // The `!has_errors` term in the loop condition stops the scan at the first hit.
233  for (auto it = m_DonorProblems.begin(); it != m_DonorProblems.end() && !has_errors; it++) {
234  if (it->first == eSpliceSiteRead_BadSeq || it->first == eSpliceSiteRead_Gap ||
235  it->first == eSpliceSiteRead_WrongNT) {
236  has_errors = true;
237  }
238  }
239  // acceptors
    // Skipped entirely when a donor entry already set has_errors.
240  for (auto it = m_AcceptorProblems.begin(); it != m_AcceptorProblems.end() && !has_errors; it++) {
241  if (it->first == eSpliceSiteRead_BadSeq || it->first == eSpliceSiteRead_Gap ||
242  it->first == eSpliceSiteRead_WrongNT) {
243  has_errors = true;
244  }
245  }
246 
247  return has_errors;
248 }
249 
250 
/// Recomputes the splice-site problem lists and exception flags for one feature.
///
/// Previous results are cleared first. The function then returns with empty
/// lists when loc_handle is null or IsOrganelle(loc_handle) is true, when the
/// exception text contains "low-quality sequence region", when the location
/// mixes plus and minus strand intervals, or when check_all is false and the
/// location has fewer than two intervals.
///
/// @param feat        feature whose location is examined; its subtype selects
///                    the exon, mRNA or coding-region check
/// @param check_all   when false, locations with fewer than two intervals are skipped
/// @param pseudo      when true, no splice sites are read at all
/// @param loc_handle  handle tested with IsOrganelle() and passed on to the
///                    per-subtype checks
251 void CSpliceProblems::CalculateSpliceProblems(const CSeq_feat& feat, bool check_all, bool pseudo, CBioseq_Handle loc_handle)
252 {
253  m_DonorProblems.clear();
254  m_AcceptorProblems.clear();
255  m_ExceptionUnnecessary = false;
256  m_ErrorsNotExpected = true;
257 
258  bool has_errors = false, ribo_slip = false;
259 
260  const CSeq_loc& loc = feat.GetLocation();
261 
262  // skip if organelle
263  if (!loc_handle || IsOrganelle(loc_handle)) {
264  return;
265  }
266 
267  // suppress for specific biological exceptions
268  if (feat.IsSetExcept() && feat.IsSetExcept_text()
269  && (NStr::FindNoCase(feat.GetExcept_text(), "low-quality sequence region") != string::npos)) {
270  return;
271  }
    // "ribosomal slippage" clears m_ErrorsNotExpected but, through ribo_slip,
    // also prevents m_ExceptionUnnecessary from being set at the end.
272  if (feat.IsSetExcept() && feat.IsSetExcept_text()
273  && (NStr::FindNoCase(feat.GetExcept_text(), "ribosomal slippage") != string::npos)) {
274  m_ErrorsNotExpected = false;
275  ribo_slip = true;
276  }
    // Any of the exception texts below clears m_ErrorsNotExpected.
    // NOTE(review): the "low-quality sequence region" alternative in this list
    // can never be true here, because the function already returned above for
    // that text.
277  if (feat.IsSetExcept() && feat.IsSetExcept_text()
278  && (NStr::FindNoCase(feat.GetExcept_text(), "artificial frameshift") != string::npos
279  || NStr::FindNoCase(feat.GetExcept_text(), "nonconsensus splice site") != string::npos
280  || NStr::FindNoCase(feat.GetExcept_text(), "adjusted for low-quality genome") != string::npos
281  || NStr::FindNoCase(feat.GetExcept_text(), "heterogeneous population sequenced") != string::npos
282  || NStr::FindNoCase(feat.GetExcept_text(), "low-quality sequence region") != string::npos
283  || NStr::FindNoCase(feat.GetExcept_text(), "artificial location") != string::npos)) {
284  m_ErrorsNotExpected = false;
285  }
286 
287 
288  // look for mixed strands, skip if found
290 
291  int num_parts = 0;
    // Only explicit plus/minus strands take part in the comparison; intervals
    // with no strand or another strand value are counted but otherwise ignored.
292  for (CSeq_loc_CI si(loc); si; ++si) {
293  if (si.IsSetStrand()) {
294  ENa_strand tmp = si.GetStrand();
295  if (tmp == eNa_strand_plus || tmp == eNa_strand_minus) {
296  if (strand == eNa_strand_unknown) {
297  strand = si.GetStrand();
298  } else if (strand != tmp) {
299  return;
300  }
301  }
302  }
303  num_parts++;
304  }
305 
306  if (!check_all && num_parts < 2) {
307  return;
308  }
309 
310  // Default value for a strand is '+'
311  if (eNa_strand_unknown == strand) {
312  strand = eNa_strand_plus;
313  }
314 
315  // only check for errors if overlapping gene is not pseudo
316  if (!pseudo) {
317  CSeqFeatData::ESubtype subtype = feat.GetData().GetSubtype();
318  switch (subtype) {
320  ValidateSpliceExon(feat, loc_handle, strand);
321  break;
323  ValidateSpliceMrna(feat, loc_handle, strand);
324  break;
326  ValidateSpliceCdregion(feat, loc_handle, strand);
327  break;
328  default:
329  break;
330  }
331  }
332  has_errors = SpliceSitesHaveErrors();
333 
    // The exception is flagged as unnecessary when an error-anticipating
    // exception text was present, no BadSeq/Gap/WrongNT entry was recorded,
    // and the text was not "ribosomal slippage".
    // NOTE(review): for pseudo features nothing is checked, so has_errors is
    // always false here and the flag can still be set - confirm this is intended.
334  if (!m_ErrorsNotExpected && !has_errors && !ribo_slip) {
335  m_ExceptionUnnecessary = true;
336  }
337 }
338 
339 
341 {
342  const CSeq_loc& loc = feat.GetLocation();
343 
344  // Find overlapping feature - mRNA or gene - to identify start / stop exon
345  bool overlap_feat_partial_5 = false; // set to true if 5'- most start of overlapping feature is partial
346  bool overlap_feat_partial_3 = false; // set to true if 3'- most end of overlapping feature is partial
347  TSeqPos overlap_feat_start = 0; // start position of overlapping feature
348  TSeqPos overlap_feat_stop = 0; // stop position of overlapping feature
349 
350  bool overlap_feat_exists = false;
351  // Locate overlapping mRNA feature
353  loc,
356  bsh.GetScope());
357  if (mrna) {
358  overlap_feat_exists = true;
359  overlap_feat_partial_5 = mrna->GetLocation().IsPartialStart(eExtreme_Biological);
360  overlap_feat_start = mrna->GetLocation().GetStart(eExtreme_Biological);
361 
362  overlap_feat_partial_3 = mrna->GetLocation().IsPartialStop(eExtreme_Biological);
363  overlap_feat_stop = mrna->GetLocation().GetStop(eExtreme_Biological);
364  }
365  else {
366  // Locate overlapping gene feature.
368  loc,
371  bsh.GetScope());
372  if (gene) {
373  overlap_feat_exists = true;
374  overlap_feat_partial_5 = gene->GetLocation().IsPartialStart(eExtreme_Biological);
375  overlap_feat_start = gene->GetLocation().GetStart(eExtreme_Biological);
376 
377  overlap_feat_partial_3 = gene->GetLocation().IsPartialStop(eExtreme_Biological);
378  overlap_feat_stop = gene->GetLocation().GetStop(eExtreme_Biological);
379  }
380  }
381 
382  CSeq_loc_CI si(loc);
383  try{
384  CSeq_loc::TRange range = si.GetRange();
385  CConstRef<CSeq_loc> cur_int = si.GetRangeAsSeq_loc();
386  if (cur_int) {
387  CBioseq_Handle bsh_si = bsh.GetScope().GetBioseqHandle(*cur_int);
388 
389  if (bsh_si) {
391 
392  TSeqPos start, stop;
393  if (eNa_strand_minus == strand) {
394  start = range.GetTo();
395  stop = range.GetFrom();
396  } else {
397  start = range.GetFrom();
398  stop = range.GetTo();
399  }
400 
401  if (overlap_feat_exists) {
402  if (!cur_int->IsPartialStop(eExtreme_Biological)) {
403  if (stop == overlap_feat_stop) {
404  if (overlap_feat_partial_3) {
405  ESpliceSiteRead good_donor = ReadDonorSpliceSite(strand, stop, vec, bsh_si.GetInst_Length());
406  m_DonorProblems.push_back(TSpliceProblem(good_donor, stop));
407  }
408  } else {
409  ESpliceSiteRead good_donor = ReadDonorSpliceSite(strand, stop, vec, bsh_si.GetInst_Length());
410  m_DonorProblems.push_back(TSpliceProblem(good_donor, stop));
411  }
412  }
413 
414  if (!cur_int->IsPartialStart(eExtreme_Biological)) {
415  if (start == overlap_feat_start) {
416  if (overlap_feat_partial_5) {
417  ESpliceSiteRead good_acceptor = ReadAcceptorSpliceSite(strand, start, vec, bsh_si.GetInst_Length());
418  m_AcceptorProblems.push_back(TSpliceProblem(good_acceptor, start));
419  }
420  } else {
421  ESpliceSiteRead good_acceptor = ReadAcceptorSpliceSite(strand, start, vec, bsh_si.GetInst_Length());
422  m_AcceptorProblems.push_back(TSpliceProblem(good_acceptor, start));
423  }
424  }
425  } else {
426  // Overlapping feature - mRNA or gene - not found.
427  if (!cur_int->IsPartialStop(eExtreme_Biological)) {
428  ESpliceSiteRead good_donor = ReadDonorSpliceSite(strand, stop, vec, bsh_si.GetInst_Length());
429  m_DonorProblems.push_back(TSpliceProblem(good_donor, stop));
430  }
431  if (!cur_int->IsPartialStart(eExtreme_Biological)) {
432  ESpliceSiteRead good_acceptor = ReadAcceptorSpliceSite(strand, start, vec, bsh_si.GetInst_Length());
433  m_AcceptorProblems.push_back(TSpliceProblem(good_acceptor, start));
434  }
435  }
436  }
437  }
438  } catch (const CException& ) {
439  ;
440  } catch (const std::exception& ) {
441  ;// could get errors from CSeqVector
442  }
443 }
444 
446 {
447  const CSeq_loc& loc = feat.GetLocation();
448 
449  bool ignore_mrna_partial5 = false;
450  bool ignore_mrna_partial3 = false;
451 
452  // Retrieve overlapping cdregion
454  loc,
457  bsh.GetScope());
458  if (cds) {
459  // If there is no UTR information, then the mRNA location should agree with its CDS location,
460  // but the mRNA should be marked partial at its 5' and 3' ends
461  // Do not check splice site (either donor or acceptor) if CDS location's start / stop is complete.
464  ignore_mrna_partial5 = true;
465  }
468  ignore_mrna_partial3 = true;
469  }
470  }
471 
472  TSeqPos start;
473  TSeqPos stop;
474 
475  CSeq_loc_CI head(loc);
476  if (head) {
477  // Validate acceptor site of 5'- most feature
478  const CSeq_loc& part = head.GetEmbeddingSeq_loc();
479  CSeq_loc::TRange range = head.GetRange();
480  CBioseq_Handle bsh_head1 = bsh.GetScope().GetBioseqHandle(*head.GetRangeAsSeq_loc());
481  if (bsh_head1) {
483 
484  if (strand == eNa_strand_minus) {
485  start = range.GetTo();
486  } else {
487  start = range.GetFrom();
488  }
489  if (part.IsPartialStart(eExtreme_Biological) && !ignore_mrna_partial5) {
490  ESpliceSiteRead good_acceptor = ReadAcceptorSpliceSite(strand, start, vec, bsh_head1.GetInst_Length());
491  m_AcceptorProblems.push_back(TSpliceProblem(good_acceptor, start));
492  }
493  }
494 
495  CSeq_loc_CI tail(loc);
496  ++tail;
497 
498  // Validate adjacent (donor...acceptor) splice sites.
499  // @head is a location of exon that contributes `donor site`
500  // @tail is a location of exon that contributes `acceptor site`
501  for(; tail; ++head, ++tail) {
502  CSeq_loc::TRange range_head = head.GetRange();
503  CSeq_loc::TRange range_tail = tail.GetRange();
504  CBioseq_Handle bsh_head = bsh.GetScope().GetBioseqHandle(*head.GetRangeAsSeq_loc());
505  CBioseq_Handle bsh_tail = bsh.GetScope().GetBioseqHandle(*tail.GetRangeAsSeq_loc());
506  if (bsh_head && bsh_tail) {
507  try {
510 
511  if (strand == eNa_strand_minus) {
512  start = range_tail.GetTo();
513  stop = range_head.GetFrom();
514  } else {
515  start = range_tail.GetFrom();
516  stop = range_head.GetTo();
517  }
518  ValidateDonorAcceptorPair(
519  strand,
520  stop, vec_head, bsh_head.GetInst_Length(),
521  start, vec_tail, bsh_tail.GetInst_Length());
522  } catch (CSeqVectorException&) {
523  }
524  }
525  }
526  }
527 
528  // Validate donor site of 3'most feature
529  if(head) {
530  const CSeq_loc& part = head.GetEmbeddingSeq_loc();
531  CSeq_loc::TRange range = head.GetRange();
532  CBioseq_Handle bsh_head = bsh.GetScope().GetBioseqHandle(*head.GetRangeAsSeq_loc());
533  if (bsh_head) {
535 
536  if (strand == eNa_strand_minus) {
537  stop = range.GetFrom();
538  } else {
539  stop = range.GetTo();
540  }
541  if (part.IsPartialStop(eExtreme_Biological) && !ignore_mrna_partial3) {
542  ESpliceSiteRead good_donor = ReadDonorSpliceSite(strand, stop, vec, bsh_head.GetInst_Length());
543  m_DonorProblems.push_back(TSpliceProblem(good_donor, stop));
544  }
545  }
546  }
547 }
548 
550 {
551  const CSeq_loc& loc = feat.GetLocation();
552 
553  TSeqPos start;
554  TSeqPos stop;
555 
556  CSeq_loc_CI head(loc);
557  if (head) {
558  // Validate acceptor site of 5'- most feature
559  const CSeq_loc& part = head.GetEmbeddingSeq_loc();
560  CSeq_loc::TRange range = head.GetRange();
561  CBioseq_Handle bsh_head1 = bsh.GetScope().GetBioseqHandle(*head.GetRangeAsSeq_loc());
562  if (bsh_head1) {
563  try {
565  if (part.IsPartialStart(eExtreme_Biological)) {
566  if (strand == eNa_strand_minus) {
567  start = range.GetTo();
568  } else {
569  start = range.GetFrom();
570  }
571  ESpliceSiteRead good_acceptor = ReadAcceptorSpliceSite(strand, start, vec, bsh_head1.GetInst_Length());
572  m_AcceptorProblems.push_back(TSpliceProblem(good_acceptor, start));
573  }
574  } catch (CSeqVectorException&) {
575  }
576  }
577 
578  CSeq_loc_CI tail(loc);
579  ++tail;
580 
581  // Validate adjacent (donor...acceptor) splice sites.
582  // @head is a location of exon that contributes `donor site`
583  // @tail is a location of exon that contributes `acceptor site`
584  for(; tail; ++head, ++tail) {
585  CSeq_loc::TRange range_head = head.GetRange();
586  CSeq_loc::TRange range_tail = tail.GetRange();
587  CBioseq_Handle bsh_head = bsh.GetScope().GetBioseqHandle(*head.GetRangeAsSeq_loc());
588  CBioseq_Handle bsh_tail = bsh.GetScope().GetBioseqHandle(*tail.GetRangeAsSeq_loc());
589  if (bsh_head && bsh_tail) {
590  try {
593 
594  if (strand == eNa_strand_minus) {
595  start = range_tail.GetTo();
596  stop = range_head.GetFrom();
597  } else {
598  start = range_tail.GetFrom();
599  stop = range_head.GetTo();
600  }
601  ValidateDonorAcceptorPair(
602  strand,
603  stop, vec_head, bsh_head.GetInst_Length(),
604  start, vec_tail, bsh_tail.GetInst_Length());
605  } catch (CSeqVectorException&) {
606  }
607  }
608  }
609  }
610 
611  // Validate donor site of 3'most feature
612  if(head) {
613  const CSeq_loc& part = head.GetEmbeddingSeq_loc();
614  CSeq_loc::TRange range = head.GetRange();
615  CBioseq_Handle bsh_head = bsh.GetScope().GetBioseqHandle(*head.GetRangeAsSeq_loc());
616  if (bsh_head) {
617  try {
619 
620  if (strand == eNa_strand_minus) {
621  stop = range.GetFrom();
622  } else {
623  stop = range.GetTo();
624  }
625  if (part.IsPartialStop(eExtreme_Biological)) {
626  ESpliceSiteRead good_donor = ReadDonorSpliceSite(strand, stop, vec, bsh_head.GetInst_Length());
627  m_DonorProblems.push_back(TSpliceProblem(good_donor, stop));
628  }
629  } catch (CSeqVectorException&) {
630  }
631  }
632  }
633 }
634 
635 
/// Exact, case-sensitive match against the nucleotide letter 'G'.
636 static bool s_EqualsG(Char c)
637 {
638  return 'G' == c;
639 }
640 
/// Exact, case-sensitive match against the nucleotide letter 'C'.
641 static bool s_EqualsC(Char c)
642 {
643  return 'C' == c;
644 }
645 
/// Exact, case-sensitive match against the nucleotide letter 'A'.
646 static bool s_EqualsA(Char c)
647 {
648  return 'A' == c;
649 }
650 
/// Exact, case-sensitive match against the nucleotide letter 'T'.
651 static bool s_EqualsT(Char c)
652 {
653  return 'T' == c;
654 }
655 
/// Tests a donor/acceptor base pair against one named splice-site signature.
///
/// The table entry whose strand and id equal `strand` and `signature` is
/// looked up, and its four per-base predicates are applied to donor[0],
/// donor[1], acceptor[0] and acceptor[1].
///
/// @param signature  id of the signature to test (compared with each entry's id)
/// @param strand     strand the bases were read from; selects the table entry
/// @param donor      the two donor-site bases
/// @param acceptor   the two acceptor-site bases
/// @return true when all four predicates of the matching entry hold
/// @throws CCoreException (eCore) when no entry matches `signature` and `strand`
656 bool CheckAdjacentSpliceSites(const string& signature, ENa_strand strand, TConstSpliceSite donor, TConstSpliceSite acceptor)
657 {
    // One entry per (signature, strand) combination; each entry carries one
    // predicate for every base position of the donor and the acceptor.
658  static
659  struct tagSpliceSiteInfo
660  {
661  const string& id;
662  ENa_strand strand;
663  bool(*check_donor0)(Char);
664  bool(*check_donor1)(Char);
665  bool(*check_acceptor0)(Char);
666  bool(*check_acceptor1)(Char);
667  }
668  SpliceSiteInfo[] = {
669  // 5' << GT...AG <<
671  // >> CT...AC >>, reverse complement
673  // 5' << GC...AG <<
675  // >> CT...GC >>, reverse complement
677  // 5' << AT...AC <<
679  // >> GT...AT >>, reverse complement
681  };
682  static int size = sizeof(SpliceSiteInfo) / sizeof(struct tagSpliceSiteInfo);
683 
    // Linear scan; the first entry matching both strand and signature decides the result.
684  for (int i = 0; i < size; ++i) {
685  struct tagSpliceSiteInfo* entry = &SpliceSiteInfo[i];
686  if (strand == entry->strand && entry->id == signature) {
687  return (entry->check_donor0(donor[0]) && entry->check_donor1(donor[1]) &&
688  entry->check_acceptor0(acceptor[0]) && entry->check_acceptor1(acceptor[1]));
689  }
690  }
691 
    // Reached only when the table has no entry for this signature/strand combination.
692  NCBI_THROW(CCoreException, eCore, "Unknown splice site signature.");
693 }
694 
695 
/// Tests a single two-base splice site against one named signature.
///
/// The table entry whose strand and id equal `strand` and `signature` is
/// looked up, and its two per-base predicates are applied to site[0] and site[1].
///
/// @param signature  id of the signature to test (compared with each entry's id)
/// @param strand     strand the bases were read from; selects the table entry
/// @param site       the two bases to test
/// @return true when both predicates of the matching entry hold
/// @throws CCoreException (eCore) when no entry matches `signature` and `strand`
696 bool CheckSpliceSite(const string& signature, ENa_strand strand, TConstSpliceSite site)
697 {
    // One entry per (signature, strand) combination, with one predicate per base position.
698  static
699  struct tagSpliceSiteInfo
700  {
701  const string& id;
702  ENa_strand strand;
703  bool(*check_site0)(Char);
704  bool(*check_site1)(Char);
705  }
706  SpliceSiteInfo[] = {
707  // 5' << GT... <<
709  // >> ...AC >>, reverse complement
711  // 5' << ...AG <<
713  // >> CT...>>, reverse complement
715  // 5' << GC... <<
717  // >> ...GC >>, reverse complement
719  };
720  static int size = sizeof(SpliceSiteInfo) / sizeof(struct tagSpliceSiteInfo);
721 
    // Linear scan; the first entry matching both strand and signature decides the result.
722  for (int i = 0; i < size; ++i) {
723  struct tagSpliceSiteInfo* entry = &SpliceSiteInfo[i];
724  if (strand == entry->strand && entry->id == signature) {
725  return (entry->check_site0(site[0]) && entry->check_site1(site[1]));
726  }
727  }
728 
    // Reached only when the table has no entry for this signature/strand combination.
729  NCBI_THROW(CCoreException, eCore, "Unknown splice site signature.");
730 }
731 
732 
734 {
735  return (CheckAdjacentSpliceSites(kSpliceSiteGTAG, strand, donor, acceptor) ||
736  CheckAdjacentSpliceSites(kSpliceSiteGCAG, strand, donor, acceptor) ||
737  CheckAdjacentSpliceSites(kSpliceSiteATAC, strand, donor, acceptor));
738 }
739 
741 {
742  return (CheckSpliceSite(kSpliceSiteGT, strand, donor) ||
743  CheckSpliceSite(kSpliceSiteGC, strand, donor));
744 }
745 
747 {
748  return CheckSpliceSite(kSpliceSiteAG, strand, acceptor);
749 }
750 
751 END_SCOPE(validator)
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
CBioseq_Handle –.
CCoreException –.
Definition: ncbiexpt.hpp:1476
ESubtype GetSubtype(void) const
SeqVector related exceptions.
CSeqVector –.
Definition: seq_vector.hpp:65
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
void ValidateSpliceCdregion(const CSeq_feat &feat, const CBioseq_Handle &bsh, ENa_strand strand)
void CalculateSpliceProblems(const CSeq_feat &feat, bool check_all, bool pseudo, CBioseq_Handle loc_handle)
void ValidateSpliceExon(const CSeq_feat &feat, const CBioseq_Handle &bsh, ENa_strand strand)
ESpliceSiteRead ReadAcceptorSpliceSite(ENa_strand strand, TSeqPos start, const CSeqVector &vec, TSeqPos seq_len, TSpliceSite &site)
ESpliceSiteRead ReadDonorSpliceSite(ENa_strand strand, TSeqPos stop, const CSeqVector &vec, TSeqPos seq_len, TSpliceSite &site)
void ValidateSpliceMrna(const CSeq_feat &feat, const CBioseq_Handle &bsh, ENa_strand strand)
pair< size_t, TSeqPos > TSpliceProblem
void ValidateDonorAcceptorPair(ENa_strand strand, TSeqPos stop, const CSeqVector &vec_donor, TSeqPos seq_len_donor, TSeqPos start, const CSeqVector &vec_acceptor, TSeqPos seq_len_acceptor)
Include a standard set of the NCBI C++ Toolkit most basic headers.
The NCBI C++ standard methods for dealing with std::string.
#define head
Definition: ct_nlmzip_i.h:138
static const char si[8][64]
Definition: des.c:146
#define bool
Definition: bool.h:34
static char tmp[3200]
Definition: utf8.c:42
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define Char
Definition: ncbistd.hpp:124
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
bool IsPartialStart(ESeqLocExtremes ext) const
check start or stop of location for e_Lim fuzz
Definition: Seq_loc.cpp:3222
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
bool IsPartialStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:3251
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
CMappedFeat GetBestOverlappingFeat(const CMappedFeat &feat, CSeqFeatData::ESubtype need_subtype, sequence::EOverlapType overlap_type, CFeatTree *feat_tree=0, const SAnnotSelector *base_sel=0)
Definition: feature.cpp:3653
@ eOverlap_Contains
2nd contains 1st extremes
@ eOverlap_Contained
2nd contained within 1st extremes
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
TInst_Length GetInst_Length(void) const
CScope & GetScope(void) const
Get scope this handle belongs to.
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
bool IsInGap(TSeqPos pos) const
true if sequence at 0-based position 'pos' has gap Note: this method is not MT-safe,...
Definition: seq_vector.hpp:277
char Char
Alias for char.
Definition: ncbitype.h:93
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2993
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
bool IsSetExcept(void) const
something funny about this? Check if a value has been assigned to Except data member.
Definition: Seq_feat_.hpp:990
const TExcept_text & GetExcept_text(void) const
Get the Except_text member data.
Definition: Seq_feat_.hpp:1405
bool IsSetExcept_text(void) const
explain if except=TRUE Check if a value has been assigned to Except_text data member.
Definition: Seq_feat_.hpp:1393
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
int i
range(_Ty, _Ty) -> range< _Ty >
const struct ncbi::grid::netcache::search::fields::SIZE size
ESERV_Site site
bool ConsistentWithT(Char ch)
Definition: utilities.cpp:2886
bool IsOrganelle(int genome)
Definition: utilities.cpp:2831
bool ConsistentWithA(Char ch)
Definition: utilities.cpp:2871
bool ConsistentWithC(Char ch)
Definition: utilities.cpp:2876
bool ConsistentWithG(Char ch)
Definition: utilities.cpp:2881
bool CheckIntronAcceptor(ENa_strand strand, TConstSpliceSite acceptor)
bool CheckIntronSpliceSites(ENa_strand strand, TConstSpliceSite donor, TConstSpliceSite acceptor)
static bool s_EqualsA(Char c)
static bool s_EqualsT(Char c)
static bool s_EqualsG(Char c)
bool CheckAdjacentSpliceSites(const string &signature, ENa_strand strand, TConstSpliceSite donor, TConstSpliceSite acceptor)
static bool s_EqualsC(Char c)
bool CheckIntronDonor(ENa_strand strand, TConstSpliceSite donor)
bool CheckSpliceSite(const string &signature, ENa_strand strand, TConstSpliceSite site)
const string kSpliceSiteGT
const string kSpliceSiteGTAG
Char const (& TConstSpliceSite)[2]
const string kSpliceSiteGC
const string kSpliceSiteAG
Char(& TSpliceSite)[2]
const string kSpliceSiteATAC
const string kSpliceSiteGCAG
Modified on Wed Apr 24 14:19:11 2024 by modify_doxy.py rev. 669887