NCBI C++ ToolKit
nw_spliced_aligner.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: nw_spliced_aligner.cpp 102883 2024-08-02 16:56:49Z badrazat $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Yuri Kapustin
27 *
28 * File Description: Base class for spliced aligners.
29 *
30 * ===========================================================================
31 *
32 */
33 
34 
35 #include <ncbi_pch.hpp>
36 
37 #include "messages.hpp"
41 
43 
44 
46  m_IntronMinSize(GetDefaultIntronMinSize()),
47  m_cds_start(0), m_cds_stop(0)
48 {
49  SetEndSpaceFree(true, true, false, false);
50 }
51 
52 
53 CSplicedAligner::CSplicedAligner(const char* seq1, size_t len1,
54  const char* seq2, size_t len2)
55  : CBandAligner(seq1, len1, seq2, len2),
56  m_IntronMinSize(GetDefaultIntronMinSize())
57 {
58  SetEndSpaceFree(true, true, false, false);
59 }
60 
61 
62 CSplicedAligner::CSplicedAligner(const string& seq1, const string& seq2)
63  : CBandAligner(seq1, seq2),
64  m_IntronMinSize(GetDefaultIntronMinSize())
65 {
66  SetEndSpaceFree(true, true, false, false);
67 }
68 
69 
70 void CSplicedAligner::SetWi (unsigned char splice_type, TScore value)
71 {
72  if(splice_type < GetSpliceTypeCount()) {
73  x_GetSpliceScores()[splice_type] = value;
74  }
75  else
76  {
78  eInvalidSpliceTypeIndex,
80  }
81 }
82 
83 
85 {
86  if(splice_type < GetSpliceTypeCount()) {
87  return x_GetSpliceScores()[splice_type];
88  }
89  else
90  {
92  eInvalidSpliceTypeIndex,
94  }
95 }
96 
97 
99 {
101 }
102 
103 
105 
106 public:
107 
108  typedef Uint1 TRank;
109 
111  {
112  m_Ranks.assign(0xFFFF, 0);
113 
114  const vector<Uint2> ascii (0xFF, 0xFFFF);
115  m_Char2Bits.push_back(ascii);
116  m_Char2Bits.push_back(ascii);
117  m_Char2Bits.push_back(ascii);
118  m_Char2Bits.push_back(ascii);
119 
120  Uint2 cA(0x00), cG(0x01), cC(0x02), cT(0x03);
121 
122  for(Uint1 k (0); k < 4; ++k) {
123  m_Char2Bits[k]['A'] = cA; cA <<= 4;
124  m_Char2Bits[k]['G'] = cG; cG <<= 4;
125  m_Char2Bits[k]['C'] = cC; cC <<= 4;
126  m_Char2Bits[k]['T'] = cT; cT <<= 4;
127  }
128 
129  m_Ranks[x_GetIdx("GAAG")] =
130  m_Ranks[x_GetIdx("GTTG")] =
131  m_Ranks[x_GetIdx("TTAG")] = 1;
132  }
133 
134  TRank GetRank(const char * donor, const char * acceptor) const {
135 
136  return m_Ranks[ m_Char2Bits[0] [donor[0]]
137  | m_Char2Bits[1] [donor[1]]
138  | m_Char2Bits[2] [acceptor[0]]
139  | m_Char2Bits[3] [acceptor[1]] ];
140  }
141 
142 private:
143 
144  vector<TRank> m_Ranks;
145  vector<vector<Uint2> > m_Char2Bits;
146 
147  Uint2 x_GetIdx(const char splice[4]) const {
148  return
149  m_Char2Bits[0][splice[0]]
150  | m_Char2Bits[1][splice[1]]
151  | m_Char2Bits[2][splice[2]]
152  | m_Char2Bits[3][splice[3]];
153  }
154 };
155 
156 
// Single CSpliceRanker instance for this file; the unnamed namespace gives
// it internal linkage, so it is not visible to other translation units.
// NOTE(review): no use of g_Ranker appears in the visible part of this
// file -- confirm it is still referenced before keeping it.
157 namespace {
158  CSpliceRanker g_Ranker;
159 }
160 
161 
// Look up the priority weight of a splice described by its donor and
// acceptor bases.
//
// dnr  points at the donor bases. The first two are always compared; a third
//      is read only for the three-base "ATA" signature, and only after the
//      first two have already matched "AT".
// acc  points at the two acceptor bases.
//
// Returns the weight of the first table entry matching both sides, or 0 when
// no entry matches. Entries are tested in table order.
size_t GetSplicePriority(const char * dnr, const char* acc)
{
    // One row per recognised signature. Keeping the bases and the weight in
    // the same row means they cannot drift out of step, and the table needs
    // no terminating sentinel.
    struct SSpliceWeight {
        const char* donor;     // two or three bases
        const char* acceptor;  // two bases
        size_t      weight;
    };

    static const SSpliceWeight kSpliceWeights[] = {
        { "GT",  "TG", 18 },
        { "AT",  "AG", 14 },
        { "GA",  "AG", 10 },
        { "GT",  "GG", 12 },
        { "AT",  "AA", 10 },  // artificially weighting this higher than AT-AT
        { "ATA", "AT",  8 },
        { "GG",  "AG",  7 },
        { "GT",  "AT",  6 },
        { "TT",  "AG",  6 },
        { "GT",  "AA",  2 }
    };
    static const size_t kSpliceWeightCount =
        sizeof(kSpliceWeights) / sizeof(kSpliceWeights[0]);

    for (size_t i = 0; i < kSpliceWeightCount; ++i) {
        const SSpliceWeight& entry = kSpliceWeights[i];

        // A two-base donor has its terminator at index 2, so the third
        // donor base of the input is examined only for three-base entries.
        const bool donor_matches =
            entry.donor[0] == dnr[0] &&
            entry.donor[1] == dnr[1] &&
            (entry.donor[2] == '\0' || entry.donor[2] == dnr[2]);

        if (donor_matches &&
            entry.acceptor[0] == acc[0] &&
            entry.acceptor[1] == acc[1])
        {
            return entry.weight;
        }
    }

    return 0;
}
204 
205 
206 // Prefer experimentally verified non-consensus
207 // splices among equally scoring alternatives
209 {
210  if(m_Transcript.size() == 0) {
212 
213  }
214 
215  const char * p1 (GetSeq1()), * p2 (GetSeq2());
216  for(Int8 t (m_Transcript.size() - 1), csq_matches(0); t >= 0; --t) {
217 
218  switch(m_Transcript[t]) {
219 
220  case eTS_Match:
221  ++csq_matches;
222  ++p1;
223  ++p2;
224  break;
225 
226  case eTS_Replace:
227  csq_matches = 0;
228  ++p1;
229  ++p2;
230  break;
231 
232  case eTS_Insert:
233  case eTS_SlackInsert:
234  csq_matches = 0;
235  ++p2;
236  break;
237 
238  case eTS_Delete:
239  case eTS_SlackDelete:
240  csq_matches = 0;
241  ++p1;
242  break;
243 
244  case eTS_Intron: {
245  int ti (static_cast<int>(t - 1));
246  for(; ti >= 0 && m_Transcript[ti] == eTS_Intron; --ti);
247  const int ilen (static_cast<int>(t - ti));
248  const char * p2e (p2 + ilen);
249  if(CNWFormatter::SSegment::s_IsConsensusSplice(p2, p2e - 2, true)) {
250  p2 += ilen;
251  t = ti + 1;
252  }
253  else {
254 
255  size_t maxpr = GetSplicePriority(p2, p2e - 2);
256  int jmaxpr (0);
257  // explore the left of the splice
258  int j;
259  for(j = -1; *(p1 + j) == *(p2e + j) && j >= -csq_matches; --j) {
260  const size_t pr (GetSplicePriority(p2 + j, p2e + j - 2));
261  if(pr > maxpr) {
262  maxpr = pr;
263  jmaxpr = j;
264  }
265  }
266 
267  // explore the right of the splice
268  for(j = 1; j <= ti + 1 && m_Transcript[ti - j + 1] == eTS_Match
269  && *(p1 + j - 1) == *(p2 + j - 1); ++j)
270  {
271  const size_t pr (GetSplicePriority(p2 + j, p2e + j - 2));
272  if(pr > maxpr) {
273  maxpr = pr;
274  jmaxpr = j;
275  }
276  }
277 
278  if(jmaxpr == 0) {
279  p2 += ilen;
280  t = ti + 1;
281  }
282  else {
283  // adjust the splice site
284  const int incr (jmaxpr < 0? -1: 1);
285  for(int k (0); k != jmaxpr; k += incr) {
286  swap(m_Transcript[t-k], m_Transcript[ti-k]);
287  }
288  t = ti - jmaxpr + 1;
289  p1 += jmaxpr;
290  p2 = p2 + ilen + jmaxpr;
291  }
292  }
293  csq_matches = 0;
294  }
295  break;
296 
297  default:
298  NCBI_THROW(CAlgoAlignException, eInternal, "Unexpected transcript symbol");
299  }
300  }
301 }
302 
303 
Uint2 x_GetIdx(const char splice[4]) const
TRank GetRank(const char *donor, const char *acceptor) const
vector< vector< Uint2 > > m_Char2Bits
vector< TRank > m_Ranks
static bool s_IsConsensusSplice(const char *donor, const char *acceptor, bool semi_as_cons=false)
virtual bool x_CheckMemoryLimit(void)
const char * GetSeq2(void) const
Definition: nw_aligner.hpp:171
const char * GetSeq1(void) const
Definition: nw_aligner.hpp:169
TTranscript m_Transcript
Definition: nw_aligner.hpp:314
void SetEndSpaceFree(bool Left1, bool Right1, bool Left2, bool Right2)
Definition: nw_aligner.cpp:192
virtual size_t GetSpliceTypeCount(void)=0
TScore GetWi(unsigned char splice_type)
void SetWi(unsigned char splice_type, TScore value)
virtual TScore * x_GetSpliceScores()=0
virtual bool x_CheckMemoryLimit(void)
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint16_t Uint2
2-byte (16-bit) unsigned integer
Definition: ncbitype.h:101
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
n font weight
int i
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
EIPRangeType t
Definition: ncbi_localip.c:101
size_t GetSplicePriority(const char *dnr, const char *acc)
const char g_msg_NoAlignment[]
Definition: messages.hpp:38
const char g_msg_InvalidSpliceTypeIndex[]
Definition: messages.hpp:21
Modified on Fri Sep 20 14:58:02 2024 by modify_doxy.py rev. 669887