NCBI C++ ToolKit
polya.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: polya.hpp 78930 2017-07-31 13:06:45Z dicuccio $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Philip Johnson
27 *
28 * File Description: finds mRNA 3' modification (poly-A tails)
29 *
30 * ---------------------------------------------------------------------------
31 */
32 #ifndef ALGO_SEQUENCE___POLYA__HPP
33 #define ALGO_SEQUENCE___POLYA__HPP
34 
35 #include <corelib/ncbistd.hpp>
36 #include <util/range.hpp>
37 
39 
/// Kind of poly-nucleotide run reported by FindPolyTail().
enum EPolyTail {
    ePolyTail_None, ///< neither a poly-A tail nor a poly-T head was found
    ePolyTail_A3,   ///< 3' poly-A tail
    ePolyTail_T5    ///< 5' poly-T head (submitted to db reversed?)
};
45 
46 
47 ///////////////////////////////////////////////////////////////////////////////
48 /// PRE : two random access iterators pointing to sequence data [begin,
49 /// end)
50 /// POST: poly-A tail cleavage site, if any (-1 if not)
51 template <typename Iterator>
53 FindPolyA(Iterator begin, Iterator end);
54 
55 ///////////////////////////////////////////////////////////////////////////////
56 /// PRE : two random access iterators pointing to sequence data [begin,
57 /// end); minimum length for tail
58 /// POST: cleavageSite (if any) and whether we found a poly-A tail, a poly-T
59 /// head, or neither
60 template <typename Iterator>
62 FindPolyTail(Iterator begin, Iterator end,
63  TSignedSeqPos &cleavageSite,
64  TSeqPos min_length = 1);
65 
66 ///////////////////////////////////////////////////////////////////////////////
67 /// PRE : two random access iterators pointing to sequence data [begin,
68 /// end); maximum number of non-A bases that are allowed to follow the tail
69 /// POST: poly-A tail range, if any (empty range if not)
70 template <typename Iterator>
72 FindPolyARange(Iterator begin, Iterator end, TSeqPos max_following_bases);
73 
74 ///////////////////////////////////////////////////////////////////////////////
75 /// PRE : two random access iterators pointing to sequence data [begin,
76 /// end); minimum length for tail; maximum number of non-A bases that
77 /// are allowed to follow the tail
78 /// POST: poly-tail range (if any) and whether we found a poly-A tail, a poly-T
79 /// head, or neither
80 template <typename Iterator>
82 FindPolyTail(Iterator begin, Iterator end,
83  TSeqRange &tail,
84  TSeqPos min_length = 1,
85  TSeqPos max_following_bases = 0);
86 
87 
88 ///////////////////////////////////////////////////////////////////////////////
89 /// Implementation [in header because of templates]
90 
/// Iterator adaptor that presents the reverse complement of the sequence
/// an underlying iterator walks over.
///
/// The adaptor stores a "base" iterator and always reads the element just
/// BEFORE it, so CRevComp_It(end) designates the last element of the
/// underlying range and CRevComp_It(begin) is the one-past-the-end position
/// of the reversed view.  Every movement and comparison is mirrored:
/// advancing the adaptor moves the base iterator backwards, and `a < b`
/// holds when a's base lies after b's base.
///
/// Only upper-case 'A', 'C', 'G' and 'T' are complemented; any other value
/// is passed through unchanged.
template<typename Iterator>
class CRevComp_It {
public:
    /// Value-initialise the wrapped iterator so that a default-constructed
    /// adaptor never holds an indeterminate value.
    CRevComp_It(void) : m_Base() {}

    /// Wrap `it`; dereferencing yields the complement of the element
    /// immediately before `it`.  Intentionally left implicit so existing
    /// conversions from Iterator keep compiling.
    CRevComp_It(const Iterator &it) : m_Base(it) {}

    /// Complement of the element preceding the wrapped iterator.
    char operator*(void) const {
        Iterator tmp = m_Base;
        --tmp;
        switch (*tmp) {
        case 'A': return 'T';
        case 'T': return 'A';
        case 'C': return 'G';
        case 'G': return 'C';
        default:  return *tmp;  // non-ACGT values are passed through as-is
        }
    }

    // Increment / decrement: mirrored onto the wrapped iterator.
    CRevComp_It& operator++(void) {
        --m_Base;
        return *this;
    }
    CRevComp_It operator++(int) {
        CRevComp_It it(*this);
        --m_Base;
        return it;
    }
    CRevComp_It& operator--(void) {
        ++m_Base;
        return *this;
    }
    CRevComp_It operator--(int) {
        CRevComp_It it(*this);
        ++m_Base;
        return it;
    }

    // Random-access arithmetic, likewise mirrored.
    CRevComp_It& operator+=(int i) {
        m_Base -= i;
        return *this;
    }
    CRevComp_It& operator-=(int i) {
        m_Base += i;
        return *this;
    }
    CRevComp_It operator+ (int i) const {
        CRevComp_It it(m_Base);
        it += i;
        return it;
    }
    CRevComp_It operator- (int i) const {
        CRevComp_It it(m_Base);
        it -= i;
        return it;
    }
    /// Distance from `it` to *this in the reversed view (operands of the
    /// underlying subtraction are swapped on purpose).
    int operator- (const CRevComp_It &it) const {
        return it.m_Base - m_Base;
    }

    //booleans (ordering is the inverse of the wrapped iterators' ordering)
    bool operator>= (const CRevComp_It &it) const {
        return m_Base <= it.m_Base;
    }
    bool operator>  (const CRevComp_It &it) const {
        return m_Base < it.m_Base;
    }
    bool operator<= (const CRevComp_It &it) const {
        return m_Base >= it.m_Base;
    }
    bool operator<  (const CRevComp_It &it) const {
        return m_Base > it.m_Base;
    }
    bool operator== (const CRevComp_It &it) const {
        return m_Base == it.m_Base;
    }
    bool operator!= (const CRevComp_It &it) const {
        return m_Base != it.m_Base;
    }
private:
    Iterator m_Base;  ///< one past the element this adaptor dereferences to
};
171 
172 
///////////////////////////////////////////////////////////////////////////////
// Locate the first occurrence of the pattern [first2, last2) inside
// [first1, last1).
//
// PRE : same conditions as STL 'search', except that both iterator pairs
//       must support subtraction with a result convertible to ptrdiff_t
// POST: same as STL 'search': iterator to the start of the first match, or
//       last1 when there is none (an empty pattern matches at first1)
template <typename ForwardIterator1, typename ForwardIterator2>
ForwardIterator1 ItrSearch(ForwardIterator1 first1, ForwardIterator1 last1,
                           ForwardIterator2 first2, ForwardIterator2 last2)
{
    const ptrdiff_t patternLen = last2 - first2;
    ptrdiff_t remaining = last1 - first1;  // elements left from first1 on

    // The pattern cannot fit at all.
    if (remaining < patternLen) {
        return last1;
    }

    ForwardIterator1 textPos = first1;
    ForwardIterator2 patternPos = first2;

    while (patternPos != last2) {
        if (*textPos == *patternPos) {
            ++textPos;
            ++patternPos;
            continue;
        }

        // Mismatch: give up when no later start position leaves enough
        // room for the pattern, otherwise retry one element further on.
        if (remaining == patternLen) {
            return last1;
        }
        --remaining;
        ++first1;
        textPos = first1;
        patternPos = first2;
    }

    // The whole pattern was consumed, so first1 is where the match starts.
    return first1;
}
205 
206 ///////////////////////////////////////////////////////////////////////////////
207 // PRE : two random access iterators pointing to sequence data [begin,
208 // end)
209 // POST: poly-A tail cleavage site, if any (-1 if not)
210 template <typename Iterator>
211 TSignedSeqPos FindPolyA(Iterator begin, Iterator end)
212 {
213  TSeqRange tail_found = FindPolyARange(begin, end, 0);
214  return tail_found.Empty() ? -1 : tail_found.GetFrom();
215 }
216 
217 ///////////////////////////////////////////////////////////////////////////////
218 /// PRE : two random access iterators pointing to sequence data [begin,
219 /// end)
220 /// POST: poly-A tail range, if any (empty range if not)
221 template <typename Iterator>
222 TSeqRange FindPolyARange(Iterator begin, Iterator end, TSeqPos max_following_bases)
223 {
224  string motif1("AATAAA");
225  string motif2("ATTAAA");
226 
227  Iterator pos = begin;
228 
229  Iterator uStrmMotif = pos;
230  while (uStrmMotif != end) {
231  pos = uStrmMotif;
232  uStrmMotif = ItrSearch(pos, end, motif1.begin(), motif1.end());
233  if (uStrmMotif == end) {
234  uStrmMotif = ItrSearch(pos, end, motif2.begin(), motif2.end());
235  }
236 
237  if (uStrmMotif != end) {
238  if (end - uStrmMotif < 16) { // skip over upstream motif, and at least 10 more
239  break;
240  }
241  pos = uStrmMotif + 15;
242  ++uStrmMotif;
243 
244  Iterator maxCleavage = (end - pos < 21) ? end : pos + 21;
245 
246  while (pos < maxCleavage) {
247  unsigned int aRun = 0;
248  for (++pos; pos < maxCleavage && aRun < 3; ++pos) {
249  if (*pos == 'A') {
250  ++aRun;
251  } else {
252  aRun = 0;
253  }
254  }
255 
256  Iterator cleavageSite = pos - aRun;
257 
258  //now let's look for poly-adenylated tail..
259  unsigned int numA = 0, numOther = 0;
260  for (Iterator p = cleavageSite; p < end; ++p) {
261  if (*p == 'A') {
262  ++numA;
263  } else {
264  ++numOther;
265  }
266  }
267 
268  for(Iterator p = end - 1;
269  p >= cleavageSite && TSeqPos(end - p) <= max_following_bases+1;
270  --p){
271  if (numOther + numA > 0 &&
272  ((double) numA / (numA+numOther)) > 0.95) {
273  while(*p != 'A')
274  --p;
275  return TSeqRange(cleavageSite - begin, p - begin);
276  }
277  if (*p == 'A') {
278  --numA;
279  } else {
280  --numOther;
281  }
282  }
283  }
284  }
285  }
286 
287  return TSeqRange();
288 }
289 
290 ///////////////////////////////////////////////////////////////////////////////
291 // PRE : two random access iterators pointing to sequence data [begin,
292 // end)
293 // POST: cleavageSite (if any) and whether we found a poly-A tail, a poly-T
294 // head, or neither
295 template<typename Iterator>
296 EPolyTail
297 FindPolyTail(Iterator begin, Iterator end,
298  TSignedSeqPos &cleavageSite,
299  TSeqPos min_length)
300 {
301  TSeqRange tail;
302  EPolyTail type = FindPolyTail(begin, end, tail, min_length);
303  if(type == ePolyTail_A3)
304  cleavageSite = tail.GetFrom();
305  else if(type == ePolyTail_T5)
306  cleavageSite = tail.GetTo();
307  return type;
308 }
309 
310 ///////////////////////////////////////////////////////////////////////////////
311 /// PRE : two random access iterators pointing to sequence data [begin,
312 /// end); minimum length for tail
313 /// POST: poly-tail range (if any) and whether we found a poly-A tail, a poly-T
314 /// head, or neither
315 template <typename Iterator>
316 EPolyTail
317 FindPolyTail(Iterator begin, Iterator end,
318  TSeqRange &tail_result,
319  TSeqPos min_length,
320  TSeqPos max_following_bases)
321 {
322  TSeqRange tail = FindPolyARange(begin, end, max_following_bases);
323  if (tail.GetLength() >= min_length) {
324  tail_result = tail;
325  return ePolyTail_A3;
326  } else {
328  CRevComp_It<Iterator>(begin),
329  max_following_bases);
330 
331  if (tail.GetLength() >= min_length) {
332  int seqLen = end - begin;
333  tail_result.Set(seqLen - 1 - tail.GetTo(),
334  seqLen - 1 - tail.GetFrom());
335  return ePolyTail_T5;
336  }
337  }
338 
339  return ePolyTail_None;
340 }
341 
343 
344 #endif /*ALGO_SEQUENCE___POLYA__HPP*/
Implementation [in header because of templates].
Definition: polya.hpp:92
bool operator!=(const CRevComp_It &it) const
Definition: polya.hpp:165
bool operator<(const CRevComp_It &it) const
Definition: polya.hpp:159
CRevComp_It & operator-=(int i)
Definition: polya.hpp:131
bool operator==(const CRevComp_It &it) const
Definition: polya.hpp:162
CRevComp_It operator++(int)
Definition: polya.hpp:113
bool operator>=(const CRevComp_It &it) const
Definition: polya.hpp:150
CRevComp_It operator--(int)
Definition: polya.hpp:122
CRevComp_It(void)
Definition: polya.hpp:94
CRevComp_It(const Iterator &it)
Definition: polya.hpp:95
CRevComp_It operator+(int i) const
Definition: polya.hpp:135
bool operator>(const CRevComp_It &it) const
Definition: polya.hpp:153
CRevComp_It & operator+=(int i)
Definition: polya.hpp:127
Iterator m_Base
Definition: polya.hpp:169
CRevComp_It & operator--(void)
Definition: polya.hpp:118
CRevComp_It operator-(int i) const
Definition: polya.hpp:140
char operator*(void) const
Definition: polya.hpp:99
bool operator<=(const CRevComp_It &it) const
Definition: polya.hpp:156
CRevComp_It & operator++(void)
Definition: polya.hpp:109
Include a standard set of the NCBI C++ Toolkit most basic headers.
static int type
Definition: getdata.c:31
static char tmp[3200]
Definition: utf8.c:42
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
position_type GetLength(void) const
Definition: range.hpp:158
TThisType & Set(position_type from, position_type to)
Definition: range.hpp:188
bool Empty(void) const
Definition: range.hpp:148
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
Definition: range.hpp:419
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
int i
TSeqRange FindPolyARange(Iterator begin, Iterator end, TSeqPos max_following_bases)
PRE : two random access iterators pointing to sequence data [begin, end); maximum number of non-A bas...
Definition: polya.hpp:222
TSignedSeqPos FindPolyA(Iterator begin, Iterator end)
PRE : two random access iterators pointing to sequence data [begin, end) POST: poly-A tail cleavage s...
Definition: polya.hpp:211
EPolyTail
Definition: polya.hpp:40
@ ePolyTail_None
Definition: polya.hpp:41
@ ePolyTail_A3
Definition: polya.hpp:42
@ ePolyTail_T5
Definition: polya.hpp:43
EPolyTail FindPolyTail(Iterator begin, Iterator end, TSignedSeqPos &cleavageSite, TSeqPos min_length=1)
PRE : two random access iterators pointing to sequence data [begin, end); minimum length for tail POS...
Definition: polya.hpp:297
ForwardIterator1 ItrSearch(ForwardIterator1 first1, ForwardIterator1 last1, ForwardIterator2 first2, ForwardIterator2 last2)
Definition: polya.hpp:178
Definition: type.c:6
Modified on Fri Sep 20 14:58:03 2024 by modify_doxy.py rev. 669887