NCBI C++ ToolKit
adapter_search.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: adapter_search.cpp 60914 2013-12-12 15:38:06Z astashya $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  */
27 
28 #include <ncbi_pch.hpp>
29 #include <corelib/ncbidbg.hpp>
30 #include <corelib/ncbimisc.hpp>
32 
33 #include <cmath>
34 #include <algorithm>
35 #include <functional>
36 #include <vector>
37 #include <set>
38 #include <queue>
39 
41 
43  TWord w,
44  size_t mer_size)
45 {
46  string s("");
47  s.resize(mer_size);
48  static const char* alphabet = "ACGT";
49  for(size_t i = 0; i < mer_size; i++) {
50  s[mer_size - 1 - i] = alphabet[w & 3];
51  w >>= 2;
52  }
53  return s;
54 }
55 
57  const TWords& words,
58  size_t mer_size)
59 {
60  if(words.size() == 0) {
61  return "";
62  }
63 
64  string iupac;
65  iupac.resize(words.size() - 1);
66  static const char* alphabet = "ACGT";
67  for(size_t i = 0; i < words.size() - 1; i++) {
68  iupac[i] = alphabet[words[i] >> (mer_size * 2 - 2)];
69  }
70  iupac += s_AsIUPAC(words.back(), mer_size);
71  return iupac;
72 }
73 
74 
76 {
77  vector<Uint1> counts(64, 0);
78  Uint8 w2 = word | (word << (MER_LENGTH * 2));
79  for(size_t i = 0; i < MER_LENGTH; i++) {
80  counts[w2 & 0x3F] += 1; //counting trinucleotides (6 bits each)
81  w2 >>= 2;
82  }
83  size_t sumsq(0);
84  for(size_t i = 0; i < 64; i++) {
85  size_t c = counts[i];
86  sumsq += c*c;
87  }
88  return (144 - sumsq)/132.0; //normalize to [0..1]
89 }
90 
91 
92 
94  const char* const seq,
95  size_t seq_size,
96  bool revcomp, TWords& words)
97 {
98  if(seq_size < MER_LENGTH) {
99  words.resize(0);
100  return;
101  } else {
102  words.resize(seq_size - (MER_LENGTH - 1));
103  }
104 
105  //(A,C,G,T,*) -> (0, 1, 2, 3, 0)
106  static const Uint1 char2letter[] = {
107  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
108  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
109  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
110  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
111  0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
112  0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
113  0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
114  0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
115  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
116  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
117  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
118  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
119  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
120  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
121  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
122  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
123  };
124 
125  // mask of set bits; spanning MER_LENGTH-1 letters
126  TWord mask = (1 << ((MER_LENGTH - 1) * 2)) - 1;
127 
128  if(!revcomp) {
129  TWord w0 = 0;
130  for(size_t i = 0; i < MER_LENGTH; i++) {
131  Uint1 c = char2letter[static_cast<int>(seq[i])];
132  w0 = (w0 << 2) | c;
133  }
134  words[0] = w0;
135 
136  int k = MER_LENGTH - 1;
137  for(size_t i = 1; i < words.size(); i++) {
138  Uint1 c = char2letter[static_cast<int>(seq[i + k])];
139  words[i] = ((words[i - 1] & mask) << 2) | c;
140  }
141  } else {
142  TWord w0 = 0;
143  for(size_t i = 0; i < MER_LENGTH; i++) {
144  Uint1 c = 3 - char2letter[static_cast<int>(seq[seq_size - 1 - i])];
145  w0 = (w0 << 2) | c;
146  }
147  words[0] = w0;
148 
149  int k = seq_size - MER_LENGTH;
150  for(size_t i = 1; i < words.size(); i++) {
151  Uint1 c = 3 - char2letter[static_cast<int>(seq[k - i])];
152  words[i] = ((words[i - 1] & mask) << 2) | c;
153  }
154  }
155 }
156 
157 
158 void NAdapterSearch
159  ::CPairedEndAdapterDetector
160  ::CConsensusPattern
161  ::AddExemplar(
162  TWords::const_iterator begin,
163  TWords::const_iterator end)
164 {
165  size_t words_size = end - begin;
166  size_t len = min(words_size, m_len);
167 
168  // Store position-specific 10-mer counts;
169  // (10-mer is in upper bits of 12-mer)
170  for(size_t i = 0; i < len; i++) {
171  x_IncrCount(i, *(begin + i) >> 4);
172  }
173 
174  // Can get two more 10-mers out of last 12-mer
175  if(words_size > 0 && words_size < m_len) {
176  size_t len = min<size_t>(2, m_len - words_size);
177  TWord w = *(begin + words_size);
178  for(size_t i = 0; i < len; i++) {
179  size_t pos = words_size + i;
180  TWord w2 = (w >> (2 - i*2)) & 0xFFFFF;
181  x_IncrCount(pos, w2);
182  }
183  }
184 }
185 
186 
187 NAdapterSearch::TWord NAdapterSearch
188  ::CPairedEndAdapterDetector
189  ::CConsensusPattern
190  ::x_NextWord(size_t pos, TWord prev_word) const
191 {
192  TWord best_word = 0;
193  size_t best_count = 0;
194  for(Uint1 letter = 0; letter < 4; letter++) {
195  TWord word = ((prev_word << 2) & 0xFFFFF) | letter;
196  TCount count = x_GetCount(pos + 1, word);
197  if(count > best_count) {
198  best_count = count;
199  best_word = word;
200  }
201  }
202  return best_word;
203 }
204 
205 
206 string NAdapterSearch
207  ::CPairedEndAdapterDetector
208  ::CConsensusPattern
209  ::InferConsensus(const SParams& params) const
210 {
211  // Find the best (most-frequent) and 2nd-best word at position 0
212  size_t top_sup(0);
213  size_t secondbest_sup(0);
214  size_t best_word(0);
215  for(size_t word = 0; word < NMERS10; word++) {
216  size_t sup = m_counts[word];
217  if(sup > top_sup && !NAdapterSearch::s_IsLowComplexity(word)) {
218  secondbest_sup = top_sup;
219  top_sup = sup;
220  best_word = word;
221  }
222  }
223 
224  if(!params.HaveInitialSupport(top_sup, secondbest_sup)) {
225  return "";
226  }
227 
228  TWords words(m_len);
229  words[0] = best_word;
230 
231  ERR_POST(Info
232  << "Seed: "
233  << s_AsIUPAC(best_word,10)
234  << "; overrepresentation: "
235  << top_sup
236  << "/"
237  << secondbest_sup);
238 
239  // Extend the sequence from the starting word
240  for(size_t i = 1; i < words.size(); i++) {
241  TWord last_word = words[i - 1];
242  TCount last_sup = x_GetCount(i - 1, last_word);
243 
244  TWord curr_word = x_NextWord(i - 1, last_word);
245  TCount curr_sup = x_GetCount(i, curr_word);
246 
247  if(params.HaveContinuedSupport(top_sup, last_sup, curr_sup)) {
248  words[i] = curr_word;
249  } else {
250  words.resize(i);
251  break;
252  }
253  }
254 
255  return s_AsIUPAC(words, 10);
256 }
257 
258 ///////////////////////////////////////////////////////////////////////
259 
260 pair<size_t, size_t> NAdapterSearch
261  ::CPairedEndAdapterDetector
262  ::s_FindAdapterStartPos(
263  const TWords& fwd_read,
264  const TWords& rev_read)
265 {
266  return pair<size_t, size_t>(
267  find(fwd_read.begin(), fwd_read.end(), rev_read.back())
268  - fwd_read.begin() + MER_LENGTH,
269 
270  find(rev_read.begin(), rev_read.end(), fwd_read.front())
271  - rev_read.begin());
272 }
273 
274 
275 void NAdapterSearch
276  ::CPairedEndAdapterDetector
277  ::AddExemplar(
278  const char* spot,
279  size_t spot_len)
280 {
281  size_t len = spot_len / 2;
282  const char* fwd_read = spot;
283  const char* rev_read = spot+len;
284 
285  TWords fwd_words;
286  TWords rev_words;
287  NAdapterSearch::s_Translate(fwd_read, len, false, fwd_words);
288  NAdapterSearch::s_Translate(rev_read, len, true, rev_words);
289  // note: translating in rev_read reverse-orientation,
290  // such that translations are in consistent orientation
291  // so we can check for overlap
292 
293  pair<size_t, size_t> pos = s_FindAdapterStartPos(fwd_words, rev_words);
294 
295  bool is_consistent = (pos.first + pos.second == len);
296  // this is a likely true-positive rather than a random match
297 
298  size_t adapter_len = len - pos.first; // remaining portion of the read
299  if(is_consistent && adapter_len >= MER_LENGTH) {
300  m_cons5.AddExemplar(fwd_words.begin() + pos.first, fwd_words.end());
301 
302  // We translated R-read in rev-comp above for the purpose of
303  // detecting self-overlap within a spot, but here we'll need to do it
304  // again without rev-comp for the purpose of consensus-finding.
305  // We don't need to translate the whole read, just the adapter part.
306  TWords words2;
307  s_Translate(rev_read + pos.first, adapter_len, false, words2);
308  m_cons3.AddExemplar(words2.begin(), words2.end());
309  }
310 }
311 
312 ///////////////////////////////////////////////////////////////////////////
313 
315  const char* seq,
316  size_t len)
317 {
318  TWords words;
319  s_Translate(seq, len, false, words);
320  ITERATE(TWords, it, words) {
321  m_counts[*it] += 1;
322  }
323 }
324 
325 
327 {
328  TWord seed = x_FindAdapterSeed();
329  if(!seed) {
330  return "";
331  }
332 
333  // subsequent words will need to satisfy minimum support
334  // relative to the originating seed.
335  size_t top_count = m_counts[seed];
336 
337  // Get sequence of overlapping words starting at the seed
338  TWords words;
339  words.push_back(seed);
340  x_ExtendSeed(words, top_count, false); // extend left
341  reverse(words.begin(), words.end()); // fix the order left->right
342  x_ExtendSeed(words, top_count, true); // extend right
343 
344  // The non-biological sequence following a read is often a homopolymer
345  // (usually A+), so statistically it looks like part of the adapter.
346  // We'll truncate the homopolymer suffix if last word is a homopolymer
347 
348  string seq = s_AsIUPAC(words);
349  if(NAdapterSearch::s_IsHomopolymer(words.back())) {
350  char c = seq[seq.size() - 1];
351  while(seq.size() > 0 && seq[seq.size() - 1] == c) {
352  seq.resize(seq.size() - 1);
353  words.pop_back();
354  }
355  }
356  return seq;
357 }
358 
359 
360 NAdapterSearch::TWord NAdapterSearch
361  ::CUnpairedAdapterDetector
362  ::x_FindAdapterSeed() const
363 {
364  typedef pair<TCount, TWord> TPair;
365  typedef priority_queue< TPair, vector<TPair>, greater<TPair> > TTopWords;
366  TTopWords top_words;
367 
368  // Collect most frequent words. The count must be large enough such that
369  // that most of these words are biological (not read-specific). The idea
370  // is to check whether most frequent word (potentially adapter-specific)
371  // is overrepresented relative to biological seqs
372  for(TWord w = 0; w < m_counts.size(); w++) {
373  Uint8 count = m_counts[w];
374  if(count > m_params.min_support && !NAdapterSearch::s_IsLowComplexity(w)) {
375  top_words.push(TPair(count, w));
376  while(top_words.size() > m_params.top_n) {
377  top_words.pop();
378  }
379  }
380  }
381 
382  // Calculate geometric mean of counts
383  double sumlogs(0);
384  TWord word(0); // last word and occ will be the most frequent one
385  TCount sup(0);
386  size_t n = top_words.size();
387  for(; !top_words.empty(); top_words.pop()) {
388  sup = top_words.top().first;
389  word = top_words.top().second;
390  sumlogs += log(sup + 1.0);
391  }
392 
393  size_t gmean = n == 0 ? 0 : exp(sumlogs / n) - 1;
394 
395  ERR_POST(Info
396  << "Seed: "
397  << s_AsIUPAC(word)
398  << "; overrepresentation: "
399  << sup
400  << "/"
401  << static_cast<size_t>(gmean));
402 
403  return m_params.HaveInitialSupport(sup, gmean) ? word : 0;
404 }
405 
406 
408  TWord w,
409  bool right) const
410 {
411  TCount best_count(0);
412  TWord best_word(0);
413 
414  for(Uint1 letter = 0; letter < 4; letter++) {
415  TWord w2 = right ? ((w << 2) & 0xFFFFFF) | letter
416  : (w >> 2) | (letter << 22);
417 
418  TCount count = m_counts[w2];
419  if(count > best_count) {
420  best_count = count;
421  best_word = w2;
422  }
423  }
424  return best_word;
425 }
426 
427 
429  TWords& words,
430  size_t top_sup,
431  bool right) const
432 {
433  while(true) {
434  TWord prev_word = words.back();
435  TWord curr_word = x_GetAdjacent(prev_word, right);
436  TCount prev_sup = m_counts[prev_word];
437  TCount curr_sup = m_counts[curr_word];
438 
439  if(!m_params.HaveContinuedSupport(top_sup, prev_sup, curr_sup)
440  || find(words.begin(), words.end(), curr_word) != words.end())
441  {
442  // Stop extending if support is too low, or
443  // this word is already in the sequence (to prevent loops)
444  break;
445  } else {
446  words.push_back(curr_word);
447  }
448  }
449 }
450 
451 
452 ///////////////////////////////////////////////////////////////////////
453 
455 {
458  {
459  return a.second < b.second;
460  }
461 };
462 
464  NAdapterSearch
465 ::CSimpleUngappedAligner
466 ::GetSeqRange(TPos pos) const
467 {
468  TRanges::const_iterator it = lower_bound(
469  m_subseq_ranges.begin(),
470  m_subseq_ranges.end(),
471  TRange(pos, pos),
472  SOpLess_Second());
473 
474  return it == m_subseq_ranges.end() ? TRange(NULLPOS, NULLPOS) : *it;
475 }
476 
478  NAdapterSearch
479 ::CSimpleUngappedAligner
480 ::x_GetBetterOf(const SMatch& a, const SMatch& b) const
481 {
482  if(a.len > 0 && b.len == 0) {
483  return a;
484  } else if(a.len == 0 && b.len > 0) {
485  return b;
486  } else {
487  TRange ar = GetSeqRange((a.second - a.first) / 2);
488  TRange br = GetSeqRange((b.second - b.first) / 2);
489 
490  // calculate unaligned length
491  // note ::len is doubled on antidiag
492  TPos a_un = (ar.second - ar.first) - a.len/2;
493  TPos b_un = (br.second - br.first) - b.len/2;
494 
495 #if 0
496  LOG_POST((a.second + a.first) / 2
497  << " " << (a.second - a.first) / 2
498  << " " << a.len
499  << " " << a_un
500  << " " << m_seq.substr(ar.first, ar.second - ar.first));
501  LOG_POST((b.second + b.first) / 2
502  << " " << (b.second - b.first) / 2
503  << " " << b.len
504  << " " << b_un
505  << " " << m_seq.substr(br.first, br.second - br.first));
506  LOG_POST("\n");
507 #endif
508 
509  // score is proportional to the length, and penalized
510  // proportonally to the log of unaligned length
511  double a_score = a.len - log(1.0 + a_un)*5;
512  double b_score = b.len - log(1.0 + b_un)*5;
513  return a_score < b_score ? b : a;
514  }
515 }
516 
517 
519  TWord word,
520  TPos pos,
521  TPositions& vec_index,
522  TCoordSet& coordset)
523 {
524  TWords approx_words;
525  s_PermuteMismatches(word, approx_words);
526 
527  ITERATE(TWords, it2, approx_words) {
528  TWord w = *it2;
529 
530  // store word->pos in vec-index;
531  // if encountered multiplicity, move
532  // the values to coordset and mark as
533  // MULTIPOS in vec-index
534 
535  TPos& pos_ref = vec_index[w];
536  if(pos_ref == pos || pos_ref == NULLPOS) {
537  pos_ref = pos;
538  } else {
539  if(pos_ref != MULTIPOS) {
540  coordset.insert(TCoordSet::value_type(w, pos_ref));
541  pos_ref = MULTIPOS;
542  }
543  coordset.insert(TCoordSet::value_type(w, pos));
544  }
545  }
546 }
547 
548 
550  const TCoordSet& coordset,
551  TMapIndex& map_index,
552  TPositions& positions)
553 {
554  map_index.clear();
555  positions.resize(coordset.size());
556  TPositions::iterator dest = positions.begin();
557 
558  ITERATE(TCoordSet, it, coordset) {
559  TWord w = it->first;
560  TPos pos = it->second;
561 
562  *dest = pos;
563  if(map_index.find(w) == map_index.end()) {
564  map_index[w].first = dest;
565  }
566  dest++;
567  map_index[w].second = dest;
568  }
569 }
570 
571 
573 {
574  m_seq.resize(len);
575  m_seq.assign(seq, len);
576 
577  m_subseq_ranges.clear();
578 
579  m_vec_index.resize(1 << (MER_LENGTH*2));
580  fill(m_vec_index.begin(), m_vec_index.end(), (TPos)NULLPOS);
581 
582  m_positions.clear();
583  m_map_index.clear();
584 
585  // process each '-'-delimited substring.
586  TCoordSet coordset;
587  const char* endend = seq+len;
588  for(const char *begin = seq, *end = find(begin, endend, '-');
589  begin < endend;
590  begin = end + 1, end = find(begin, endend, '-'))
591  {
592  TRange r(begin - seq, end - seq);
593  m_subseq_ranges.push_back(r);
594 
595  TWords words;
596  s_Translate(begin, r.second - r.first, false, words);
597 
598  for(size_t i = 0; i < words.size(); i++) {
599  TPos pos = r.first + i;
600  s_IndexWord(words[i], pos, m_vec_index, coordset);
601  }
602  }
603 
604  s_CoordSetToMapIndex(coordset, m_map_index, m_positions);
605 }
606 
607 
609  TWord w,
610  TWords& words)
611 {
612  words.resize(240); // nCr(6,2)*4^2
613  TWords::iterator dest = words.begin();
614  for(size_t i1 = 3; i1 < 9; i1++) {
615 
616  for(Uint1 c1 = 0; c1 < 4; c1++) {
617  TWord w1 = s_Put(w, i1, c1);
618 
619  for(size_t i2 = i1+1; i2 < 9; i2++) {
620 
621  for(Uint1 c2 = 0; c2 < 4; c2++) {
622  *(dest++) = s_Put(w1, i2, c2);
623  }
624  }
625  }
626  }
627 }
628 
629 
631  SMatch& m1,
632  const SMatch& m2)
633 {
634  if(m1.first == NULLPOS) {
635  m1 = m2;
636  return true;
637  } else if(m1.first == m2.first
638  && m1.second + m1.len + 2 >= m2.second)
639  {
640  // on same diag and [almost-]overlapping on antidiag
641  // - extend the length of m1 to the end of m2
642  m1.len = m2.len + m2.second - m1.second;
643  return true;
644  } else {
645  return false;
646  }
647 }
648 
649 
652  TPos q_pos,
653  TPos s_pos) const
654 {
655  SMatch m;
656  m.first = q_pos - s_pos;
657  m.second = q_pos + s_pos;
658  m.len = MER_LENGTH * 2; // lengths are doubled on antidiag
659  return m;
660 }
661 
662 
664  SMatch& seg,
665  const char* q_seq,
666  const size_t query_len,
667  const bool direction, //true iff forward
668  const int match_score,
669  const int mismatch_score,
670  const int dropoff_score) const
671 {
672  int delta_pos = direction ? 1 : -1;
673 
674  // Create a point at the first or last position of
675  // the initial segment, depending on direction
676  typedef pair<TPos, TPos> TPoint;
677  TPoint p(seg.first + (direction ? seg.len - 1 : 0),
678  seg.second + (direction ? seg.len - 1 : 0));
679 
680  Int8 score(0), max_score(0);
681  TPoint best_p = p;
682 
683  // bounds are [min, max)
684  TRange q_bounds(0, query_len);
685  TRange s_bounds = GetSeqRange(seg.second);
686 
687  // the initial point is apriori matched, so advance
688  p.first += delta_pos;
689  p.second += delta_pos;
690  while(score + dropoff_score > max_score
691  && p.first >= q_bounds.first
692  && p.first < q_bounds.second
693  && p.second >= s_bounds.first
694  && p.second < s_bounds.second)
695  {
696  score += (q_seq[p.first] == m_seq[p.second]) ? match_score
697  : mismatch_score;
698 #if 0
699  LOG_POST(direction
700  << " " << q_seq[p.first]
701  << " " << m_seq[p.second]
702  << " " << score
703  << " " << best_p.first);
704 #endif
705 
706  if(score > max_score) {
707  max_score = score;
708  best_p = p;
709  }
710 
711  p.first += delta_pos;
712  p.second += delta_pos;
713  }
714 
715  if(direction) {
716  // starting coordinates are the same; length increased
717  seg.len = best_p.first - seg.first + 1;
718  } else {
719  // starting coordinates are at best_p;
720  // length is distance to the starting coords + original length
721  SMatch seg2;
722  seg2.first = best_p.first;
723  seg2.second = best_p.second;
724  seg2.len = seg.len + seg.first - best_p.first;
725  seg = seg2;
726  }
727 }
728 
729 
732  const char* query,
733  size_t len) const
734 {
735  SMatch best_match;
736 
737  typedef map<TPos, SMatch> TMatches;
738  TMatches matches; //diag -> match
739 
740  TWords words;
741  s_Translate(query, len, false, words);
742 
743  for(size_t i = 0; i < words.size(); i++) {
744  TWord w = words[i];
745 
746  TPosRange r(m_vec_index.begin() + w,
747  m_vec_index.begin() + w + 1);
748 
749  if(*r.first == NULLPOS) {
750  continue;
751  } else if(*r.first == MULTIPOS) {
752  r = m_map_index.find(w)->second;
753  }
754 
755  // For each matched position in db for this word
756  // create a Match and try to merge it with existing
757  // overlapping Match on this diag, if any. If can't merge,
758  // we'll never be able to merge anything into the old mMatch
759  // because following matches on this diag will be even
760  // farther on the antidiag, so we can drop the old match
761  // (but keep it it is best so far), and replace it with the
762  // new one.
763  for(TPositions::const_iterator it = r.first; it != r.second; ++it) {
764  SMatch new_match = x_CreateMatch(i, *it);
765  SMatch& old_match = matches[new_match.first];
766 
767  if(!s_Merge(old_match, new_match)) {
768  best_match = x_GetBetterOf(best_match, old_match);
769  old_match = new_match;
770  }
771  }
772  }
773 
774  ITERATE(TMatches, it, matches) {
775  best_match = x_GetBetterOf(best_match, it->second);
776  }
777 
778  // Convert (diag,antidiag) representation to (query,target)
779  SMatch match;
780  match.len = best_match.len / 2;
781  match.first = (best_match.first + best_match.second) / 2;
782  match.second = best_match.second - match.first;
783 
784  // Do ungapped extension both ways.
785  x_Extend(match, query, len, true);
786  x_Extend(match, query, len, false);
787 
788  return match;
789 }
790 
LargeInt< 1 > revcomp(const LargeInt< 1 > &x, size_t sizeKmer)
Definition: LargeInt1.hpp:148
CLocalRange< TOffset > TRange
define for the fundamental building block of sequence ranges
Definition: base.hpp:115
ncbi::TMaskedQueryRegions mask
static void s_CoordSetToMapIndex(const TCoordSet &coordset, TMapIndex &map_index, TPositions &positions)
static void s_PermuteMismatches(TWord w, TWords &words)
SMatch FindSingleBest(const char *query, size_t len) const
Return best ungapped alignment of at least MER_LENGTH.
pair< TPositions::const_iterator, TPositions::const_iterator > TPosRange
void Init(const char *seq, size_t len)
IUPAC sequence of target sequence(s) Multiple sequences may be concatenated and '-'-delimited e....
SMatch x_CreateMatch(TPos q_start, TPos s_start) const
Create a match in diag/antidiag coordspace for a matched word.
static bool s_Merge(SMatch &m, const SMatch &m2)
void x_Extend(SMatch &match, const char *query_seq, const size_t query_len, const bool direction, const int match_score=3, const int mismatch_score=-2, const int dropoff=5) const
Ungapped extension (match is in normal coordinates)
static void s_IndexWord(TWord w, TPos pos, TPositions &vec_index, TCoordSet &coordset)
TWord x_GetAdjacent(TWord w, bool right) const
if right, return most frequent word whose 11-prefix = 11-suffix of w; otherwise, return most frequent...
virtual void AddExemplar(const char *seq, size_t len)
Add sequence of a spot from an SRA run (single or paired-end read)
virtual string InferAdapterSeq() const
returns IUPAC seq of the inferred adapter seq; empty if not found.
void x_ExtendSeed(TWords &words, size_t top_count, bool right) const
Extend the seed one way, as long as it satisfies extension thresholds.
static bool s_IsLowComplexity(TWord word)
With threshold of 0.9, 0.7% of all words are classified as such.
static double s_GetWordComplexity(TWord word)
Complexity is a value between 0 (for a homopolymer) and 1 (max complexity).
vector< TWord > TWords
static const size_t MER_LENGTH
Will use 24-bit 12-mers as words.
static string s_AsIUPAC(TWord w, size_t mer_size=MER_LENGTH)
Convert word to IUPAC string [ACGT]{MER_LENGTH}.
static void s_Translate(const char *const iupac_na, size_t len, bool apply_revcomp, TWords &words)
Translate IUPAC sequence to a sequence of overlapping words (A,C,G,T,*) <-> (0,1,2,...
static bool s_IsHomopolymer(TWord w)
const_iterator end() const
Definition: map.hpp:152
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
size_type size() const
Definition: set.hpp:132
parent_type::value_type value_type
Definition: set.hpp:78
SStaticPair< const char *, const char * > TPair
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST(message)
This macro is deprecated and it's strongly recomended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
int i
yy_size_t n
int len
void s_Merge(SExpression &l, SExpression &r)
unsigned int a
Definition: ncbi_localip.c:102
NCBI C++ auxiliary debug macros.
Miscellaneous common-use basic types and functionality.
T min(T x_, T y_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static int match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
Definition: pcre2_match.c:594
#define count
USING_NCBI_SCOPE
static SLJIT_INLINE sljit_ins br(sljit_gpr target)
Represents single ungapped alignment.
An adapter sequence is presumed to occur at least min_support times in the input, and is overrepresen...
bool HaveContinuedSupport(size_t top_sup, size_t prev_sup, size_t candidate_sup) const
bool HaveInitialSupport(size_t top_candidate_sup, size_t non_candidate_sup) const
bool operator()(const NAdapterSearch::CSimpleUngappedAligner::TRange &a, const NAdapterSearch::CSimpleUngappedAligner::TRange &b) const
static string query
static int seed
Definition: test_table.cpp:132
static Uint4 letter(char c)
Modified on Fri Sep 20 14:57:35 2024 by modify_doxy.py rev. 669887