CPP_DOC/doxyhtml/sequence_2adapter__search_8cpp_source.html

 /*  $Id: adapter_search.cpp 60914 2013-12-12 15:38:06Z astashya $

  * ===========================================================================

  *

  *                            PUBLIC DOMAIN NOTICE

  *               National Center for Biotechnology Information

  *

  *  This software/database is a "United States Government Work" under the

  *  terms of the United States Copyright Act.  It was written as part of

  *  the author's official duties as a United States Government employee and

  *  thus cannot be copyrighted.  This software/database is freely available

  *  to the public for use. The National Library of Medicine and the U.S.

  *  Government have not placed any restriction on its use or reproduction.

  *

  *  Although all reasonable efforts have been taken to ensure the accuracy

  *  and reliability of the software and data, the NLM and the U.S.

  *  Government do not and cannot warrant the performance or results that

  *  may be obtained by using this software or data. The NLM and the U.S.

  *  Government disclaim all warranties, express or implied, including

  *  warranties of performance, merchantability or fitness for any particular

  *  purpose.

  *

  *  Please cite the author in any work or product based on this material.

  *

  * ===========================================================================

  *

  */


 #include <ncbi_pch.hpp>

 #include <corelib/ncbidbg.hpp>

 #include <corelib/ncbimisc.hpp>

 #include <algo/sequence/adapter_search.hpp>


 #include <cmath>

 #include <algorithm>

 #include <functional>

 #include <vector>

 #include <set>

 #include <queue>


 USING_NCBI_SCOPE;


 // Decode a packed word (two bits per letter; 0,1,2,3 <-> A,C,G,T) into
 // a string of mer_size letters.
 //
 // The lowest two bits of w hold the last letter, so the output is filled
 // from its end towards its beginning while w is consumed two bits at a time.
 string NAdapterSearch::s_AsIUPAC(
         TWord w,
         size_t mer_size)
 {
     static const char* const kLetters = "ACGT";

     string decoded(mer_size, '\0');
     for(size_t remaining = mer_size; remaining > 0; --remaining) {
         decoded[remaining - 1] = kLetters[w & 3];
         w >>= 2;
     }
     return decoded;
 }


 // Decode a sequence of words into one string.
 //
 // Every word except the last contributes only its leading letter (the two
 // bits just below bit mer_size*2); the last word is decoded in full via the
 // single-word overload. An empty input yields an empty string.
 string NAdapterSearch::s_AsIUPAC(
         const TWords& words,
         size_t mer_size)
 {
     if(words.empty()) {
         return "";
     }

     static const char* const kLetters = "ACGT";
     const size_t n_leading = words.size() - 1;

     string iupac;
     for(size_t k = 0; k < n_leading; k++) {
         iupac += kLetters[words[k] >> (mer_size * 2 - 2)];
     }
     return iupac + s_AsIUPAC(words.back(), mer_size);
 }


 // Complexity score of a word, derived from how evenly its 6-bit
 // (trinucleotide) windows are distributed.
 //
 // One window is sampled per letter offset (MER_LENGTH windows in total);
 // the sum of squared window counts is then mapped so that a lower sum
 // gives a higher score.
 double NAdapterSearch::s_GetWordComplexity(TWord word)
 {
     // OR the word with itself shifted up by MER_LENGTH letters so that the
     // windows sampled near the top continue into the word's low letters.
     Uint8 doubled = word | (word << (MER_LENGTH * 2));

     vector<Uint1> histogram(64, 0);
     for(size_t offset = 0; offset < MER_LENGTH; offset++) {
         histogram[(doubled >> (offset * 2)) & 0x3F] += 1;
     }

     size_t sum_of_squares = 0;
     for(vector<Uint1>::const_iterator it = histogram.begin();
         it != histogram.end();
         ++it)
     {
         const size_t c = *it;
         sum_of_squares += c*c;
     }

     // normalize to [0..1]; 144 and 132 are presumably 12*12 and 12*12 - 12,
     // i.e. the extremes of the sum for MER_LENGTH == 12
     return (144 - sum_of_squares)/132.0;
 }


 // Translate a nucleotide sequence into overlapping MER_LENGTH-letter words,
 // packed two bits per letter.
 //
 //   seq       sequence characters (length given by seq_size)
 //   seq_size  number of characters in seq
 //   revcomp   if false, words[i] covers seq[i .. i+MER_LENGTH-1];
 //             if true, the reverse complement is translated instead, so
 //             words[0] is built from the last MER_LENGTH characters
 //   words     output; resized to seq_size - (MER_LENGTH - 1) entries, or
 //             emptied when seq_size < MER_LENGTH
 //
 // A/C/G/T in either case map to 0/1/2/3; every other byte maps to 0
 // (and hence to 3 after complementing).
 void NAdapterSearch::s_Translate(
         const char* const seq,
         size_t seq_size,
         bool revcomp, TWords& words)
 {
     if(seq_size < MER_LENGTH) {
         words.resize(0);
         return;
     }
     words.resize(seq_size - (MER_LENGTH - 1));

     //(A,C,G,T,*) -> (0, 1, 2, 3, 0)
     static const Uint1 char2letter[] = {
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
     };

     // mask of set bits; spanning MER_LENGTH-1 letters
     TWord mask = (1 << ((MER_LENGTH - 1) * 2)) - 1;

     // NOTE: the table is indexed through unsigned char. Plain char may be
     // signed, in which case bytes >= 0x80 would produce a negative index
     // and read outside the 256-entry table.

     if(!revcomp) {
         // First word: the leading MER_LENGTH letters, first letter highest
         TWord w0 = 0;
         for(size_t i = 0; i < MER_LENGTH; i++) {
             Uint1 c = char2letter[static_cast<unsigned char>(seq[i])];
             w0 = (w0 << 2) | c;
         }
         words[0] = w0;

         // Each later word drops the oldest letter and appends the next one
         const size_t k = MER_LENGTH - 1;
         for(size_t i = 1; i < words.size(); i++) {
             Uint1 c = char2letter[static_cast<unsigned char>(seq[i + k])];
             words[i] = ((words[i - 1] & mask) << 2) | c;
         }
     } else {
         // Walk the sequence backwards, complementing each letter (3 - code)
         TWord w0 = 0;
         for(size_t i = 0; i < MER_LENGTH; i++) {
             Uint1 c = 3 - char2letter[
                 static_cast<unsigned char>(seq[seq_size - 1 - i])];
             w0 = (w0 << 2) | c;
         }
         words[0] = w0;

         // i never exceeds words.size() - 1 == k, so k - i cannot underflow
         const size_t k = seq_size - MER_LENGTH;
         for(size_t i = 1; i < words.size(); i++) {
             Uint1 c = 3 - char2letter[static_cast<unsigned char>(seq[k - i])];
             words[i] = ((words[i - 1] & mask) << 2) | c;
         }
     }
 }


 // Accumulate position-specific 10-mer counts from a run of 12-mer words.
 //
 //   begin, end   range of words; the word at offset i is counted at
 //                position i
 //
 // At most m_len positions are counted from the words themselves. If the
 // run is shorter than m_len, up to two further 10-mers are taken from the
 // lower bits of the last word and counted at the following positions.
 void NAdapterSearch
    ::CPairedEndAdapterDetector
    ::CConsensusPattern
    ::AddExemplar(
         TWords::const_iterator begin,
         TWords::const_iterator end)
 {
     size_t words_size = end - begin;
     size_t len = min(words_size, m_len);

     // Store position-specific 10-mer counts;
     // (10-mer is in upper bits of 12-mer)
     for(size_t i = 0; i < len; i++) {
         x_IncrCount(i, *(begin + i) >> 4);
     }

     // Can get two more 10-mers out of last 12-mer
     if(words_size > 0 && words_size < m_len) {
         size_t extra = min<size_t>(2, m_len - words_size);

         // The last word is at end - 1; begin + words_size is the
         // past-the-end iterator and must not be dereferenced.
         TWord last_word = *(end - 1);

         for(size_t i = 0; i < extra; i++) {
             size_t pos = words_size + i;
             // i == 0: letters 1..10 of the last word; i == 1: letters 2..11
             TWord w2 = (last_word >> (2 - i*2)) & 0xFFFFF;
             x_IncrCount(pos, w2);
         }
     }
 }


 // Pick the best-supported successor of prev_word.
 //
 // The four candidates are prev_word shifted left by one letter (kept to
 // 20 bits) with each possible letter appended; their counts are looked up
 // at position pos + 1. Returns the candidate with the strictly highest
 // count, or 0 if none has a positive count.
 NAdapterSearch::TWord NAdapterSearch
      ::CPairedEndAdapterDetector
      ::CConsensusPattern
      ::x_NextWord(size_t pos, TWord prev_word) const
 {
     const TWord shifted = (prev_word << 2) & 0xFFFFF;

     TWord winner = 0;
     size_t winner_count = 0;
     for(Uint1 letter = 0; letter < 4; letter++) {
         const TWord candidate = shifted | letter;
         const TCount support = x_GetCount(pos + 1, candidate);
         if(support > winner_count) {
             winner_count = support;
             winner = candidate;
         }
     }
     return winner;
 }


 // Infer the consensus sequence from the accumulated 10-mer counts.
 //
 // A seed is chosen as the most frequent word that is not low-complexity
 // among m_counts[0 .. NMERS10) (the position-0 counts, per the comment
 // below), and is accepted only if params.HaveInitialSupport() approves its
 // support against the runner-up's. The seed is then extended one position
 // at a time for as long as params.HaveContinuedSupport() holds.
 //
 // Returns the sequence as a string, or "" if the seed lacks support.
 string NAdapterSearch
      ::CPairedEndAdapterDetector
      ::CConsensusPattern
      ::InferConsensus(const SParams& params) const
 {
     // Find the best (most-frequent) and 2nd-best word at position 0
     size_t top_sup(0);
     size_t secondbest_sup(0);
     size_t best_word(0);
     for(size_t word = 0; word < NMERS10; word++) {
         size_t sup = m_counts[word];

         // A word can only change the two leaders if it beats the current
         // runner-up; test that first to keep the complexity test rare.
         if(sup <= secondbest_sup
            || NAdapterSearch::s_IsLowComplexity(word))
         {
             continue;
         }

         if(sup > top_sup) {
             secondbest_sup = top_sup;
             top_sup = sup;
             best_word = word;
         } else {
             // Not a new best, but better than the current runner-up.
             // Without this branch a runner-up seen after the best word
             // would be missed and overrepresentation overstated.
             secondbest_sup = sup;
         }
     }

     if(!params.HaveInitialSupport(top_sup, secondbest_sup)) {
         return "";
     }

     TWords words(m_len);
     words[0] = best_word;

     ERR_POST(Info
           << "Seed: "
           << s_AsIUPAC(best_word,10)
           << "; overrepresentation: "
           << top_sup
           << "/"
           << secondbest_sup);

     // Extend the sequence from the starting word
     for(size_t i = 1; i < words.size(); i++) {
         TWord last_word = words[i - 1];
         TCount last_sup = x_GetCount(i - 1, last_word);

         TWord curr_word = x_NextWord(i - 1, last_word);
         TCount curr_sup = x_GetCount(i,     curr_word);

         if(params.HaveContinuedSupport(top_sup, last_sup, curr_sup)) {
             words[i] = curr_word;
         } else {
             // Support ran out: keep only the words accepted so far
             words.resize(i);
             break;
         }
     }

     return s_AsIUPAC(words, 10);
 }


 ///////////////////////////////////////////////////////////////////////


 // Locate where the two translated reads of a spot overlap each other.
 //
 // Returns a pair:
 //   first   index of the first occurrence of rev_read's last word within
 //           fwd_read, plus MER_LENGTH
 //   second  index of the first occurrence of fwd_read's first word within
 //           rev_read
 // When a word is not found, the corresponding index is the size of the
 // searched vector.
 pair<size_t, size_t> NAdapterSearch
     ::CPairedEndAdapterDetector
     ::s_FindAdapterStartPos(
         const TWords& fwd_read,
         const TWords& rev_read)
 {
     // front()/back() are undefined on an empty vector. Report "not found"
     // with the same values the searches below yield when nothing matches.
     if(fwd_read.empty() || rev_read.empty()) {
         return pair<size_t, size_t>(fwd_read.size() + MER_LENGTH,
                                     rev_read.size());
     }

     const size_t fwd_pos =
         find(fwd_read.begin(), fwd_read.end(), rev_read.back())
             - fwd_read.begin();

     const size_t rev_pos =
         find(rev_read.begin(), rev_read.end(), fwd_read.front())
             - rev_read.begin();

     return pair<size_t, size_t>(fwd_pos + MER_LENGTH, rev_pos);
 }


 // Add one paired-end spot to the 5' and 3' consensus patterns.
 //
 //   spot      the two reads concatenated; each is spot_len / 2 characters
 //             (a trailing odd character is ignored)
 //   spot_len  total number of characters in spot
 //
 // The spot contributes only if the two reads overlap consistently (see
 // is_consistent below) and the part of the read after the overlap is at
 // least MER_LENGTH long.
 void NAdapterSearch
     ::CPairedEndAdapterDetector
     ::AddExemplar(
         const char* spot,
         size_t spot_len)
 {
     size_t len = spot_len / 2;
     if(len < MER_LENGTH) {
         // Reads this short translate to no words at all; looking for an
         // overlap would call front()/back() on empty vectors.
         return;
     }

     const char* fwd_read = spot;
     const char* rev_read = spot+len;

     TWords fwd_words;
     TWords rev_words;
     NAdapterSearch::s_Translate(fwd_read, len, false, fwd_words);
     NAdapterSearch::s_Translate(rev_read, len, true,  rev_words);
         // note: translating in rev_read reverse-orientation,
         // such that translations are in consistent orientation
         // so we can check for overlap

     pair<size_t, size_t> pos = s_FindAdapterStartPos(fwd_words, rev_words);

     bool is_consistent = (pos.first + pos.second == len);
         // this is a likely true-positive rather than a random match

     // remaining portion of the read; only meaningful when is_consistent
     // holds (otherwise pos.first may exceed len and this wraps around)
     size_t adapter_len = len - pos.first;
     if(is_consistent && adapter_len >= MER_LENGTH) {
         m_cons5.AddExemplar(fwd_words.begin() + pos.first, fwd_words.end());

         // We translated R-read in rev-comp above for the purpose of
         // detecting self-overlap within a spot, but here we'll need to do it
         // again without rev-comp for the purpose of consensus-finding.
         // We don't need to translate the whole read, just the adapter part.
         TWords words2;
         s_Translate(rev_read + pos.first, adapter_len, false, words2);
         m_cons3.AddExemplar(words2.begin(), words2.end());
     }
 }


 ///////////////////////////////////////////////////////////////////////////


 void NAdapterSearch::CUnpairedAdapterDetector::AddExemplar(

         const char* seq,

         size_t len)

 {

     TWords words;

     s_Translate(seq, len, false, words);

     ITERATE(TWords, it, words) {

         m_counts[*it] += 1;

     }

 }


 // Infer the adapter sequence from the accumulated word counts.
 //
 // Starts from the seed returned by x_FindAdapterSeed(), extends it to the
 // left and then to the right, converts the word chain to a string and, if
 // the last word is a homopolymer, strips the trailing run of that letter.
 //
 // Returns "" when no seed is found (a seed value of 0 means "none").
 string NAdapterSearch::CUnpairedAdapterDetector::InferAdapterSeq() const
 {
     TWord seed = x_FindAdapterSeed();
     if(!seed) {
         return "";
     }

     // subsequent words will need to satisfy minimum support
     // relative to the originating seed.
     size_t top_count = m_counts[seed];

     // Get sequence of overlapping words starting at the seed
     TWords words;
     words.push_back(seed);
     x_ExtendSeed(words, top_count, false);  // extend left
     reverse(words.begin(), words.end());    // fix the order left->right
     x_ExtendSeed(words, top_count, true);   // extend right

     // The non-biological sequence following a read is often a homopolymer
     // (usually A+), so statistically it looks like part of the adapter.
     // We'll truncate the homopolymer suffix if last word is a homopolymer

     string seq = s_AsIUPAC(words);
     if(NAdapterSearch::s_IsHomopolymer(words.back())) {
         const char c = seq[seq.size() - 1];

         // Drop the whole trailing run of c. find_last_not_of() returns npos
         // when every character equals c; npos + 1 wraps to 0, which then
         // erases the entire string.
         //
         // Only the string is trimmed: the word list is not used past this
         // point, and popping one word per removed character could pop more
         // words than exist (the string is longer than the word list).
         seq.erase(seq.find_last_not_of(c) + 1);
     }
     return seq;
 }


 // Choose the seed word for adapter inference.
 //
 // Gathers the m_params.top_n most frequent words that exceed
 // m_params.min_support and are not low-complexity, then compares the most
 // frequent of them against the geometric mean of the group's counts via
 // m_params.HaveInitialSupport(). Returns that word if the check passes,
 // otherwise 0.
 NAdapterSearch::TWord NAdapterSearch
     ::CUnpairedAdapterDetector
     ::x_FindAdapterSeed() const
 {
     typedef pair<TCount, TWord> TPair;
     typedef priority_queue< TPair, vector<TPair>, greater<TPair> > TTopWords;

     // Min-heap: the least frequent retained word is on top, so trimming
     // the heap to top_n entries discards the weakest candidates.
     TTopWords heap;

     // Collect most frequent words. The count must be large enough that
     // most of these words are biological (not read-specific). The idea
     // is to check whether the most frequent word (potentially
     // adapter-specific) is overrepresented relative to biological seqs
     for(TWord w = 0; w < m_counts.size(); w++) {
         Uint8 count = m_counts[w];
         if(count <= m_params.min_support
            || NAdapterSearch::s_IsLowComplexity(w))
         {
             continue;
         }
         heap.push(TPair(count, w));
         while(heap.size() > m_params.top_n) {
             heap.pop();
         }
     }

     // Calculate geometric mean of counts. The heap drains in ascending
     // order, so after the loop word/sup hold the most frequent entry.
     double sumlogs(0);
     TWord word(0);
     TCount sup(0);
     size_t n = heap.size();
     while(!heap.empty()) {
         sup  = heap.top().first;
         word = heap.top().second;
         sumlogs += log(sup + 1.0);
         heap.pop();
     }

     size_t gmean = n == 0 ? 0 : exp(sumlogs / n) - 1;

     ERR_POST(Info
           << "Seed: "
           << s_AsIUPAC(word)
           << "; overrepresentation: "
           << sup
           << "/"
           << static_cast<size_t>(gmean));

     if(m_params.HaveInitialSupport(sup, gmean)) {
         return word;
     }
     return 0;
 }


 // Return the most frequent word overlapping w by all but one letter.
 //
 //   right == true   candidates are w shifted left by one letter (kept to
 //                   24 bits) with a new letter in the lowest position
 //   right == false  candidates are w shifted right by one letter with a
 //                   new letter placed at bits 22-23
 //
 // Returns the candidate with the strictly highest m_counts entry, or 0 if
 // none of the four has a positive count.
 NAdapterSearch::TWord NAdapterSearch::CUnpairedAdapterDetector::x_GetAdjacent(
         TWord w,
         bool right) const
 {
     TWord winner(0);
     TCount winner_count(0);

     for(Uint1 letter = 0; letter < 4; letter++) {
         TWord candidate;
         if(right) {
             candidate = ((w << 2) & 0xFFFFFF) | letter;
         } else {
             candidate = (w >> 2) | (letter << 22);
         }

         const TCount support = m_counts[candidate];
         if(support > winner_count) {
             winner_count = support;
             winner = candidate;
         }
     }
     return winner;
 }


 // Grow the word chain at its back, one adjacent word at a time.
 //
 //   words    chain to extend; must be non-empty (back() is read)
 //   top_sup  support of the originating seed, passed to the support check
 //   right    direction forwarded to x_GetAdjacent()
 //
 // Extension stops as soon as m_params.HaveContinuedSupport() rejects the
 // next word, or the next word already occurs in the chain.
 void NAdapterSearch::CUnpairedAdapterDetector::x_ExtendSeed(
         TWords& words,
         size_t top_sup,
         bool right) const
 {
     for(;;) {
         const TWord tail = words.back();
         const TWord next = x_GetAdjacent(tail, right);
         const TCount tail_sup = m_counts[tail];
         const TCount next_sup = m_counts[next];

         // Support is too low to continue
         if(!m_params.HaveContinuedSupport(top_sup, tail_sup, next_sup)) {
             return;
         }

         // Word already in the sequence: stop, to prevent loops
         if(find(words.begin(), words.end(), next) != words.end()) {
             return;
         }

         words.push_back(next);
     }
 }


 ///////////////////////////////////////////////////////////////////////


 struct SOpLess_Second

 {

     bool operator()(const NAdapterSearch::CSimpleUngappedAligner::TRange& a,

                     const NAdapterSearch::CSimpleUngappedAligner::TRange& b) const

     {

         return a.second < b.second;

     }

 };


 // Look up the subsequence range associated with a position.
 //
 // Returns the first entry of m_subseq_ranges whose `second` is not less
 // than pos (binary search on `second`), or (NULLPOS, NULLPOS) when no
 // such entry exists.
 NAdapterSearch::CSimpleUngappedAligner::TRange
   NAdapterSearch
 ::CSimpleUngappedAligner
 ::GetSeqRange(TPos pos) const
 {
     const TRange probe(pos, pos);

     TRanges::const_iterator hit = lower_bound(
             m_subseq_ranges.begin(),
             m_subseq_ranges.end(),
             probe,
             SOpLess_Second());

     if(hit == m_subseq_ranges.end()) {
         return TRange(NULLPOS, NULLPOS);
     }
     return *hit;
 }


 // Choose the better of two matches given in (diag, antidiag) coordinates.
 //
 // If exactly one of the matches has zero length, the other one wins.
 // Otherwise each match is scored as its length minus five times the log
 // of (1 + unaligned length of its target subsequence); b is returned only
 // if it scores strictly higher than a.
 const NAdapterSearch::CSimpleUngappedAligner::SMatch&
   NAdapterSearch
 ::CSimpleUngappedAligner
 ::x_GetBetterOf(const SMatch& a, const SMatch& b) const
 {
     if(a.len > 0 && b.len == 0) {
         return a;
     }
     if(a.len == 0 && b.len > 0) {
         return b;
     }

     // (second - first) / 2 is passed to GetSeqRange() as the position
     // used to find each match's subsequence range
     TRange a_range = GetSeqRange((a.second - a.first) / 2);
     TRange b_range = GetSeqRange((b.second - b.first) / 2);

     // calculate unaligned length
     // note ::len is doubled on antidiag
     TPos a_unaligned = (a_range.second - a_range.first) - a.len/2;
     TPos b_unaligned = (b_range.second - b_range.first) - b.len/2;

 #if 0
     LOG_POST((a.second + a.first) / 2
             << " " << (a.second - a.first) / 2
             << " " << a.len
             << " " << a_unaligned
             << " " << m_seq.substr(a_range.first,
                                    a_range.second - a_range.first));
     LOG_POST((b.second + b.first) / 2
             << " " << (b.second - b.first) / 2
             << " " << b.len
             << " " << b_unaligned
             << " " << m_seq.substr(b_range.first,
                                    b_range.second - b_range.first));
     LOG_POST("\n");
 #endif

     // score is proportional to the length, and penalized
     // proportionally to the log of unaligned length
     double a_score = a.len - log(1.0 + a_unaligned)*5;
     double b_score = b.len - log(1.0 + b_unaligned)*5;

     if(a_score < b_score) {
         return b;
     }
     return a;
 }


 // Register position pos for word and for each of the variants produced
 // by s_PermuteMismatches().
 //
 //   vec_index  direct word -> position table; holds NULLPOS for an unseen
 //              word, the position for a word seen at exactly one
 //              position, or MULTIPOS once a second distinct position is
 //              seen
 //   coordset   receives (word, position) pairs for words marked MULTIPOS,
 //              including the position previously held in vec_index
 void NAdapterSearch::CSimpleUngappedAligner::s_IndexWord(
         TWord word,
         TPos pos,
         TPositions& vec_index,
         TCoordSet& coordset)
 {
     TWords variants;
     s_PermuteMismatches(word, variants);

     for(TWords::const_iterator v = variants.begin();
         v != variants.end();
         ++v)
     {
         const TWord w = *v;
         TPos& slot = vec_index[w];

         // Unseen word, or the same position again: a single slot suffices
         if(slot == NULLPOS || slot == pos) {
             slot = pos;
             continue;
         }

         // First collision: move the stored position into the coordset
         // and flag the slot
         if(slot != MULTIPOS) {
             coordset.insert(TCoordSet::value_type(w, slot));
             slot = MULTIPOS;
         }
         coordset.insert(TCoordSet::value_type(w, pos));
     }
 }


 void NAdapterSearch::CSimpleUngappedAligner::s_CoordSetToMapIndex(

         const TCoordSet& coordset,

         TMapIndex& map_index,

         TPositions& positions)

 {

      map_index.clear();

      positions.resize(coordset.size());

      TPositions::iterator dest = positions.begin();


      ITERATE(TCoordSet, it, coordset) {

         TWord w = it->first;

         TPos pos = it->second;


         *dest = pos;

         if(map_index.find(w) == map_index.end()) {

             map_index[w].first = dest;

         }

         dest++;

         map_index[w].second = dest;

      }

 }


 // (Re)build the aligner's index over a target sequence.
 //
 //   seq  target characters; '-' separates independent subsequences
 //   len  number of characters in seq
 //
 // Stores a copy of the sequence, records each subsequence's [begin, end)
 // range in m_subseq_ranges, and indexes every word of every subsequence
 // (via s_IndexWord) into m_vec_index, m_map_index and m_positions.
 void NAdapterSearch::CSimpleUngappedAligner::Init(const char* seq, size_t len)
 {
     m_seq.assign(seq, len);

     m_subseq_ranges.clear();

     // One slot per possible word, all initially unset
     m_vec_index.resize(1 << (MER_LENGTH*2));
     fill(m_vec_index.begin(), m_vec_index.end(), (TPos)NULLPOS);

     m_positions.clear();
     m_map_index.clear();

     // process each '-'-delimited substring.
     TCoordSet coordset;
     const char* const endend = seq+len;
     const char* begin = seq;
     while(begin < endend) {
         const char* end = find(begin, endend, '-');

         TRange r(begin - seq, end - seq);
         m_subseq_ranges.push_back(r);

         TWords words;
         s_Translate(begin, r.second - r.first, false, words);

         for(size_t i = 0; i < words.size(); i++) {
             TPos pos = r.first + i;
             s_IndexWord(words[i], pos, m_vec_index, coordset);
         }

         if(end == endend) {
             // Last substring, no trailing delimiter. Stop here rather
             // than form end + 1, which would point beyond the end of the
             // buffer and make the next find() range invalid.
             break;
         }
         begin = end + 1; // skip the delimiter
     }

     s_CoordSetToMapIndex(coordset, m_map_index, m_positions);
 }


 // Generate variants of w for every pair of positions i1 < i2 taken from
 // [3, 9) and every combination of two letters put at those positions.
 //
 // words is resized to 240 entries: 15 position pairs times 4*4 letter
 // combinations. s_Put(w, i, c) is presumably "w with letter i set to c"
 // (defined elsewhere - confirm there).
 void NAdapterSearch::CSimpleUngappedAligner::s_PermuteMismatches(
         TWord w,
         TWords& words)
 {
     const size_t kFirstPos = 3;
     const size_t kEndPos = 9;

     words.resize(240); // nCr(6,2)*4^2
     TWords::iterator out = words.begin();

     for(size_t i1 = kFirstPos; i1 < kEndPos; i1++) {
         for(Uint1 c1 = 0; c1 < 4; c1++) {
             const TWord once_changed = s_Put(w, i1, c1);

             for(size_t i2 = i1 + 1; i2 < kEndPos; i2++) {
                 for(Uint1 c2 = 0; c2 < 4; c2++) {
                     *out = s_Put(once_changed, i2, c2);
                     ++out;
                 }
             }
         }
     }
 }


 // Try to absorb match m2 into match m1 (both in diag/antidiag coordinates).
 //
 // Returns true and updates m1 when
 //   - m1 is unset (m1.first == NULLPOS): m1 becomes a copy of m2, or
 //   - both share the same `first` and m2 starts no more than 2 past the
 //     end of m1 on `second`: m1 is lengthened to end where m2 ends.
 // Returns false and leaves m1 untouched otherwise.
 bool NAdapterSearch::CSimpleUngappedAligner::s_Merge(
         SMatch& m1,
         const SMatch& m2)
 {
     if(m1.first == NULLPOS) {
         m1 = m2;
         return true;
     }

     // Different diagonal: cannot be merged
     if(m1.first != m2.first) {
         return false;
     }

     // Gap on the antidiagonal is too wide
     if(m1.second + m1.len + 2 < m2.second) {
         return false;
     }

     // on same diag and [almost-]overlapping on antidiag
     // - extend the length of m1 to the end of m2
     m1.len = m2.len + m2.second - m1.second;
     return true;
 }


 // Build the match for a single word hit at query position q_pos and
 // target position s_pos, expressed as diag (q_pos - s_pos) and antidiag
 // (q_pos + s_pos), with length MER_LENGTH * 2.
 NAdapterSearch::CSimpleUngappedAligner::SMatch
 NAdapterSearch::CSimpleUngappedAligner::x_CreateMatch(
         TPos q_pos,
         TPos s_pos) const
 {
     SMatch seed_match;
     seed_match.len    = MER_LENGTH * 2; // lengths are doubled on antidiag
     seed_match.second = q_pos + s_pos;
     seed_match.first  = q_pos - s_pos;
     return seed_match;
 }


 // Ungapped extension of a match in one direction.
 //
 //   seg             match to extend, updated in place; seg.first indexes
 //                   q_seq and seg.second indexes m_seq (see the character
 //                   comparison in the loop below)
 //   q_seq           query characters
 //   query_len       number of characters in q_seq
 //   direction       true: extend past the end of seg;
 //                   false: extend before its start
 //   match_score     added to the running score for each equal pair
 //   mismatch_score  added to the running score for each unequal pair
 //   dropoff_score   extension stops once the running score is no longer
 //                   above (best score so far - dropoff_score)
 //
 // Extension also stops at the query bounds [0, query_len) and at the
 // bounds of the range returned by GetSeqRange(seg.second). The segment is
 // finally resized to reach the point where the running score peaked.
 void NAdapterSearch::CSimpleUngappedAligner::x_Extend(
         SMatch& seg,
         const char* q_seq,
         const size_t query_len,
         const bool direction, //true iff forward
         const int match_score,
         const int mismatch_score,
         const int dropoff_score) const
 {
     // Step applied to both coordinates on each iteration
     int delta_pos = direction ? 1 : -1;

     // Create a point at the first or last position of
     // the initial segment, depending on direction
     typedef pair<TPos, TPos> TPoint;
     TPoint p(seg.first  + (direction ? seg.len - 1 : 0),
              seg.second + (direction ? seg.len - 1 : 0));

     // Running score, and the best score with the point where it occurred.
     // best_p starts at the segment's own edge, so an extension that never
     // improves the score leaves the segment's extent unchanged.
     Int8 score(0), max_score(0);
     TPoint best_p = p;

     // bounds are [min, max)
     TRange q_bounds(0, query_len);
     TRange s_bounds = GetSeqRange(seg.second);

     // the initial point is apriori matched, so advance
     p.first  += delta_pos;
     p.second += delta_pos;
     while(score + dropoff_score > max_score
           && p.first  >= q_bounds.first
           && p.first  <  q_bounds.second
           && p.second >= s_bounds.first
           && p.second <  s_bounds.second)
     {
         score += (q_seq[p.first] == m_seq[p.second]) ? match_score
                                                      : mismatch_score;
 #if 0
         LOG_POST(direction
                  << " " << q_seq[p.first]
                  << " " << m_seq[p.second]
                  << " " << score
                  << " " << best_p.first);
 #endif

         // Remember the furthest point at which the score set a new record
         if(score > max_score) {
             max_score = score;
             best_p = p;
         }

         p.first  += delta_pos;
         p.second += delta_pos;
     }

     if(direction) {
         // starting coordinates are the same; length increased
         seg.len = best_p.first - seg.first + 1;
     } else {
         // starting coordinates are at best_p;
         // length is distance to the starting coords + original length
         SMatch seg2;
         seg2.first  = best_p.first;
         seg2.second = best_p.second;
         seg2.len    = seg.len + seg.first - best_p.first;
         seg = seg2;
     }
 }


 // Find the single best ungapped match of the query against the indexed
 // target.
 //
 // Every query word is looked up in the index; hits on the same diagonal
 // are merged into growing matches, the best of which (per
 // x_GetBetterOf) is converted to (query, target) coordinates and extended
 // in both directions with x_Extend().
 NAdapterSearch::CSimpleUngappedAligner::SMatch
 NAdapterSearch::CSimpleUngappedAligner::FindSingleBest(
         const char* query,
         size_t len) const
 {
     typedef map<TPos, SMatch> TMatches;
     TMatches open_matches; // diag -> match currently growing on that diag
     SMatch best;

     TWords q_words;
     s_Translate(query, len, false, q_words);

     for(size_t q_pos = 0; q_pos < q_words.size(); q_pos++) {
         const TWord w = q_words[q_pos];

         // By default the hit range is the single slot in the direct index
         TPosRange hits(m_vec_index.begin() + w,
                        m_vec_index.begin() + w + 1);

         if(*hits.first == NULLPOS) {
             continue; // word not present in the target
         }
         if(*hits.first == MULTIPOS) {
             // several positions: take the range from the map index
             hits = m_map_index.find(w)->second;
         }

         // For each matched position in db for this word create a match
         // and try to merge it with the match already open on its diag.
         // If it can't be merged, nothing later will merge into the old
         // match either, because following matches on this diag will be
         // even farther on the antidiag. So the old match is retired
         // (kept if it is the best so far) and replaced by the new one.
         for(TPositions::const_iterator hit = hits.first;
             hit != hits.second;
             ++hit)
         {
             SMatch fresh = x_CreateMatch(q_pos, *hit);
             SMatch& open = open_matches[fresh.first];

             if(s_Merge(open, fresh)) {
                 continue;
             }
             best = x_GetBetterOf(best, open);
             open = fresh;
         }
     }

     // Matches still open at the end compete as well
     for(TMatches::const_iterator it = open_matches.begin();
         it != open_matches.end();
         ++it)
     {
         best = x_GetBetterOf(best, it->second);
     }

     // Convert (diag,antidiag) representation to (query,target)
     SMatch match;
     match.len    = best.len / 2;
     match.first  = (best.first + best.second) / 2;
     match.second = best.second - match.first;

     // Do ungapped extension both ways.
     x_Extend(match, query, len, true);
     x_Extend(match, query, len, false);

     return match;
 }


revcomp
LargeInt< 1 > revcomp(const LargeInt< 1 > &x, size_t sizeKmer)
Definition: LargeInt1.hpp:148

adapter_search.hpp

TRange
CLocalRange< TOffset > TRange
define for the fundamental building block of sequence ranges
Definition: base.hpp:115

mask
ncbi::TMaskedQueryRegions mask
Definition: blastxml_format.cpp:59

NAdapterSearch::CSimpleUngappedAligner::TPos
Int2 TPos
Definition: adapter_search.hpp:285

NAdapterSearch::CSimpleUngappedAligner::s_CoordSetToMapIndex
static void s_CoordSetToMapIndex(const TCoordSet &coordset, TMapIndex &map_index, TPositions &positions)
Definition: adapter_search.cpp:549

NAdapterSearch::CSimpleUngappedAligner::s_PermuteMismatches
static void s_PermuteMismatches(TWord w, TWords &words)
Definition: adapter_search.cpp:608

NAdapterSearch::CSimpleUngappedAligner::FindSingleBest
SMatch FindSingleBest(const char *query, size_t len) const
Return best ungapped alignment of at least MER_LENGTH.
Definition: adapter_search.cpp:731

NAdapterSearch::CSimpleUngappedAligner::TPosRange
pair< TPositions::const_iterator, TPositions::const_iterator > TPosRange
Definition: adapter_search.hpp:324

NAdapterSearch::CSimpleUngappedAligner::Init
void Init(const char *seq, size_t len)
IUPAC sequence of target sequence(s) Multiple sequences may be concatenated and '-'-delimited e....
Definition: adapter_search.cpp:572

NAdapterSearch::CSimpleUngappedAligner::x_CreateMatch
SMatch x_CreateMatch(TPos q_start, TPos s_start) const
Create a match in diag/antidiag coordspace for a matched word.
Definition: adapter_search.cpp:651

NAdapterSearch::CSimpleUngappedAligner::s_Merge
static bool s_Merge(SMatch &m, const SMatch &m2)
Definition: adapter_search.cpp:630

NAdapterSearch::CSimpleUngappedAligner::x_Extend
void x_Extend(SMatch &match, const char *query_seq, const size_t query_len, const bool direction, const int match_score=3, const int mismatch_score=-2, const int dropoff=5) const
Ungapped extension (match is in normal coordinates)
Definition: adapter_search.cpp:663

NAdapterSearch::CSimpleUngappedAligner::s_IndexWord
static void s_IndexWord(TWord w, TPos pos, TPositions &vec_index, TCoordSet &coordset)
Definition: adapter_search.cpp:518

NAdapterSearch::CSimpleUngappedAligner::TRange
pair< TPos, TPos > TRange
Definition: adapter_search.hpp:316

NAdapterSearch::CSimpleUngappedAligner::TPositions
vector< TPos > TPositions
Definition: adapter_search.hpp:321

NAdapterSearch::CUnpairedAdapterDetector::m_counts
TCounts m_counts
Definition: adapter_search.hpp:194

NAdapterSearch::CUnpairedAdapterDetector::x_GetAdjacent
TWord x_GetAdjacent(TWord w, bool right) const
if right, return most frequent word whose 11-prefix = 11-suffix of w; otherwise, return most frequent...
Definition: adapter_search.cpp:407

NAdapterSearch::CUnpairedAdapterDetector::AddExemplar
virtual void AddExemplar(const char *seq, size_t len)
Add sequence of a spot from an SRA run (single or paired-end read)
Definition: adapter_search.cpp:314

NAdapterSearch::CUnpairedAdapterDetector::InferAdapterSeq
virtual string InferAdapterSeq() const
returns IUPAC seq of the inferred adapter seq; empty if not found.
Definition: adapter_search.cpp:326

NAdapterSearch::CUnpairedAdapterDetector::x_ExtendSeed
void x_ExtendSeed(TWords &words, size_t top_count, bool right) const
Extend the seed one way, as long as it satisfies extension thresholds.
Definition: adapter_search.cpp:428

NAdapterSearch::TCount
Uint4 TCount
Definition: adapter_search.hpp:48

NAdapterSearch::TWord
Uint4 TWord
Definition: adapter_search.hpp:45

NAdapterSearch::s_IsLowComplexity
static bool s_IsLowComplexity(TWord word)
With threshold of 0.9, 0.7% of all words are classified as such.
Definition: adapter_search.hpp:76

NAdapterSearch::s_GetWordComplexity
static double s_GetWordComplexity(TWord word)
Complexity is a value between 0 (for a homopolymer) and 1 (max complexity).
Definition: adapter_search.cpp:75

NAdapterSearch::TWords
vector< TWord > TWords
Definition: adapter_search.hpp:46

NAdapterSearch::MER_LENGTH
static const size_t MER_LENGTH
Will use 24-bit 12-mers as words.
Definition: adapter_search.hpp:44

NAdapterSearch::s_AsIUPAC
static string s_AsIUPAC(TWord w, size_t mer_size=MER_LENGTH)
Convert word to IUPAC string [ACGT]{MER_LENGTH}.
Definition: adapter_search.cpp:42

NAdapterSearch::s_Translate
static void s_Translate(const char *const iupac_na, size_t len, bool apply_revcomp, TWords &words)
Translate IUPAC sequence to a sequence of overlapping words (A,C,G,T,*) <-> (0,1,2,...
Definition: adapter_search.cpp:93

NAdapterSearch::s_IsHomopolymer
static bool s_IsHomopolymer(TWord w)
Definition: adapter_search.hpp:81

map_checker::end
const_iterator end() const
Definition: map.hpp:152

map_checker::clear
void clear()
Definition: map.hpp:169

map_checker::find
const_iterator find(const key_type &key) const
Definition: map.hpp:153

map< TWord, TPosRange >

set
Definition: set.hpp:45

set::insert
iterator_bool insert(const value_type &val)
Definition: set.hpp:149

set::size
size_type size() const
Definition: set.hpp:132

set::value_type
parent_type::value_type value_type
Definition: set.hpp:78

TPair
SStaticPair< const char *, const char * > TPair
Definition: entrez_conversion.cpp:43

ITERATE
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815

ERR_POST
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186

LOG_POST
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226

Info
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185

Uint1
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99

Int8
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104

Uint8
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105

b
b
Definition: lex.newick.cpp:1285

i
int i
Definition: lex.newick.cpp:1456

n
yy_size_t n
Definition: lex.newick.cpp:1455

len
int len
Definition: lex.newick.cpp:1450

ncbi::grid::netcache::search::s_Merge
void s_Merge(SExpression &l, SExpression &r)
Definition: netcache_search.cpp:315

a
unsigned int a
Definition: ncbi_localip.c:102

ncbi_pch.hpp

ncbidbg.hpp
NCBI C++ auxiliary debug macros.

ncbimisc.hpp
Miscellaneous common-use basic types and functionality.

min
T min(T x_, T y_)
Definition: njn_function.hpp:107

r
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)

match
static int match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
Definition: pcre2_match.c:594

count
#define count
Definition: pcre2posix_test.c:54

USING_NCBI_SCOPE
USING_NCBI_SCOPE
Definition: adapter_search.cpp:40

br
static SLJIT_INLINE sljit_ins br(sljit_gpr target)
Definition: sljitNativeS390X.c:786

NAdapterSearch::CSimpleUngappedAligner::SMatch
Represents single ungapped alignment.
Definition: adapter_search.hpp:299

NAdapterSearch::CSimpleUngappedAligner::SMatch::len
TPos len
Definition: adapter_search.hpp:302

NAdapterSearch::CSimpleUngappedAligner::SMatch::second
TPos second
Definition: adapter_search.hpp:301

NAdapterSearch::CSimpleUngappedAligner::SMatch::first
TPos first
Definition: adapter_search.hpp:300

NAdapterSearch::IAdapterDetector::SParams
An adapter sequence is presumed to occur at least min_support times in the input, and is overrepresen...
Definition: adapter_search.hpp:107

NAdapterSearch::IAdapterDetector::SParams::HaveContinuedSupport
bool HaveContinuedSupport(size_t top_sup, size_t prev_sup, size_t candidate_sup) const
Definition: adapter_search.hpp:136

NAdapterSearch::IAdapterDetector::SParams::HaveInitialSupport
bool HaveInitialSupport(size_t top_candidate_sup, size_t non_candidate_sup) const
Definition: adapter_search.hpp:125

SOpLess_Second
Definition: adapter_search.cpp:455

SOpLess_Second::operator()
bool operator()(const NAdapterSearch::CSimpleUngappedAligner::TRange &a, const NAdapterSearch::CSimpleUngappedAligner::TRange &b) const
Definition: adapter_search.cpp:456

query
static string query
Definition: struct_dp_demo.cpp:88

eFiles::log
@ log

seed
static int seed
Definition: test_table.cpp:132

letter
static Uint4 letter(char c)
Definition: win_mask_gen_counts.cpp:62