48 static const char* alphabet =
"ACGT";
49 for(
size_t i = 0;
i < mer_size;
i++) {
50 s[mer_size - 1 -
i] = alphabet[w & 3];
60 if(words.size() == 0) {
65 iupac.resize(words.size() - 1);
66 static const char* alphabet =
"ACGT";
67 for(
size_t i = 0;
i < words.size() - 1;
i++) {
68 iupac[
i] = alphabet[words[
i] >> (mer_size * 2 - 2)];
70 iupac +=
s_AsIUPAC(words.back(), mer_size);
77 vector<Uint1> counts(64, 0);
80 counts[w2 & 0x3F] += 1;
84 for(
size_t i = 0;
i < 64;
i++) {
88 return (144 - sumsq)/132.0;
94 const char*
const seq,
106 static const Uint1 char2letter[] = {
107 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
108 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
109 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
110 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
111 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
112 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
113 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
114 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
115 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
119 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
131 Uint1 c = char2letter[
static_cast<int>(seq[
i])];
137 for(
size_t i = 1;
i < words.size();
i++) {
138 Uint1 c = char2letter[
static_cast<int>(seq[
i + k])];
139 words[
i] = ((words[
i - 1] &
mask) << 2) | c;
144 Uint1 c = 3 - char2letter[
static_cast<int>(seq[seq_size - 1 -
i])];
150 for(
size_t i = 1;
i < words.size();
i++) {
151 Uint1 c = 3 - char2letter[
static_cast<int>(seq[k -
i])];
152 words[
i] = ((words[
i - 1] &
mask) << 2) | c;
159 ::CPairedEndAdapterDetector
162 TWords::const_iterator begin,
163 TWords::const_iterator end)
165 size_t words_size = end - begin;
166 size_t len =
min(words_size, m_len);
170 for(
size_t i = 0;
i <
len;
i++) {
171 x_IncrCount(
i, *(begin +
i) >> 4);
175 if(words_size > 0 && words_size < m_len) {
176 size_t len = min<size_t>(2, m_len - words_size);
177 TWord w = *(begin + words_size);
178 for(
size_t i = 0;
i <
len;
i++) {
179 size_t pos = words_size +
i;
180 TWord w2 = (w >> (2 -
i*2)) & 0xFFFFF;
181 x_IncrCount(pos, w2);
188 ::CPairedEndAdapterDetector
190 ::x_NextWord(
size_t pos,
TWord prev_word)
const
193 size_t best_count = 0;
195 TWord word = ((prev_word << 2) & 0xFFFFF) |
letter;
197 if(
count > best_count) {
206 string NAdapterSearch
207 ::CPairedEndAdapterDetector
209 ::InferConsensus(
const SParams& params)
const
213 size_t secondbest_sup(0);
215 for(
size_t word = 0; word < NMERS10; word++) {
216 size_t sup = m_counts[word];
218 secondbest_sup = top_sup;
229 words[0] = best_word;
233 << s_AsIUPAC(best_word,10)
234 <<
"; overrepresentation: "
240 for(
size_t i = 1;
i < words.size();
i++) {
241 TWord last_word = words[
i - 1];
242 TCount last_sup = x_GetCount(
i - 1, last_word);
244 TWord curr_word = x_NextWord(
i - 1, last_word);
245 TCount curr_sup = x_GetCount(
i, curr_word);
248 words[
i] = curr_word;
255 return s_AsIUPAC(words, 10);
260 pair<size_t, size_t> NAdapterSearch
261 ::CPairedEndAdapterDetector
262 ::s_FindAdapterStartPos(
266 return pair<size_t, size_t>(
267 find(fwd_read.begin(), fwd_read.end(), rev_read.back())
268 - fwd_read.begin() + MER_LENGTH,
270 find(rev_read.begin(), rev_read.end(), fwd_read.front())
276 ::CPairedEndAdapterDetector
281 size_t len = spot_len / 2;
282 const char* fwd_read = spot;
283 const char* rev_read = spot+
len;
293 pair<size_t, size_t> pos = s_FindAdapterStartPos(fwd_words, rev_words);
295 bool is_consistent = (pos.first + pos.second ==
len);
298 size_t adapter_len =
len - pos.first;
299 if(is_consistent && adapter_len >= MER_LENGTH) {
300 m_cons5.AddExemplar(fwd_words.begin() + pos.first, fwd_words.end());
307 s_Translate(rev_read + pos.first, adapter_len,
false, words2);
308 m_cons3.AddExemplar(words2.begin(), words2.end());
335 size_t top_count = m_counts[
seed];
339 words.push_back(
seed);
340 x_ExtendSeed(words, top_count,
false);
341 reverse(words.begin(), words.end());
342 x_ExtendSeed(words, top_count,
true);
350 char c = seq[seq.size() - 1];
351 while(seq.size() > 0 && seq[seq.size() - 1] == c) {
352 seq.resize(seq.size() - 1);
361 ::CUnpairedAdapterDetector
362 ::x_FindAdapterSeed()
const
364 typedef pair<TCount, TWord>
TPair;
365 typedef priority_queue< TPair, vector<TPair>, greater<TPair> > TTopWords;
372 for(
TWord w = 0; w < m_counts.size(); w++) {
376 while(top_words.size() > m_params.top_n) {
386 size_t n = top_words.size();
387 for(; !top_words.empty(); top_words.pop()) {
388 sup = top_words.top().first;
389 word = top_words.top().second;
390 sumlogs +=
log(sup + 1.0);
393 size_t gmean =
n == 0 ? 0 : exp(sumlogs /
n) - 1;
398 <<
"; overrepresentation: "
401 <<
static_cast<size_t>(gmean));
403 return m_params.HaveInitialSupport(sup, gmean) ? word : 0;
416 : (w >> 2) | (
letter << 22);
419 if(
count > best_count) {
434 TWord prev_word = words.back();
435 TWord curr_word = x_GetAdjacent(prev_word, right);
436 TCount prev_sup = m_counts[prev_word];
437 TCount curr_sup = m_counts[curr_word];
439 if(!m_params.HaveContinuedSupport(top_sup, prev_sup, curr_sup)
440 || find(words.begin(), words.end(), curr_word) != words.end())
446 words.push_back(curr_word);
459 return a.second <
b.second;
465 ::CSimpleUngappedAligner
466 ::GetSeqRange(
TPos pos)
const
468 TRanges::const_iterator it = lower_bound(
469 m_subseq_ranges.begin(),
470 m_subseq_ranges.end(),
474 return it == m_subseq_ranges.end() ?
TRange(NULLPOS, NULLPOS) : *it;
479 ::CSimpleUngappedAligner
482 if(
a.len > 0 &&
b.len == 0) {
484 }
else if(
a.len == 0 &&
b.len > 0) {
487 TRange ar = GetSeqRange((
a.second -
a.first) / 2);
488 TRange br = GetSeqRange((
b.second -
b.first) / 2);
492 TPos a_un = (ar.second - ar.first) -
a.len/2;
493 TPos b_un = (
br.second -
br.first) -
b.len/2;
497 <<
" " << (
a.second -
a.first) / 2
500 <<
" " << m_seq.substr(ar.first, ar.second - ar.first));
502 <<
" " << (
b.second -
b.first) / 2
505 <<
" " << m_seq.substr(
br.first,
br.second -
br.first));
511 double a_score =
a.len -
log(1.0 + a_un)*5;
512 double b_score =
b.len -
log(1.0 + b_un)*5;
513 return a_score < b_score ?
b :
a;
525 s_PermuteMismatches(word, approx_words);
535 TPos& pos_ref = vec_index[w];
536 if(pos_ref == pos || pos_ref == NULLPOS) {
539 if(pos_ref != MULTIPOS) {
555 positions.resize(coordset.
size());
556 TPositions::iterator dest = positions.begin();
560 TPos pos = it->second;
563 if(map_index.
find(w) == map_index.
end()) {
564 map_index[w].first = dest;
567 map_index[w].second = dest;
575 m_seq.assign(seq,
len);
577 m_subseq_ranges.clear();
580 fill(m_vec_index.begin(), m_vec_index.end(), (
TPos)NULLPOS);
587 const char* endend = seq+
len;
588 for(
const char *begin = seq, *end = find(begin, endend,
'-');
590 begin = end + 1, end = find(begin, endend,
'-'))
592 TRange r(begin - seq, end - seq);
593 m_subseq_ranges.push_back(
r);
598 for(
size_t i = 0;
i < words.size();
i++) {
600 s_IndexWord(words[
i], pos, m_vec_index, coordset);
604 s_CoordSetToMapIndex(coordset, m_map_index, m_positions);
613 TWords::iterator dest = words.begin();
614 for(
size_t i1 = 3; i1 < 9; i1++) {
616 for(
Uint1 c1 = 0; c1 < 4; c1++) {
617 TWord w1 = s_Put(w, i1, c1);
619 for(
size_t i2 = i1+1; i2 < 9; i2++) {
621 for(
Uint1 c2 = 0; c2 < 4; c2++) {
622 *(dest++) = s_Put(w1, i2, c2);
634 if(m1.
first == NULLPOS) {
656 m.
first = q_pos - s_pos;
666 const size_t query_len,
667 const bool direction,
668 const int match_score,
669 const int mismatch_score,
670 const int dropoff_score)
const
672 int delta_pos = direction ? 1 : -1;
676 typedef pair<TPos, TPos> TPoint;
677 TPoint p(seg.
first + (direction ? seg.
len - 1 : 0),
678 seg.
second + (direction ? seg.
len - 1 : 0));
680 Int8 score(0), max_score(0);
684 TRange q_bounds(0, query_len);
688 p.first += delta_pos;
689 p.second += delta_pos;
690 while(score + dropoff_score > max_score
691 && p.first >= q_bounds.first
692 && p.first < q_bounds.second
693 && p.second >= s_bounds.first
694 && p.second < s_bounds.second)
696 score += (q_seq[p.first] == m_seq[p.second]) ? match_score
700 <<
" " << q_seq[p.first]
701 <<
" " << m_seq[p.second]
703 <<
" " << best_p.first);
706 if(score > max_score) {
711 p.first += delta_pos;
712 p.second += delta_pos;
717 seg.
len = best_p.first - seg.
first + 1;
722 seg2.
first = best_p.first;
723 seg2.
second = best_p.second;
743 for(
size_t i = 0;
i < words.size();
i++) {
747 m_vec_index.begin() + w + 1);
749 if(*
r.first == NULLPOS) {
751 }
else if(*
r.first == MULTIPOS) {
752 r = m_map_index.find(w)->second;
763 for(TPositions::const_iterator it =
r.first; it !=
r.second; ++it) {
764 SMatch new_match = x_CreateMatch(
i, *it);
767 if(!
s_Merge(old_match, new_match)) {
768 best_match = x_GetBetterOf(best_match, old_match);
769 old_match = new_match;
774 ITERATE(TMatches, it, matches) {
775 best_match = x_GetBetterOf(best_match, it->
second);
LargeInt< 1 > revcomp(const LargeInt< 1 > &x, size_t sizeKmer)
CLocalRange< TOffset > TRange
define for the fundamental building block of sequence ranges
static void s_CoordSetToMapIndex(const TCoordSet &coordset, TMapIndex &map_index, TPositions &positions)
static void s_PermuteMismatches(TWord w, TWords &words)
SMatch FindSingleBest(const char *query, size_t len) const
Return best ungapped alignment of at least MER_LENGTH.
pair< TPositions::const_iterator, TPositions::const_iterator > TPosRange
void Init(const char *seq, size_t len)
IUPAC sequence of target sequence(s) Multiple sequences may be concatenated and '-'-delimited e....
SMatch x_CreateMatch(TPos q_start, TPos s_start) const
Create a match in diag/antidiag coordspace for a matched word.
static bool s_Merge(SMatch &m, const SMatch &m2)
void x_Extend(SMatch &match, const char *query_seq, const size_t query_len, const bool direction, const int match_score=3, const int mismatch_score=-2, const int dropoff=5) const
Ungapped extension (match is in normal coordinates)
static void s_IndexWord(TWord w, TPos pos, TPositions &vec_index, TCoordSet &coordset)
pair< TPos, TPos > TRange
vector< TPos > TPositions
TWord x_GetAdjacent(TWord w, bool right) const
if right, return most frequent word whose 11-prefix = 11-suffix of w; otherwise, return most frequent...
virtual void AddExemplar(const char *seq, size_t len)
Add sequence of a spot from an SRA run (single or paired-end read)
virtual string InferAdapterSeq() const
returns IUPAC seq of the inferred adapter seq; empty if not found.
void x_ExtendSeed(TWords &words, size_t top_count, bool right) const
Extend the seed one way, as long as it satisfies extension thresholds.
static bool s_IsLowComplexity(TWord word)
With threshold of 0.9, 0.7% of all words are classified as such.
static double s_GetWordComplexity(TWord word)
Complexity is a value between 0 (for a homopolymer) and 1 (max complexity).
static const size_t MER_LENGTH
Will use 24-bit 12-mers as words.
static string s_AsIUPAC(TWord w, size_t mer_size=MER_LENGTH)
Convert word to IUPAC string [ACGT]{MER_LENGTH}.
static void s_Translate(const char *const iupac_na, size_t len, bool apply_revcomp, TWords &words)
Translate IUPAC sequence to a sequence of overlapping words (A,C,G,T,*) <-> (0,1,2,...
static bool s_IsHomopolymer(TWord w)
const_iterator end() const
const_iterator find(const key_type &key) const
iterator_bool insert(const value_type &val)
parent_type::value_type value_type
SStaticPair< const char *, const char * > TPair
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
#define LOG_POST(message)
This macro is deprecated and it's strongly recomended to move in all projects (except tests) to macro...
void Info(CExceptionArgs_Base &args)
uint8_t Uint1
1-byte (8-bit) unsigned integer
int64_t Int8
8-byte (64-bit) signed integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
void s_Merge(SExpression &l, SExpression &r)
NCBI C++ auxiliary debug macros.
Miscellaneous common-use basic types and functionality.
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static int match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
static SLJIT_INLINE sljit_ins br(sljit_gpr target)
Represents single ungapped alignment.
An adapter sequence is presumed to occur at least min_support times in the input, and is overrepresen...
bool HaveContinuedSupport(size_t top_sup, size_t prev_sup, size_t candidate_sup) const
bool HaveInitialSupport(size_t top_candidate_sup, size_t non_candidate_sup) const
bool operator()(const NAdapterSearch::CSimpleUngappedAligner::TRange &a, const NAdapterSearch::CSimpleUngappedAligner::TRange &b) const
static Uint4 letter(char c)