28 #ifndef ALGO_SEQUENCE___ADAPTE_SEARCH__HPP
29 #define ALGO_SEQUENCE___ADAPTE_SEARCH__HPP
44 static const size_t MER_LENGTH = 12;
53 static void s_Translate(
54 const char*
const iupac_na,
60 static string s_AsIUPAC(
62 size_t mer_size = MER_LENGTH);
66 static string s_AsIUPAC(
68 size_t mer_size = MER_LENGTH);
73 static double s_GetWordComplexity(
TWord word);
78 return s_GetWordComplexity(word) < 0.9;
83 return w == 0 || w == 0x555555 || w == 0xAAAAAA || w == 0xFFFFFF;
117 , min_init_factor(10.0f)
118 , min_ext_factor_adj(0.5f)
119 , min_ext_factor_top(0.2f)
126 size_t non_candidate_sup)
const
128 return top_candidate_sup > non_candidate_sup * min_init_factor
129 && top_candidate_sup > min_support;
138 size_t candidate_sup)
const
140 return candidate_sup > top_sup * min_ext_factor_top
141 && candidate_sup > prev_sup * min_ext_factor_adj;
174 : m_counts(1 << (MER_LENGTH*2), 0)
177 virtual void AddExemplar(
const char* seq,
size_t len);
180 virtual string InferAdapterSeq()
const;
184 TWord x_FindAdapterSeed()
const;
188 TWord x_GetAdjacent(
TWord w,
bool right)
const;
191 void x_ExtendSeed(
TWords& words,
size_t top_count,
bool right)
const;
215 static const size_t NMERS10 = 1048576;
218 : m_len(max_pattern_len)
219 , m_counts(max_pattern_len * NMERS10, 0)
224 TWords::const_iterator begin,
225 TWords::const_iterator end);
228 string InferConsensus(
const SParams& params)
const;
233 return m_counts[pos * NMERS10 + word];
238 m_counts[pos * NMERS10 + word] += 1;
242 TWord x_NextWord(
size_t pos,
TWord word)
const;
258 virtual void AddExemplar(
const char* seq,
size_t len);
264 return m_cons5.InferConsensus(m_params) +
"-"
265 + m_cons3.InferConsensus(m_params);
271 static pair<size_t, size_t> s_FindAdapterStartPos(
295 void Init(
const char* seq,
size_t len);
323 typedef pair<TPositions::const_iterator,
354 SMatch x_CreateMatch(TPos q_start, TPos s_start)
const;
359 const char* query_seq,
360 const size_t query_len,
361 const bool direction,
362 const int match_score = 3,
363 const int mismatch_score = -2,
364 const int dropoff = 5)
const;
376 static void s_PermuteMismatches(
TWord w,
TWords& words);
381 return (w & ~(3 << (pos*2)))
389 static void s_IndexWord(
395 static void s_CoordSetToMapIndex(
CConsensusPattern calculates most frequent pattern from a set of (noisy) exemplars based on distribut...
CConsensusPattern(size_t max_pattern_len)
TCount x_GetCount(size_t pos, TWord word) const
void x_IncrCount(size_t pos, TWord word)
vector< TCount > m_counts
CConsensusPattern m_cons5
CConsensusPattern m_cons3
virtual string InferAdapterSeq() const
The returned string contains a '-'-delimited pair of IUPAC strings for the 5' and 3' adapters, respectively.
CPairedEndAdapterDetector(size_t max_len=100)
Find ungapped alignment of queries to a subject. The subject sequence is presumed to be fairly short (...
pair< TPositions::const_iterator, TPositions::const_iterator > TPosRange
map< TWord, TPosRange > TMapIndex
const string & GetSeq() const
set< TWordAndPos > TCoordSet
static TWord s_Put(TWord w, size_t pos, Uint1 letter)
Replace a letter in a 2-bit-coding word.
pair< TWord, TPos > TWordAndPos
Helpers for indexing.
pair< TPos, TPos > TRange
vector< TPos > TPositions
This class assembles the adapter sequence based on the distribution of word counts.
CUnpairedAdapterDetector()
virtual ~IAdapterDetector()
const SParams & GetParams() const
virtual string InferAdapterSeq() const =0
Returns the IUPAC sequence of the inferred adapter; empty if not found.
virtual void AddExemplar(const char *seq, size_t len)=0
Add the sequence of a spot from an SRA run (single or paired-end read).
static bool s_IsLowComplexity(TWord word)
With threshold of 0.9, 0.7% of all words are classified as such.
NAdapterSearch & operator=(const NAdapterSearch &other)
static bool s_IsHomopolymer(TWord w)
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
uint8_t Uint1
1-byte (8-bit) unsigned integer
int16_t Int2
2-byte (16-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define NCBI_XALGOSEQ_EXPORT
void s_Merge(SExpression &l, SExpression &r)
The NCBI C++/STL use hints.
Defines Limits for the types used in NCBI C/C++ toolkit.
static int match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
Represents single ungapped alignment.
An adapter sequence is presumed to occur at least min_support times in the input, and is overrepresen...
bool HaveContinuedSupport(size_t top_sup, size_t prev_sup, size_t candidate_sup) const
bool HaveInitialSupport(size_t top_candidate_sup, size_t non_candidate_sup) const
static Uint4 letter(char c)