NCBI C++ ToolKit
adapter_search.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: adapter_search.hpp 60914 2013-12-12 15:38:06Z astashya $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  */
27 
28 #ifndef ALGO_SEQUENCE___ADAPTE_SEARCH__HPP
29 #define ALGO_SEQUENCE___ADAPTE_SEARCH__HPP
30 
31 #include <corelib/ncbitype.h>
32 #include <corelib/ncbistl.hpp>
33 
34 #include <vector>
35 #include <set>
36 #include <map>
37 
39 
40 class NCBI_XALGOSEQ_EXPORT NAdapterSearch // used as namespace
41 {
42 private:
43  /// Will use 24-bit 12-mers as words
44  static const size_t MER_LENGTH = 12;
45  typedef Uint4 TWord;
46  typedef vector<TWord> TWords;
47 
48  typedef Uint4 TCount;
49  typedef vector<TCount> TCounts;
50 
51  /// Translate IUPAC sequence to a sequence of overlapping words
52  /// (A,C,G,T,*) <-> (0,1,2,3,0)
53  static void s_Translate(
54  const char* const iupac_na,
55  size_t len,
56  bool apply_revcomp,
57  TWords& words);
58 
59  /// Convert word to IUPAC string [ACGT]{MER_LENGTH}
60  static string s_AsIUPAC(
61  TWord w,
62  size_t mer_size = MER_LENGTH);
63 
64  /// Convert sequence of overlapping words to an IUPAC string
65  /// (inverse of Translate)
66  static string s_AsIUPAC(
67  const TWords& words,
68  size_t mer_size = MER_LENGTH);
69 
70  /// Complexity is a value between 0 (for a homopolymer) and 1
71  /// (max complexity). Average for all words is 0.984375
72  /// Based on sum of squares of trinucleotide counts
73  static double s_GetWordComplexity(TWord word);
74 
75  /// With threshold of 0.9, 0.7% of all words are classified as such
76  static bool s_IsLowComplexity(TWord word)
77  {
78  return s_GetWordComplexity(word) < 0.9;
79  }
80 
81  static bool s_IsHomopolymer(TWord w)
82  {
83  return w == 0 || w == 0x555555 || w == 0xAAAAAA || w == 0xFFFFFF;
84  }
85 
87  {}
88 
90  {
91  return *this;
92  }
93 
94 public:
95  ///////////////////////////////////////////////////////////////////////
96  //
98  {
99  public:
100  /// An adapter sequence is presumed to occur at least
101  /// min_support times in the input, and is overrepresented by
102  /// a factor of at least min_init_factor relative to most frequent
103  /// biological words.
104  /// The seed word will be extended as long as it remains frequent enough
105  /// and does not drop off too sharply relative to adjacent word.
106  struct SParams
107  {
108  size_t min_support;
109  size_t top_n;
113 
115  : min_support(100)
116  , top_n(1000)
117  , min_init_factor(10.0f)
118  , min_ext_factor_adj(0.5f)
119  , min_ext_factor_top(0.2f)
120  {}
121 
122  // An originating word must be overrepresented by a factor of
123  // at least min_init_factor relative to background, and satisfy
124  // absolute minimum of min_support
125  bool HaveInitialSupport(size_t top_candidate_sup,
126  size_t non_candidate_sup) const
127  {
128  return top_candidate_sup > non_candidate_sup * min_init_factor
129  && top_candidate_sup > min_support;
130  }
131 
132  // Will extend sequence as long as support is within
133  // factor of min_ext_factor_top from the originating seed, and
134  // factor of min_ext_factor_adj from the neighbor
135  // (i.e. support does not drop too sharply)
136  bool HaveContinuedSupport(size_t top_sup,
137  size_t prev_sup,
138  size_t candidate_sup) const
139  {
140  return candidate_sup > top_sup * min_ext_factor_top
141  && candidate_sup > prev_sup * min_ext_factor_adj;
142  }
143  };
144 
146  {
147  return m_params;
148  }
149 
150  const SParams& GetParams() const
151  {
152  return m_params;
153  }
154 
155  /// Add sequence of a spot from an SRA run (single or paired-end read)
156  virtual void AddExemplar(const char* seq, size_t len) = 0;
157 
158  /// returns IUPAC seq of the inferred adapter seq; empty if not found.
159  virtual string InferAdapterSeq() const = 0;
160 
161  virtual ~IAdapterDetector()
162  {}
163 
164  protected:
166  };
167 
168 
169  /// This class assembles adapter sequence based on distribution of word counts
171  {
172  public:
174  : m_counts(1 << (MER_LENGTH*2), 0)
175  {}
176 
177  virtual void AddExemplar(const char* seq, size_t len);
178 
179  /// returns IUPAC seq of the inferred adapter seq; empty if not found.
180  virtual string InferAdapterSeq() const;
181 
182  private:
183  /// Find a word that is likely adapter-specific, if any.
184  TWord x_FindAdapterSeed() const;
185 
186  /// if right, return most frequent word whose 11-prefix = 11-suffix of w;
187  /// otherwise, return most frequent word whose 11-suffix = 11-prefix of w
188  TWord x_GetAdjacent(TWord w, bool right) const;
189 
190  /// Extend the seed one way, as long as it satisfies extension thresholds.
191  void x_ExtendSeed(TWords& words, size_t top_count, bool right) const;
192 
193  private:
194  TCounts m_counts; //counts of words in the seq; not position-specific.
195  };
196 
197 
199  {
200  public:
201 
202  /// CConsensusPattern calculates most frequent pattern from a
203  /// set of (noisy) exemplars based on distribution of 10-mer
204  /// frequencies at every position of the pattern.
205  ///
206  /// inputs | output
207  /// ----------|---------
208  /// ACGTACGT |
209  /// ATATATAT | ACGTACGT
210  /// ACGTACGT |
211  /// CGCGCGCG |
213  {
214  public:
215  static const size_t NMERS10 = 1048576; //total count of possible 10-mers: 4^10
216 
217  CConsensusPattern(size_t max_pattern_len)
218  : m_len(max_pattern_len)
219  , m_counts(max_pattern_len * NMERS10, 0)
220  {}
221 
222  /// Add Exemplar sequence translated to overlapping 12-mers
223  void AddExemplar(
224  TWords::const_iterator begin,
225  TWords::const_iterator end);
226 
227  /// Return IUPAC string of most common sequence
228  string InferConsensus(const SParams& params) const;
229 
230  private:
231  TCount x_GetCount(size_t pos, TWord word) const
232  {
233  return m_counts[pos * NMERS10 + word];
234  }
235 
236  void x_IncrCount(size_t pos, TWord word)
237  {
238  m_counts[pos * NMERS10 + word] += 1;
239  }
240 
241  /// Return most frequent word @ pos+1 following word@pos. (note: 10-mers here)
242  TWord x_NextWord(size_t pos, TWord word) const;
243 
244  size_t m_len;
245  vector<TCount> m_counts; //rectangular array of 10-mer word counts at given position.
246  //size: length of pattern * count of 10-mers (4^10)
247  };
248 
249  public:
250  CPairedEndAdapterDetector(size_t max_len = 100)
251  : m_cons5(max_len)
252  , m_cons3(max_len)
253  {}
254 
255  /// Add examplar paired-end spot
256  /// seq = fwd_read + rev_read; |fwd_read|=|rev_read|=len/2
257  /// Reads are in original orientation ->...<- (as in fastq-dump)
258  virtual void AddExemplar(const char* seq, size_t len);
259 
260  /// The returned string contains '-'-delimited pair of
261  /// IUPAC strings for 5' and 3' adapter respectively
262  virtual string InferAdapterSeq() const
263  {
264  return m_cons5.InferConsensus(m_params) + "-"
265  + m_cons3.InferConsensus(m_params);
266  }
267 
268  private:
269  /// Given a paired-end read in consistent orientations with a fragment-overlap
270  /// overlap, compute the putative positions of the adapter starts
271  static pair<size_t, size_t> s_FindAdapterStartPos(
272  const TWords& fwd_read,
273  const TWords& rev_read);
274 
277  };
278 
279 
280  /// Find ungapped alignment of queries to a subject
281  /// The subject sequence is presumed to be fairly short (~100 bases, few kb max)
283  {
284  public:
285  typedef Int2 TPos;
286 
288  {}
289 
290  /// IUPAC sequence of target sequence(s)
291  /// Multiple sequences may be concatenated and '-'-delimited
292  /// e.g. if we want to match to seq or its rev-comp,
293  /// the combined seq is seq+"-"+rev_comp(seq).
294  ///
295  void Init(const char* seq, size_t len);
296 
297  /// Represents single ungapped alignment
298  struct SMatch
299  {
300  TPos first; // query-pos
301  TPos second; // subject-pos
303 
304  SMatch() : first(NULLPOS), second(NULLPOS), len(0)
305  {}
306  };
307 
308  /// Return best ungapped alignment of at least MER_LENGTH
309  SMatch FindSingleBest(const char* query, size_t len) const;
310 
311  const string& GetSeq() const
312  {
313  return m_seq;
314  }
315 
316  typedef pair<TPos, TPos> TRange;
317  TRange GetSeqRange(TPos pos) const;
318 
319  private:
320 
321  typedef vector<TPos> TPositions;
322 
323  typedef pair<TPositions::const_iterator,
324  TPositions::const_iterator> TPosRange;
326 
327  typedef vector<TRange> TRanges;
328 
329  string m_seq; // sequence of '-'-delimited subseqs
330  TRanges m_subseq_ranges; // stores (begin, end) of each subseq
331 
332  // m_vec_index: word -> starting position in the target seq.
333  // If uninitialized, m_vec_index will contain NULLPOS
334  // If unique, value will be in vec_index;
335  // If non-unique, m_vec_index will contain MULTIPOS;
336  // values will be in m_positions, and m_map_index
337  // will contain w -> (begin, end) into m_positions
338  TPositions m_vec_index; // length: 2^bitsize(word)
339  TPositions m_positions; // length: count of all non-unique words
340  TMapIndex m_map_index; // length: count of distinct non-unique words
341 
342  // Special values in vec-index.
343  // MULTIPOS indicates that the multiple pos-values
344  // for this word reside in the map-index.
345  // Note: not using "static const TPos NULLPOS = -1; etc"
346  // due to compiler-specific issues
347  enum {
348  NULLPOS = -1,
349  MULTIPOS = -2
350  };
351 
352  private:
353  /// Create a match in diag/antidiag coordspace for a matched word.
354  SMatch x_CreateMatch(TPos q_start, TPos s_start) const;
355 
356  /// Ungapped extension (match is in normal coordinates)
357  void x_Extend(
358  SMatch& match,
359  const char* query_seq,
360  const size_t query_len,
361  const bool direction, //true iff forward
362  const int match_score = 3,
363  const int mismatch_score = -2,
364  const int dropoff = 5) const;
365 
366  static bool s_Merge(SMatch& m, const SMatch& m2);
367 
368  /// Return better of the two matches
369  /// (tradeoff between length and coverage)
370  /// Positions are presumed to be are in diag/antidiag space
371  const SMatch& x_GetBetterOf(const SMatch& a, const SMatch& b) const;
372 
373  //Permute all mismatches in a 24-bit 12-mer:
374  // - Allow maximum of two mismatches.
375  // - 3-prefix and suffix are required to match.
376  static void s_PermuteMismatches(TWord w, TWords& words);
377 
378  /// Replace a letter in a 2-bit-coding word
379  static TWord s_Put(TWord w, size_t pos, Uint1 letter)
380  {
381  return (w & ~(3 << (pos*2))) // zero-out target frame;
382  | (letter << (pos*2)); // put the letter in there
383  }
384 
385  /// Helpers for indexing
386  typedef pair<TWord, TPos> TWordAndPos;
387  typedef set<TWordAndPos> TCoordSet; // Will use set as multimap
388  // enforcing unique keyvals
389  static void s_IndexWord(
390  TWord w,
391  TPos pos,
392  TPositions& vec_index,
393  TCoordSet& coordset);
394 
395  static void s_CoordSetToMapIndex(
396  const TCoordSet& coordset,
397  TMapIndex& map_index,
398  TPositions& positions);
399 
400  };
401 };
402 
404 #endif
CConsensusPattern calculates most frequent pattern from a set of (noisy) exemplars based on distribut...
virtual string InferAdapterSeq() const
The returned string contains '-'-delimited pair of IUPAC strings for 5' and 3' adapter respectively.
Find ungapped alignment of queries to a subject The subject sequence is presumed to be fairly short (...
pair< TPositions::const_iterator, TPositions::const_iterator > TPosRange
static TWord s_Put(TWord w, size_t pos, Uint1 letter)
Replace a letter in a 2-bit-coding word.
pair< TWord, TPos > TWordAndPos
Helpers for indexing.
This class assembles adapter sequence based on distribution of word counts.
const SParams & GetParams() const
virtual string InferAdapterSeq() const =0
returns IUPAC seq of the inferred adapter seq; empty if not found.
virtual void AddExemplar(const char *seq, size_t len)=0
Add sequence of a spot from an SRA run (single or paired-end read)
static bool s_IsLowComplexity(TWord word)
With threshold of 0.9, 0.7% of all words are classified as such.
vector< TWord > TWords
NAdapterSearch & operator=(const NAdapterSearch &other)
vector< TCount > TCounts
static bool s_IsHomopolymer(TWord w)
Definition: set.hpp:45
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static void Init(void)
Definition: cursor6.c:76
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define NCBI_XALGOSEQ_EXPORT
Definition: ncbi_export.h:1017
int len
void s_Merge(SExpression &l, SExpression &r)
unsigned int a
Definition: ncbi_localip.c:102
The NCBI C++/STL use hints.
Defines Limits for the types used in NCBI C/C++ toolkit.
static int match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
Definition: pcre2_match.c:594
Represents single ungapped alignment.
An adapter sequence is presumed to occur at least min_support times in the input, and is overrepresen...
bool HaveContinuedSupport(size_t top_sup, size_t prev_sup, size_t candidate_sup) const
bool HaveInitialSupport(size_t top_candidate_sup, size_t non_candidate_sup) const
static string query
static Uint4 letter(char c)
Modified on Fri Sep 20 14:57:57 2024 by modify_doxy.py rev. 669887