NCBI C++ ToolKit
symdust.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: symdust.hpp 91946 2020-12-17 12:52:58Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksandr Morgulis
27  *
28  * File Description:
29  * Header file for CSymDustMasker class.
30  *
31  */
32 
33 #ifndef C_SYM_DUST_MASKER_HPP
34 #define C_SYM_DUST_MASKER_HPP
35 
36 #include <corelib/ncbitype.h>
37 #include <corelib/ncbistr.hpp>
38 #include <corelib/ncbiobj.hpp>
39 
42 #include <objmgr/seq_vector.hpp>
43 #include <util/random_gen.hpp>
44 
45 #include <iostream>
46 #include <vector>
47 #include <algorithm>
48 #include <iterator>
49 #include <memory>
50 #include <deque>
51 #include <list>
52 
53 
55 
56 /**
57  \brief Looks for low complexity parts of sequences according to the
58  symmetric version of DUST algorithm.
59  */
61 {
62  private:
63 
64  /** \internal
65  \brief Function class responsible for conversion from IUPACNA
66  to NCBI2NA coding.
67  */
69  {
70  /** \internal
71  \brief Operator performing the actual conversion.
72  \param r base letter in IOPACNA encoding
73  \return the same letter in NCBI2NA encoding
74  */
76  {
77  switch( r )
78  {
79  case 67: return 1;
80  case 71: return 2;
81  case 84: return 3;
82  case 78: return (m_Random.GetRand() & 0x3);
83  default: return 0;
84  }
85  }
87  };
88 
89  typedef objects::CSeqVector seq_t; /**<\internal Sequence type. */
90  typedef CIupac2Ncbi2na_converter convert_t; /**<\internal Converter type. */
91 
92  public:
93 
94  /**\brief Public sequence type. */
95  typedef seq_t sequence_type;
96  /**\brief Integer size type corresponding to sequence_type. */
97  typedef sequence_type::size_type size_type;
98  /**\brief Type respresenting an interval selected for masking. */
99  typedef std::pair< size_type, size_type > TMaskedInterval;
100  /**\brief Type representing a list of masked intervals. */
101  typedef std::vector< TMaskedInterval > TMaskList;
102 
103  static const Uint4 DEFAULT_LEVEL = 20; /**< Default value of score threshold. */
104  static const Uint4 DEFAULT_WINDOW = 64; /**< Default window size. */
105  static const Uint4 DEFAULT_LINKER = 1; /**< Default value of the longest distance between
106  consequtive masked intervals at which they
107  should be merged. */
108 
109  // These (up to constructor) are public to work around the bug in SUN C++ compiler.
110 
111  /** \internal
112  \brief Type representing a perfect interval.
113  */
114  struct perfect
115  {
116  TMaskedInterval bounds_; /**<\internal The actual interval. */
117  Uint4 score_; /**<\internal The score of the interval. */
118  size_type len_; /**<\internal The length of the interval. */
119 
120  /** \internal
121  \brief Object constructor.
122  \param start position of the left end
123  \param stop position of the right end
124  \param score the score
125  \param len the length
126  */
127  perfect( size_type start, size_type stop, Uint4 score, size_type len )
128  : bounds_( start, stop ), score_( score ), len_( len )
129  {}
130  };
131 
132  /**\brief Type representing a list of perfect intervals. */
133  typedef std::list< perfect > perfect_list_type;
134  /**\brief Table type to store score sum thresholds for each window length. */
135  typedef std::vector< Uint4 > thres_table_type;
136  /**\brief Type representing a triplet value. */
138 
139  /**\brief Selects the significant bits in triplet_type. */
140  static const triplet_type TRIPLET_MASK = 0x3F;
141 
142  /**
143  \brief Object constructor.
144  \param level score threshold
145  \param window max window size
146  \param linker max distance at which to merge consequtive masked intervals
147  */
148  CSymDustMasker( Uint4 level = DEFAULT_LEVEL,
149  size_type window
150  = static_cast< size_type >( DEFAULT_WINDOW ),
151  size_type linker
152  = static_cast< size_type >( DEFAULT_LINKER ) );
153 
154  /**
155  \brief Mask a sequence.
156  \param seq a sequence to mask
157  \return list of masked intervals
158  */
159  std::unique_ptr< TMaskList > operator()( const sequence_type & seq );
160 
161  /**
162  \brief Mask a part of the sequence.
163  \param seq the sequence to mask
164  \param start beginning position of the subsequence to mask
165  \param stop ending position of the subsequence to mask
166  \return list of masked intervals
167  */
168  std::unique_ptr< TMaskList > operator()( const sequence_type & seq,
169  size_type start, size_type stop );
170 
171  /**
172  \brief Mask a sequence and return result as a sequence of CSeq_loc
173  objects.
174  \param seq_id sequence id
175  \param seq the sequence
176  \param [out] vector of const (smart) references to CSeq_loc
177  */
178  void GetMaskedLocs(
179  objects::CSeq_id & seq_id,
180  const sequence_type & seq,
181  std::vector< CConstRef< objects::CSeq_loc > > & locs );
182 
183  /**\brief Mask a sequence and return result as a CPacked_seqint
184  instance.
185  \param seq_id sequence id
186  \param seq the sequence
187  */
188  CRef< objects::CPacked_seqint > GetMaskedInts(
189  objects::CSeq_id & seq_id, const sequence_type & seq );
190 
191  private:
192 
193  /**\internal Sequence iterator type. */
194  typedef sequence_type::const_iterator seq_citer_type;
195 
196  /** \internal
197  \brief Class representing the set of triplets in a window.
198  */
199  class triplets
200  {
201  public:
202 
203  /** \internal
204  \brief Object constructor.
205  \param window max window size
206  \param low_k max triplet multiplicity that guarantees that
207  the window score is not above the threshold
208  \param perfect_list [in/out] current list of perfect intervals
209  \param thresholds table of threshold values for each window size
210  */
211  triplets( size_type window,
212  Uint1 low_k,
213  perfect_list_type & perfect_list,
214  thres_table_type & thresholds );
215 
216  size_type start() const { return start_; } /**<\internal Get position of the first triplet. */
217  size_type stop() const { return stop_; } /**<\internal Get position of the last triplet. */
218  size_type size() const { return triplet_list_.size(); } /**<\internal Get the number of triplets. */
219 
220  /** \internal
221  \brief Update the list of perfect intervals with with suffixes
222  of the current window.
223  */
224  void find_perfect();
225 
226  /** \internal
227  \brief Shift the window one base to the right using triplet
228  value t.
229  \param t the triplet value to add to the right end of the
230  triplet list
231  \return false, if the new window contains a single triplet value;
232  true otherwise
233  */
234  bool shift_window( triplet_type t );
235 
236  /** \internal
237  \brief Shift a single triplet window using a triplet value t.
238  \param t the triplet value to add to the right end of the
239  triplet list
240  \return false, if the new window contains a single triplet value;
241  true otherwise
242  */
243  bool shift_high( triplet_type t );
244 
245  /** \internal
246  \brief Check the condition of Proposition 2 allowing
247  to skip the window processing.
248  \return true if the window requires suffix processing,
249  false otherwise
250  */
251  bool needs_processing() const
252  {
253  Uint4 count = stop_ - L;
254  return count < triplet_list_.size() &&
255  10*r_w > thresholds_[count];
256  }
257 
258  private:
259 
260  /**\internal Implementation type for triplets list. */
261  typedef std::deque< triplet_type > impl_type;
262  /**\internal Triplets list iterator type. */
263  typedef impl_type::const_iterator impl_citer_type;
264  /**\internal Type for triplet counts tables. */
265  typedef Uint1 counts_type[64];
266 
267  /** \internal
268  \brief Recompute the value of the running sum
269  and the triplet counts when a new triplet
270  is added.
271  \param r the running sum
272  \param c the triplet counts
273  \param t the new triplet value
274  */
276  Uint4 & r, counts_type & c, triplet_type t )
277  { r += c[t]; ++c[t]; }
278 
279  /** \internal
280  \brief Recompute the value of the running sum
281  and the triplet counts when a triplet
282  is removed.
283  \param r the running sum
284  \param c the triplet counts
285  \param t the triplet value being removed
286  */
288  Uint4 & r, counts_type & c, triplet_type t )
289  { --c[t]; r -= c[t]; }
290 
291  impl_type triplet_list_; /**<\internal The triplet list. */
292 
293  size_type start_; /**<\internal Position of the first triplet in the window. */
294  size_type stop_; /**<\internal Position of the last triplet in the window. */
295  size_type max_size_; /**<\internal Maximum window size. */
296 
297  Uint1 low_k_; /**<\internal Max triplet multiplicity that guarantees that
298  that the window score is not above the threshold. */
299  Uint4 L; /**<\internal Position of the start of the window suffix
300  corresponding to low_k_. */
301 
302  perfect_list_type & P; /**<\internal Current list of perfect subintervals. */
303  thres_table_type & thresholds_; /**<\internal Table containing thresholds for each
304  value of window length. */
305 
306  counts_type c_w; /**<\internal Table of triplet counts for the whole window. */
307  counts_type c_v; /**<\internal Table of triplet counts for the window suffix. */
308  Uint4 r_w; /**<\internal Running sum for the whole window. */
309  Uint4 r_v; /**<\internal Running sum for the window suffix. */
310  Uint4 num_diff; /**<\internal Number of different triplets values. */
311  };
312 
313  /** \internal
314  \brief Merge perfect intervals into the result list.
315  \param res the result list
316  \param w the list of perfect intervals
317  \param start the start position of the subsequence
318  */
319  void save_masked_regions(
320  TMaskList & res, size_type w, size_type start );
321 
322  Uint4 level_; /**<\internal Score threshold. */
323  size_type window_; /**<\internal Max window size. */
324  size_type linker_; /**<\internal Max distance at which consequtive masked intervals should be merged. */
325 
326  Uint1 low_k_; /**<\internal max triplet multiplicity guaranteeing not to exceed score threshold. */
327 
328  perfect_list_type P; /**<\internal List of perfect intervals within current window. */
329  thres_table_type thresholds_; /**<\internal Table containing score thresholds for each window size. */
330 
331  convert_t converter_; /**\internal IUPACNA to NCBI2NA converter object. */
332 };
333 
335 
336 #endif
CRandom::
Definition: random_gen.hpp:66
size_type stop() const
Definition: symdust.hpp:217
void rem_triplet_info(Uint4 &r, counts_type &c, triplet_type t)
Definition: symdust.hpp:287
void add_triplet_info(Uint4 &r, counts_type &c, triplet_type t)
Definition: symdust.hpp:275
thres_table_type & thresholds_
Definition: symdust.hpp:303
std::deque< triplet_type > impl_type
Definition: symdust.hpp:261
perfect_list_type & P
Definition: symdust.hpp:302
size_type size() const
Definition: symdust.hpp:218
size_type start() const
Definition: symdust.hpp:216
bool needs_processing() const
Definition: symdust.hpp:251
impl_type::const_iterator impl_citer_type
Definition: symdust.hpp:263
Looks for low complexity parts of sequences according to the symmetric version of DUST algorithm.
Definition: symdust.hpp:61
Uint1 triplet_type
Type representing a triplet value.
Definition: symdust.hpp:137
objects::CSeqVector seq_t
Definition: symdust.hpp:89
std::list< perfect > perfect_list_type
Type representing a list of perfect intervals.
Definition: symdust.hpp:133
perfect_list_type P
Definition: symdust.hpp:328
sequence_type::const_iterator seq_citer_type
Definition: symdust.hpp:194
std::pair< size_type, size_type > TMaskedInterval
Type respresenting an interval selected for masking.
Definition: symdust.hpp:99
std::vector< Uint4 > thres_table_type
Table type to store score sum thresholds for each window length.
Definition: symdust.hpp:135
convert_t converter_
Definition: symdust.hpp:331
size_type linker_
Definition: symdust.hpp:324
CIupac2Ncbi2na_converter convert_t
Definition: symdust.hpp:90
size_type window_
Definition: symdust.hpp:323
thres_table_type thresholds_
Definition: symdust.hpp:329
seq_t sequence_type
Public sequence type.
Definition: symdust.hpp:95
sequence_type::size_type size_type
Integer size type corresponding to sequence_type.
Definition: symdust.hpp:97
std::vector< TMaskedInterval > TMaskList
Type representing a list of masked intervals.
Definition: symdust.hpp:101
The NCBI C++ standard methods for dealing with std::string.
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define NCBI_XALGODUSTMASK_EXPORT
Definition: ncbi_export.h:1041
int len
EIPRangeType t
Definition: ncbi_localip.c:101
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
Defines Limits for the types used in NCBI C/C++ toolkit.
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
perfect(size_type start, size_type stop, Uint4 score, size_type len)
Definition: symdust.hpp:127
TMaskedInterval bounds_
Definition: symdust.hpp:116
Modified on Tue Apr 23 07:38:52 2024 by modify_doxy.py rev. 669887