NCBI C++ ToolKit
seq_masker.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seq_masker.hpp 98106 2022-09-29 01:34:59Z morgulis $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksandr Morgulis
27  *
28  * File Description:
29  * Header file for CSeqMasker class.
30  *
31  */
32 
33 #ifndef C_SEQ_MASKER_H
34 #define C_SEQ_MASKER_H
35 
36 #include <corelib/ncbitype.h>
37 #include <corelib/ncbistr.hpp>
38 #include <corelib/ncbiobj.hpp>
39 
43 
45 
46 class CSeqMaskerScore;
47 
48 /**
49  **\brief Main interface to window based masker functionality.
50  **
51  **/
53 {
54 public:
55 
56  /// Version of window masking algorithm.
58 
59  /**
60  **\brief Type representing a masked interval within a sequence.
61  **
62  ** If A is an object of type TMaskedInterval, then A.first is
63  ** the offset (starting from 0) of the beginning of the
64  ** interval; A.second is the offset of the end of the interval.
65  **
66  **/
67  typedef pair< TSeqPos, TSeqPos > TMaskedInterval;
68 
69  /**
70  **\brief A type representing the total of masking information
71  ** about a sequence.
72  **
73  **/
74  typedef vector< TMaskedInterval > TMaskList;
75 
76  /**
77  **\brief Represents different error situations that can occur
78  ** in the masking process.
79  **/
81  {
82  public:
83 
84  /**
85  **\brief Integer error codes.
86  **/
87  enum EErrCode
88  {
89  eLstatStreamIpenFail, /**< Error opening the length statistics file */
90  eLstatSyntax, /**< Error parsing the length statistics file */
91  eLstatParam, /**< Error deducing parameters from lstat or command line */
92  eScoreAllocFail, /**< Error allocating the score function object */
93  eScoreP3AllocFail, /**< Error allocating the score function object for merging pass */
94  eValidation /**< Inconsistent internal parameters */
95  };
96 
97  /**
98  **\brief Get the exception description string.
99  **
100  ** The method translates internal error code in the exception
101  ** object into a human readable explanation string.
102  **
103  **\return explanation string for the exception
104  **
105  **/
106  virtual const char * GetErrCodeString() const override;
107 
109  };
110 
111  /**
112  **\brief Merge together two result lists.
113  **
114  ** Used to merge results lists obtained from winmask and dust
115  ** algorithms.
116  **
117  **\param dest this list will contain the merged data
118  **\param src the other results list
119  **/
120  static void MergeMaskInfo( TMaskList * dest, const TMaskList * src );
121 
122  /**
123  **\brief Object constructor.
124  **
125  ** Parameters to the constructor determine the behaviour of the
126  ** window based masking procedure.
127  **
128  **\param lstat_name the name of the file containing length statistics
129  **\param arg_window_size the window size in bps
130  **\param arg_window_step the window step
131  **\param arg_unit_step the unit step
132  **\param arg_textend the score above which it is allowed to keep masking
133  **\param arg_cutoff_score the unit score triggering the masking
134  **\param arg_max_score maximum allowed unit score
135  **\param arg_min_score minimum allowed unit score
136  **\param arg_set_max_score score to use for units exceeding max_score
137  **\param arg_set_min_score score to use for units below min_score
138  **\param arg_merge_pass whether or not to perform an interval merging pass
139  **\param arg_merge_cutoff_score combined average score at which intervals
140  ** should be merged
141  **\param arg_abs_merge_cutoff_dist maximum distance between intervals
142  ** at which they can be merged
143  ** unconditionally
144  **\param arg_mean_merge_cutoff_dist maximum distance between intervals
145  ** at which they can be merged if they
146  ** satisfy arg_merge_cutoff_score
147  ** threshold
148  **\param arg_merge_unit_step unit step to use for interval merging
149  **\param arg_trigger determines which method to use to trigger masking
150  **\param tmin_count if arg_trigger is "min" then determines how many of
151  ** the units in a window should be above the score
152  ** threshold in order to trigger masking
153  **\param arg_discontig whether or not to use discontiguous units
154  **\param arg_pattern base pattern to form discontiguous units
155  **\param arg_use_ba use bit array optimization, if available
156  **
157  **/
158  CSeqMasker( const string & lstat_name,
159  Uint1 arg_window_size,
160  Uint4 arg_window_step,
161  Uint1 arg_unit_step,
162  Uint4 arg_textend,
163  Uint4 arg_cutoff_score,
164  Uint4 arg_max_score,
165  Uint4 arg_min_score,
166  Uint4 arg_set_max_score,
167  Uint4 arg_set_min_score,
168  bool arg_merge_pass,
169  Uint4 arg_merge_cutoff_score,
170  Uint4 arg_abs_merge_cutoff_dist,
171  Uint4 arg_mean_merge_cutoff_dist,
172  Uint1 arg_merge_unit_step,
173  const string & arg_trigger,
174  Uint1 tmin_count,
175  bool arg_discontig,
176  Uint4 arg_pattern,
177  bool arg_use_ba,
178  double min_pct = -1.0,
179  double extend_pct = -1.0,
180  double thres_pct = -1.0,
181  double max_pct = -1.0 );
182 
183  /**
184  **\brief Object destructor.
185  **
186  **/
187  ~CSeqMasker();
188 
189  /**
190  **\brief Sequence masking operator.
191  **
192  ** seq_masker objects are function objects. The main
193  ** processing is done by the () operator.
194  **
195  **\param data the original sequence data in iupacna format
196  **\return pointer to the list of masked intervals
197  **
198  **/
199  TMaskList * operator()( const objects::CSeqVector & data ) const;
200 
201 private:
202 
203  /**\internal
204  **\brief Internal representation of a sequence interval.
205  **/
206  struct mitem
207  {
208  Uint4 start; /**< Start of the interval */
209  Uint4 end; /**< End of the interval */
210  double avg; /**< Average score of the units in the interval */
211 
212  /**
213  **\brief Object constructor.
214  **
215  ** All the additional parameters are used by the constructor to compute
216  ** the value of avg.
217  **
218  **\param start the start of the interval
219  **\param end the end of the interval
220  **\param unit_size the unit size in bases
221  **\param data the original sequence data in iupacna format
222  **\param owner back pointer to the seq_masker instance
223  **
224  **/
225  mitem( Uint4 start, Uint4 end, Uint1 unit_size,
226  const objects::CSeqVector & data, const CSeqMasker & owner );
227  };
228 
229  friend struct CSeqMasker::mitem;
230 
231  /**\internal
232  **\brief Type used for storing intermediate masked and unmasked intervals.
233  **/
234  typedef list< mitem > TMList;
235 
236  /** \internal
237  \brief Final masking pass with lookups of the actual Nmer scores.
238  \param data the sequence data
239  \param start start masking at this location
240  \param end stop masking at this location
241  \return container with masked intervals
242  */
243  TMaskList * DoMask( const objects::CSeqVector & data,
244  TSeqPos start, TSeqPos end ) const;
245 
246  /**\internal
247  **\brief Computes the average score of an interval generated by
248  ** connecting two neighbouring masked intervals.
249  **
250  **\param mi points to the first masked interval
251  **\param umi points to the right unmasked neighbour of mi
252  **\param unit_size the unit size to use in computations
253  **\return the average score of an interval formed by
254  ** mi, umi, and mi+1
255  **
256  **/
257  double MergeAvg( TMList::iterator mi, const TMList::iterator & umi,
258  Uint4 unit_size ) const;
259 
260  /**\internal
261  **\brief Merge two neighbouring masked intervals.
262  **
263  ** Merges intervals mi and mi+1 into one with average of the
264  ** triple mi,umi,mi+1. Removes mi mi+1 from m and substitutes
265  ** mi with the merged interval. Removes umi from um.
266  **
267  **\param m list of intervals containing mi
268  **\param mi points to the first masked interval in the pair
269  ** that is being merged
270  **\param um list of intervals containing umi
271  **\param umi points to the right unmasked neighbour of mi
272  **
273  **/
274  void Merge( TMList & m, TMList::iterator mi,
275  TMList & um, TMList::iterator & umi ) const;
276 
277  /**\internal
278  **\brief Container of the unit score statistics.
279  **/
281 
282  /**\internal
283  **\brief Score function object to use for extensions.
284  **/
286 
287  /**\internal
288  **\brief Score function object to use for merging.
289  **/
291 
292  /**\internal
293  **\brief Score function object to use for triggering masking.
294  **/
296 
297  /**\internal
298  **\brief The window size in bases.
299  **/
301 
302  /**\internal
303  **\brief The window step.
304  **
305  ** Only windows that start at 0 mod window_step will be considered.
306  **
307  **/
309 
310  /**\internal
311  **\brief The unit step.
312  **
313  ** The distance between consecutive units within a window.
314  **
315  **/
317 
318  /**\internal
319  **\brief Flag indicating whether the merging pass is required.
320  **/
322 
323  /**\internal
324  **\brief Average score that triggers merging of neighbouring
325  ** masked intervals.
326  **/
328 
329  /**\internal
330  **\brief Neighbouring masked intervals that are closer to each other
331  ** than this distance are merged unconditionally.
332  **/
334 
335  /**\internal
336  **\brief Neighbouring masked intervals that are farther apart from
337  ** each other than this distance are never merged.
338  **/
340 
341  /**\internal
342  **\brief Unit step to use for interval merging.
343  **
344  ** This is the unit step value that should be used when
345  ** computing the unit score average over the total span of
346  ** two intervals that are candidates for merging.
347  **
348  **/
350 
351  /**\internal
352  **\brief Symbolic names for different masking triggering methods.
353  **/
354  enum
355  {
356  eTrigger_Mean = 0, /**< Using mean of unit scores in the window. */
357  eTrigger_Min /**< Using min score of k unit in the window. */
359 
360  /**\internal
361  **\brief Flag indicating the use of discontiguous units.
362  **/
363  bool discontig;
364 
365  /**\internal
366  **\brief Base pattern to form discontiguous units.
367  **/
369 };
370 
372 
373 #endif
Abstract base class for score function objects.
Represents different error situations that can occur in the masking process.
Definition: seq_masker.hpp:81
EErrCode
Integer error codes.
Definition: seq_masker.hpp:88
@ eLstatSyntax
Error parsing the length statistics file.
Definition: seq_masker.hpp:90
@ eLstatParam
Error deducing parameters from lstat or command line.
Definition: seq_masker.hpp:91
@ eScoreAllocFail
Error allocating the score function object.
Definition: seq_masker.hpp:92
@ eLstatStreamIpenFail
Error opening the length statistics file.
Definition: seq_masker.hpp:89
@ eScoreP3AllocFail
Error allocating the score function object for merging pass.
Definition: seq_masker.hpp:93
NCBI_EXCEPTION_DEFAULT(CSeqMaskerException, CException)
Main interface to window based masker functionality.
Definition: seq_masker.hpp:53
Uint1 unit_step
Definition: seq_masker.hpp:316
list< mitem > TMList
Definition: seq_masker.hpp:234
Uint1 merge_unit_step
Definition: seq_masker.hpp:349
CSeqMaskerScore * score
Definition: seq_masker.hpp:285
@ eTrigger_Mean
Using mean of unit scores in the window.
Definition: seq_masker.hpp:356
@ eTrigger_Min
Using min score of k units in the window.
Definition: seq_masker.hpp:357
Uint4 merge_cutoff_score
Definition: seq_masker.hpp:327
CSeqMaskerScore * trigger_score
Definition: seq_masker.hpp:295
Uint4 abs_merge_cutoff_dist
Definition: seq_masker.hpp:333
bool merge_pass
Definition: seq_masker.hpp:321
CSeqMaskerScore * score_p3
Definition: seq_masker.hpp:290
pair< TSeqPos, TSeqPos > TMaskedInterval
Type representing a masked interval within a sequence.
Definition: seq_masker.hpp:67
Uint4 mean_merge_cutoff_dist
Definition: seq_masker.hpp:339
TMaskList * DoMask(const objects::CSeqVector &data, TSeqPos start, TSeqPos end) const
Definition: seq_masker.cpp:170
vector< TMaskedInterval > TMaskList
A type representing the total of masking information about a sequence.
Definition: seq_masker.hpp:74
Uint4 pattern
Definition: seq_masker.hpp:368
double MergeAvg(TMList::iterator mi, const TMList::iterator &umi, Uint4 unit_size) const
Definition: seq_masker.cpp:412
Uint1 window_size
Definition: seq_masker.hpp:300
bool discontig
Definition: seq_masker.hpp:363
enum CSeqMasker::@32 trigger
static CSeqMaskerVersion AlgoVersion
Version of window masking algorithm.
Definition: seq_masker.hpp:57
Uint4 window_step
Definition: seq_masker.hpp:308
CRef< CSeqMaskerIstat > ustat
Definition: seq_masker.hpp:280
The NCBI C++ standard methods for dealing with std::string.
char data[12]
Definition: iconv.c:80
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define NCBI_XALGOWINMASK_EXPORT
Definition: ncbi_export.h:1033
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
Defines Limits for the types used in NCBI C/C++ toolkit.
Uint4 start
Start of the interval.
Definition: seq_masker.hpp:208
Uint4 end
End of the interval.
Definition: seq_masker.hpp:209
double avg
Average score of the units in the interval.
Definition: seq_masker.hpp:210
void Merge(wxMenu &menu_1, const wxMenu &menu_2)
merges all items from menu_2 into menu_1, preserving the structure if possible
Definition: wx_utils.cpp:579
Modified on Wed Jul 24 17:15:33 2024 by modify_doxy.py rev. 669887