NCBI C++ ToolKit
win_mask_gen_counts.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: win_mask_gen_counts.hpp 98105 2022-09-29 00:28:26Z morgulis $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksandr Morgulis
27  *
28  * File Description:
29  * Header file for CWinMaskCountsGenerator class.
30  *
31  */
32 
33 #ifndef C_WIN_MASK_COUNTS_GENERATOR_H
34 #define C_WIN_MASK_COUNTS_GENERATOR_H
35 
36 #include <string>
37 #include <vector>
38 
39 #include <corelib/ncbitype.h>
40 #include <corelib/ncbistre.hpp>
41 
42 #include <objmgr/bioseq_ci.hpp>
44 #include <objmgr/scope.hpp>
49 
52 // #include "win_mask_config.hpp"
53 
55 
56 /**
57  **\brief This class encapsulates the n-mer frequency counts generation
58  ** functionality of winmasker.
59  **
60  **/
62 {
63 public:
64 
65  /**\brief Exceptions that CWinMaskCountsGenerator may throw.
66  */
68  {
69  public:
70 
71  /**\brief Error codes.
72  */
73  enum EErrCode
74  {
75  eNullGenome /**< Genome has 0 size. */
76  };
77 
78  /**\brief Return description string corresponding to an error code.
79  \return error string
80  */
81  virtual const char * GetErrCodeString() const override;
82 
84  };
85 
86  /**
87  **\brief Constructor.
88  **
89  ** Creates an instance based on configuration parameters.
90  **
91  **\param input input file name or a name of the file containing
92  ** a list of input files (one per line) depending
93  ** on the value of use_list parameter
94  **\param output name of the output file (empty means standard
95  ** output)
96  **\param infmt input format
97  **\param sformat counts format
98  **\param th string describing 4 percentage values (comma separated)
99  ** used to compute winmask score thresholds
100  **\param mem_avail memory (in megabytes) available to the function
101  **\param unit_size n-mer size (value of n)
102  **\param min_count do not consider n-mers with counts less than
103  ** the value of this parameter
104  **\param max_count maximum n-mer count to consider in winmask
105  ** thresholds computations
106  **\param check_duplicates true if input checking for duplicates is
107  ** requested; false otherwise
108  **\param use_list true if input file contains the list of fasta
109  ** file names; false if input is the name of the
110  ** fasta file itself
111  **\param ids set of ids to consider
112  **\param exclude_ids set of ids to ignore
113  **\param use_ba use bit array optimization for optimized binary
114  ** unit counts format
115  **\param metadata the metadata string
116  **\param min_pct min score as percentage of counts
117  **\param extend_pct interval extension score as percentage of counts
118  **\param thres_pct masking threshold score as percentage of counts
119  **\param max_pct max score as percentage of counts
120  **
121  **/
122  CWinMaskCountsGenerator( const string & input,
123  const string & output,
124  const string & infmt,
125  const string & sformat,
126  const string & th,
127  Uint4 mem_avail,
128  Uint1 unit_size,
129  Uint8 genome_size,
130  Uint4 min_count,
131  Uint4 max_count,
132  bool check_duplicates,
133  bool use_list,
134  const CWinMaskUtil::CIdSet * ids,
135  const CWinMaskUtil::CIdSet * exclude_ids,
136  bool use_ba,
137  string const & metadata,
138  double min_pct = -1.0,
139  double extend_pct = -1.0,
140  double thres_pct = -1.0,
141  double max_pct = -1.0 );
142 
143  /**
144  **\brief Constructor.
145  **
146  ** Creates an instance based on configuration parameters.
147  **
148  **\param input input file name or a name of the file containing
149  ** a list of input files (one per line) depending
150  ** on the value of use_list parameter
151  **\param os the output stream
152  **\param infmt input format
153  **\param sformat counts format
154  **\param th string describing 4 percentage values (comma separated)
155  ** used to compute winmask score thresholds
156  **\param mem_avail memory (in megabytes) available to the function
157  **\param unit_size n-mer size (value of n)
158  **\param min_count do not consider n-mers with counts less than
159  ** the value of this parameter
160  **\param max_count maximum n-mer count to consider in winmask
161  ** thresholds computations
162  **\param check_duplicates true if input checking for duplicates is
163  ** requested; false otherwise
164  **\param use_list true if input file contains the list of fasta
165  ** file names; false if input is the name of the
166  ** fasta file itself
167  **\param ids set of ids to consider
168  **\param exclude_ids set of ids to ignore
169  **\param use_ba use bit array optimization for optimized binary
170  ** unit counts format
171  **\param metadata the metadata string
172  **\param min_pct min score as percentage of counts
173  **\param extend_pct interval extension score as percentage of counts
174  **\param thres_pct masking threshold score as percentage of counts
175  **\param max_pct max score as percentage of counts
176  **
177  **/
178  CWinMaskCountsGenerator( const string & input,
179  CNcbiOstream & os,
180  const string & infmt,
181  const string & sformat,
182  const string & th,
183  Uint4 mem_avail,
184  Uint1 unit_size,
185  Uint8 genome_size,
186  Uint4 min_count,
187  Uint4 max_count,
188  bool check_duplicates,
189  bool use_list,
190  const CWinMaskUtil::CIdSet * ids,
191  const CWinMaskUtil::CIdSet * exclude_ids,
192  bool use_ba,
193  string const & metadata,
194  double min_pct = -1.0,
195  double extend_pct = -1.0,
196  double thres_pct = -1.0,
197  double max_pct = -1.0 );
198 
199  /**
200  **\brief Object destructor.
201  **
202  **/
204 
205  /**
206  **\brief This function does the actual n-mer counting.
207  **
208  ** Determines the prefix length based on the available memory and
209  ** calls process for each prefix to compute partial counts.
210  **
211  **/
212  void operator()();
213 
214 private:
215 
216  /**\internal
217  **\brief Compute n-mer frequency counts for a given prefix.
218  **
219  **\param prefix the prefix string
220  **\param prefix_size the prefix length in base pairs
221  **\param input list of input fasta files
222  **
223  **/
224  void process( Uint4 prefix, Uint1 prefix_size,
225  const vector< string > & input,
226  bool do_output );
227 
228  /**\internal
229  **\brief Return the total length of all sequences in a
230  ** fasta file.
231  **
232  **\param fname FASTA file name
233  **\return combined length of all sequences in fname
234  **
235  **/
236  Uint8 fastalen( const string & fname ) const;
237 
238  string input; /**<\internal input file (or list of input files) */
239  CRef< CSeqMaskerOstat > ustat; /**<\internal object used to output the unit counts statistics */
240  Uint8 max_mem; /**<\internal available memory in bytes */
241  Uint4 unit_size; /**<\internal n-mer length in base pairs */
242  Uint8 genome_size; /**<\internal genome size in bases */
243  Uint4 min_count; /**<\internal minimal n-mer count to consider */
244  Uint4 max_count; /**<\internal maximal n-mer count to consider for thresholds computations */
245  Uint4 t_high; /**<\internal maximal n-mer count to consider */
246  bool has_min_count; /**<\internal true iff -t_low was given on command line */
247  bool no_extra_pass; /**<\internal true iff -t_low and -t_high were given on command line */
248  bool check_duplicates; /**<\internal whether to check input for duplicates */
249  bool use_list; /**<\internal whether input is a fasta file or a file list */
250 
251  Uint4 total_ecodes; /**<\internal total number of different n-mers found */
252  vector< Uint4 > score_counts; /**<\internal counts table for each suffix */
253  double th[4]; /**<\internal percentages used to determine threshold scores */
254 
255  const CWinMaskUtil::CIdSet * ids; /**<\internal set of ids to process */
256  const CWinMaskUtil::CIdSet * exclude_ids; /**<\internal set of ids to ignore */
257 
258  string infmt; /**<\internal input format */
259 };
260 
262 
263 #endif
Exceptions that CWinMaskCountsGenerator may throw.
NCBI_EXCEPTION_DEFAULT(GenCountsException, CException)
This class encapsulates the n-mer frequency counts generation functionality of winmasker.
CRef< CSeqMaskerOstat > ustat
const CWinMaskUtil::CIdSet * ids
const CWinMaskUtil::CIdSet * exclude_ids
Base class for sets of seq_id representations used with -ids and -exclude-ids options.
static SQLCHAR output[256]
Definition: print.c:5
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
#define NCBI_XALGOWINMASK_EXPORT
Definition: ncbi_export.h:1033
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n th
static int input()
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
Defines Limits for the types used in NCBI C/C++ toolkit.
The Object manager core.
static const char * prefix[]
Definition: pcregrep.c:405
Modified on Tue Apr 16 20:10:27 2024 by modify_doxy.py rev. 669887