NCBI C++ ToolKit
seq_masker_istat.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seq_masker_istat.hpp 98105 2022-09-29 00:28:26Z morgulis $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksandr Morgulis
27  *
28  * File Description:
29  * Definition for CSeqMaskerIstat class.
30  *
31  */
32 
33 #ifndef C_SEQ_MASKER_ISTAT_H
34 #define C_SEQ_MASKER_ISTAT_H
35 
36 #include <corelib/ncbitype.h>
37 #include <corelib/ncbistr.hpp>
38 #include <corelib/ncbiobj.hpp>
39 #include <corelib/version_api.hpp>
40 
44 
45 #include <memory>
46 
48 
49 /**
50  **\brief Defines an interface for accessing the unit counts information.
51  **/
53 {
54 public:
55 
56  /**
57  \brief Structure containing information about optimization
58  parameters used.
59  */
61  {
62  /**
63  \brief Object constructor.
64  \param divisor initial value of the divisor_
65  \param cba initial value of cba_
66  */
67  optimization_data( Uint4 divisor, Uint4 * cba )
68  : divisor_( divisor/(8*sizeof( Uint4 )) ), cba_( cba )
69  {}
70 
71  Uint4 divisor_; /**< How many units are represented by one
72  4-byte word in cba_ array. */
73  Uint4 * cba_; /**< Bit array with zeroes where all corresponding
74  units have counts below t_extend. */
75  };
76 
77  /**
78  **\brief Object constructor.
79  **\param arg_threshold the value of t_threshold to use instead of
80  ** the one supplied in the unit counts file
81  **\param arg_textend the value of t_textend to use instead of
82  ** the one supplied in the unit counts file
83  **\param arg_max_count the value of t_high to use instead of
84  ** the one supplied in the unit counts file
85  **\param arg_use_max_count the count to use if the unit count is
86  ** greater than t_high
87  **\param arg_min_count the value of t_low to use instead of
88  ** the one supplied in the unit counts file
89  **\param arg_use_min_count the count to use if the unit count is
90  ** less than t_low
91  **/
92  explicit CSeqMaskerIstat( Uint4 arg_threshold,
93  Uint4 arg_textend,
94  Uint4 arg_max_count,
95  Uint4 arg_use_max_count,
96  Uint4 arg_min_count,
97  Uint4 arg_use_min_count )
98  : threshold( arg_threshold ),
99  textend( arg_textend ),
100  max_count( arg_max_count ),
101  use_max_count( arg_use_max_count ),
102  min_count( arg_min_count ),
103  use_min_count( arg_use_min_count ),
104  ambig_unit( 0 ),
105  opt_data_( 0, 0 ),
106  fmt_gen_algo_ver( CSeqMaskerOstat::StatAlgoVersion )
107  { total_ = 0; }
108 
109  /**
110  **\brief Object destructor.
111  **/
112  virtual ~CSeqMaskerIstat() { if( opt_data_.cba_ ) delete[] opt_data_.cba_; }
113 
114  /**
115  **\brief Look up the count value of a given unit.
116  **\param unit the target unit
117  **\return the count of the unit
118  **/
119  Uint4 operator[]( Uint4 unit ) const
120  {
121  ++total_;
122  return at( unit );
123  }
124 
125  /**
126  **\brief Get the unit size.
127  **\return the unit size
128  **/
129  virtual Uint1 UnitSize() const = 0;
130 
131  /**
132  **\brief Get the value of the unit used to represent an ambiguity.
133  **\return ambiguity unit value
134  **/
136  { return ambig_unit; }
137 
138  /**
139  **\brief Get the value of T_threshold.
140  **\return T_threshold value
141  **/
142  Uint4 get_threshold() const { return threshold; }
143 
144  /**
145  **\brief Get the value of T_extend.
146  **\return T_extend value
147  **/
148  Uint4 get_textend() const { return textend; }
149 
150  /**
151  \brief Get the data structure optimization parameters.
152  \return pointer to optimization structure, if it is
153  initialized, NULL otherwise
154  */
156  { return opt_data_.cba_ == 0 ? 0 : &opt_data_; }
157 
158  /** Return the version of the algorithm used to generate counts */
160  return fmt_gen_algo_ver;
161  }
162 
163  /** Set the version of the algorithm used to generate counts */
165  fmt_gen_algo_ver = v;
166  }
167 
168  mutable Uint8 total_;
169 
170 protected:
171 
172  /**
173  **\brief Get the unit count of a given unit.
174  **
175  ** Derived classes should override this function
176  ** to provide access to the unit counts.
177  **
178  **\param unit the unit value being looked up
179  **\return count corresponding to unit
180  **/
181  virtual Uint4 at( Uint4 unit ) const = 0;
182 
183 public:
184 
185  /**
186  \brief Get the true count for an n-mer.
187 
188  \param unit the n-mer value
189 
190  \return n-mer count not corrected for t_low
191  and t_high values
192  **/
193  virtual Uint4 trueat( Uint4 unit ) const = 0;
194 
195 protected:
196 
197  /**
198  **\brief Set the value of T_threshold.
199  **\param arg_threshold new T_threshold value
200  **/
201  void set_threshold( Uint4 arg_threshold )
202  { threshold = arg_threshold; }
203 
204  /**
205  **\brief Set the value of T_extend.
206  **\param arg_textend new T_extend value
207  **/
208  void set_textend( Uint4 arg_textend )
209  { textend = arg_textend; }
210 
211 public:
212 
213  /**
214  **\brief Get the current value of T_high.
215  **\return current T_high value
216  **/
217  Uint4 get_max_count() const { return max_count; }
218 
219 protected:
220 
221  /**
222  **\brief Set the value of T_high.
223  **\param arg_max_count new T_high value
224  **/
225  void set_max_count( Uint4 arg_max_count )
226  { max_count = arg_max_count; }
227 
228 public:
229  /**
230  **\brief Get the count value for units with actual counts
231  ** above T_high.
232  **\return value to use for units with count > T_high
233  **/
234  Uint4 get_use_max_count() const { return use_max_count; }
235 
236 protected:
237  /**
238  **\brief Set the count value for units with actual counts
239  ** above T_high.
240  **\param arg_use_max_count new value to use for units with
241  ** counts > T_high
242  **/
243  void set_use_max_count( Uint4 arg_use_max_count )
244  { use_max_count = arg_use_max_count; }
245 
246 public:
247 
248  /**
249  **\brief Get the value of T_low.
250  **\return current T_low value
251  **/
252  Uint4 get_min_count() const { return min_count; }
253 
254 protected:
255 
256  /**
257  **\brief Set the value of T_low.
258  **\param arg_min_count new T_low value
259  **/
260  void set_min_count( Uint4 arg_min_count )
261  {
262  if( min_count != 0 && min_count < arg_min_count ) {
263  ERR_POST( Warning << "Requested value of t_low ("
264  << min_count
265  << ") is less than the one stored with the "
266  << "N-mer counts (" << arg_min_count << ")."
267  << "The value " << arg_min_count
268  << " will be used." );
269  min_count = arg_min_count;
270  }
271  else if( min_count == 0 ) {
272  min_count = arg_min_count;
273  }
274  }
275 
276 public:
277  /**
278  **\brief Get the count value for units with actual counts
279  ** below T_low.
280  **\return value to use for units with counts < T_low
281  **/
282  Uint4 get_use_min_count() const { return use_min_count; }
283 
284  /**\brief Return the metadata string. */
285  string const & GetMetaData() const { return metadata; }
286 
287  /**\brief Return the encoding of the source statistics file. */
288  string const & GetFmtEncoding() const { return fmt_encoding; }
289 
290  /**\brief Return the format version of the source statistics file. */
292  return fmt_version.get();
293  }
294 
295 protected:
296  /**
297  **\brief Set the count value for units with actual counts
298  ** below T_low.
299  **\param arg_use_min_count new value to use for units with
300  ** counts < T_low
301  **/
302  void set_use_min_count( Uint4 arg_use_min_count )
303  { use_min_count = arg_use_min_count; }
304 
305  /**
306  **\brief Set the unit size.
307  **\param arg_unit_size new unit size value
308  **/
309  void set_unit_size( Uint1 arg_unit_size )
310  { unit_size = arg_unit_size; }
311 
312  /**
313  **\brief Set the ambiguity unit value
314  **\param arg_ambig_unit new ambiguity unit
315  **/
317  const CSeqMaskerWindow::TUnit & arg_ambig_unit )
318  { ambig_unit = arg_ambig_unit; }
319 
320  /**
321  \brief Set optimization parameters.
322 
323  Constructor of the derived class is responsible for this.
324 
325  \param opt_data new optimization parameters
326  */
327  void set_optimization_data( const optimization_data & opt_data )
328  { opt_data_ = opt_data; }
329 
330 public:
331 
332  /** Set metadata string. */
333  void SetMetaData( string const & md ) { metadata = md; }
334 
335  void SetMaxCount( Uint4 mc ) { max_map_count = mc; }
336  Uint4 GetMaxCount() const { return max_map_count; }
337 
338  void SetCountMap( std::vector< double > const & cm )
339  { count_map = cm; }
340 
341  std::vector< double > const & GetCountMap() const { return count_map; }
342 
343 protected:
344 
345  /** Set the statistics file format encoding. */
346  void SetFmtEncoding( string const & e ) { fmt_encoding = e; }
347 
348  /** Set the statistics file format version. */
349  void SetFmtVersion( string const & name,
350  int major, int minor, int patch ) {
351  fmt_version.reset(
352  new CComponentVersionInfo( name, major, minor, patch ) );
353  }
354 
355 private:
356 
357  /**\name Provide reference semantics for CSeqMaskerOstat. */
358  /**@{*/
361  /**@}*/
362 
363  Uint4 threshold; /**<\internal T_threshold */
364  Uint4 textend; /**<\internal T_extend */
365  Uint4 max_count; /**<\internal T_high */
366  Uint4 use_max_count; /**<\internal Count to use for units with actual count > T_high. */
367  Uint4 min_count; /**<\internal T_low */
368  Uint4 use_min_count; /**<\internal Count to use for units with actual count < T_low. */
369  Uint1 unit_size; /**<\internal The unit size. */
370 
371  string metadata; /**<\internal Metadata string. */
372  string fmt_encoding; /**<\internal Encoding of the stats file from which the data was read. */
373 
374  /** Format version of the statistics file from which the data was read. */
375  std::unique_ptr< CComponentVersionInfo > fmt_version;
376 
377  CSeqMaskerWindow::TUnit ambig_unit; /**<\internal Unit value to represent ambiguities. */
378 
379  optimization_data opt_data_; /**<\internal Optimization parameters. */
380 
381  /** version of the algorithm used to generate counts */
383 
384  Uint4 max_map_count = 0;
385  std::vector< double > count_map;
386 };
387 
389 
390 #endif
CObject –.
Definition: ncbiobj.hpp:180
Defines an interface for accessing the unit counts information.
virtual ~CSeqMaskerIstat()
Object destructor.
void SetStatAlgoVersion(CSeqMaskerVersion const &v)
Set the version of the algorithm used to generate counts.
virtual Uint4 trueat(Uint4 unit) const =0
Get the true count for an n-mer.
void SetFmtEncoding(string const &e)
Set the statistics file format encoding.
void SetCountMap(std::vector< double > const &cm)
Uint4 get_min_count() const
Get the value of T_low.
void set_use_min_count(Uint4 arg_use_min_count)
Set the count value for units with actual counts below T_low.
std::unique_ptr< CComponentVersionInfo > fmt_version
Format version of the statistics file from which the data was read.
std::vector< double > const & GetCountMap() const
CSeqMaskerIstat(Uint4 arg_threshold, Uint4 arg_textend, Uint4 arg_max_count, Uint4 arg_use_max_count, Uint4 arg_min_count, Uint4 arg_use_min_count)
Object constructor.
void set_textend(Uint4 arg_textend)
Set the value of T_extend.
void set_threshold(Uint4 arg_threshold)
Set the value of T_threshold.
CSeqMaskerWindow::TUnit ambig_unit
virtual Uint4 at(Uint4 unit) const =0
Get the unit count of a given unit.
CSeqMaskerVersion fmt_gen_algo_ver
version of the algorithm used to generate counts
string const & GetMetaData() const
Return the metadata string.
void SetMetaData(string const &md)
Set metadata string.
Uint4 GetMaxCount() const
void set_use_max_count(Uint4 arg_use_max_count)
Set the count value for units with actual counts above T_high.
CSeqMaskerIstat(const CSeqMaskerIstat &)
Uint4 get_use_max_count() const
Get the count value for units with actual counts above T_high.
void SetMaxCount(Uint4 mc)
CSeqMaskerWindow::TUnit AmbigUnit() const
Get the value of the unit used to represent an ambiguity.
CSeqMaskerIstat & operator=(const CSeqMaskerIstat &)
std::vector< double > count_map
void set_optimization_data(const optimization_data &opt_data)
Set optimization parameters.
Uint4 operator[](Uint4 unit) const
Look up the count value of a given unit.
void SetFmtVersion(string const &name, int major, int minor, int patch)
Set the statistics file format version.
Uint4 get_textend() const
Get the value of T_extend.
void set_ambig_unit(const CSeqMaskerWindow::TUnit &arg_ambig_unit)
Set the ambiguity unit value.
Uint4 get_max_count() const
Get the current value of T_high.
CSeqMaskerVersion const & GetStatAlgoVersion() const
Return the version of the algorithm used to generate counts.
optimization_data opt_data_
virtual Uint1 UnitSize() const =0
Get the unit size.
Uint4 get_use_min_count() const
Get the count value for units with actual counts below T_low.
CComponentVersionInfo const * GetFmtVersion() const
Return the format version of the source statistics file.
const optimization_data * get_optimization_data() const
Get the data structure optimization parameters.
string const & GetFmtEncoding() const
Return the encoding of the source statistics file.
void set_unit_size(Uint1 arg_unit_size)
Set the unit size.
void set_min_count(Uint4 arg_min_count)
Set the value of T_low.
Uint4 get_threshold() const
Get the value of T_threshold.
void set_max_count(Uint4 arg_max_count)
Set the value of T_high.
Base class for computing and saving unit counts data.
Uint4 TUnit
Integer type used to represent units within a window.
#define md
Definition: compat-1.3.h:1989
The NCBI C++ standard methods for dealing with std::string.
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define CComponentVersionInfo
#define NCBI_XALGOWINMASK_EXPORT
Definition: ncbi_export.h:1033
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
Defines Limits for the types used in NCBI C/C++ toolkit.
Structure containing information about optimization parameters used.
optimization_data(Uint4 divisor, Uint4 *cba)
Object constructor.
Uint4 divisor_
How many units are represented by one 4-byte word in cba_ array.
Uint4 * cba_
Bit array with zeroes where all corresponding units have counts below t_extend.
Modified on Tue Apr 16 20:10:12 2024 by modify_doxy.py rev. 669887