NCBI C++ ToolKit
seq_masker_ostat_opt.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seq_masker_ostat_opt.hpp 84663 2018-11-27 18:22:00Z ucko $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksandr Morgulis
27  *
28  * File Description:
29  * Definition of CSeqMaskerOstatOpt class.
30  *
31  */
32 
33 #ifndef C_SEQ_MASKER_OSTAT_OPT_H
34 #define C_SEQ_MASKER_OSTAT_OPT_H
35 
36 #include <corelib/ncbistre.hpp>
37 
38 #include <vector>
39 
41 
43 
44 /**
45  **\brief Class responsible for collecting unit counts statistics and
46  ** representing them in an optimized hash-based format.
47  **/
49 {
50  public:
51 
52  /**
53  **\brief Exceptions that CSeqMaskerOstatOpt might throw.
54  **/
55  class Exception : public CException
56  {
57  public:
58 
59  enum EErrCode
60  {
61  eMemory /**< Memory allocation problem. */
62  };
63 
64  /**
65  **\brief Get a description string for this exception.
66  **\return C-style description string
67  **/
68  virtual const char * GetErrCodeString() const override;
69 
71  };
72 
73  /**
74  **\brief Object constructor.
75  **\param os output stream object, forwarded to CSeqMaskerOstream base
76  **\param sz requested size of the unit counts file in megabytes
77  **\param alloc flag to indicate that the stream was allocated
    **\param metadata metadata string; its exact use is not described in
    ** this header (TODO: confirm against the base class)
78  **/
79  explicit CSeqMaskerOstatOpt(
80  CNcbiOstream & os, Uint2 sz, bool alloc,
81  string const & metadata );
82 
83  /**
84  **\brief Object destructor.
85  **/
86  virtual ~CSeqMaskerOstatOpt() {}
87 
88  protected:
89 
90  /**
91  **\brief Parameters of the optimized data structure.
92  **/
93  struct params
94  {
95  Uint4 M; /**< Number of units that have a collision. */
96  Uint1 k; /**< The size of the hash key in bits. */
97  Uint1 roff; /**< Right offset of the hash key in bits. */
98  Uint1 bc; /**< Size of the collisions field in the table in bits. */
99  Uint4 * ht; /**< Hash table. */
100  Uint2 * vt; /**< Secondary counts table. */
101  Uint4 * cba; /**< Cache bit array. */
102  };
103 
104  /**
105  **\brief Dump the unit counts data to the output stream according
106  ** to the requested format.
107  **
108  ** Derived classes should override this function to format the data.
109  **
110  **\param p data structure parameters
111  **/
112  virtual void write_out( const params & p ) const = 0;
113 
114  /**
115  **\brief Get the unit size value in bases.
116  **\return unit size
117  **/
118  Uint1 UnitSize() const;
119 
120  /**
121  **\brief Get the values of the masking parameters.
122  **
123  ** The masking parameters are a vector of 4 integers representing
124  ** the values of T_low, T_extend, T_threshold, and T_high.
125  **
126  **\return constant reference to the vector of masking parameters
127  **/
128  const vector< Uint4 > & GetParams() const;
129 
130  /**
131  **\brief Set the unit size value
132  **\param us the unit size
133  **/
134  virtual void doSetUnitSize( Uint4 us );
135 
136  /**
137  **\brief Set count information for the given unit.
138  **\param unit the unit
139  **\param count the number of times the unit and its reverse complement
140  ** appear in the genome
141  **/
142  virtual void doSetUnitCount( Uint4 unit, Uint4 count );
143 
144  /**
145  **\brief Generate a hash function and dump the optimized unit counts
146  ** data to the output stream.
147  **/
148  virtual void doFinalize();
149 
150  private:
151 
152  /**\internal
153  **\brief Find the best set of hash parameters
154  **\param k the target hash key size
155  **\param max_coll [out] returns the maximum number of collisions
156  **\param M [out] returns the number of units with collisions
157  **\param ht pointer to the hash table area
158  **\return the right offset corresponding to the best hash function
159  **/
160  Uint1 findBestRoff( Uint1 k, Uint1 & max_coll, Uint4 & M, Uint4 * ht );
161 
162  /** \internal
163  \brief Create the cache bit array.
164 
165  Each bit of the array is 0 if all nmers in the corresponding group are
166  less than t_extend, and 1 otherwise. The size of the group is determined
167  dynamically from the nmer size.
168 
169  \param cba [OUT] pointer to the cache bit array
170  */
171  void createCacheBitArray( Uint4 ** cba );
172 
173  Uint2 size_requested; /**<\internal User specified upper limit of the data structure size. */
174  Uint1 unit_bit_size; /**<\internal Unit size in bits. */
175 
176  vector< Uint4 > units; /**<\internal Array of units with counts >= T_low. */
177  vector< Uint2 > counts; /**<\internal Array of corresponding counts. */
178 };
179 
181 
182 #endif
Exceptions that CSeqMaskerOstatOpt might throw.
NCBI_EXCEPTION_DEFAULT(Exception, CException)
Class responsible for collecting unit counts statistics and representing it in optimized hash-based f...
virtual void write_out(const params &p) const =0
Dump the unit counts data to the output stream according to the requested format.
virtual ~CSeqMaskerOstatOpt()
Object destructor.
Base class for computing and saving unit counts data.
virtual void doSetUnitSize(Uint4 us)
virtual void doFinalize()
virtual void doSetUnitCount(Uint4, Uint4)=0
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
uint16_t Uint2
2-byte (16-bit) unsigned integer
Definition: ncbitype.h:101
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
#define NCBI_XALGOWINMASK_EXPORT
Definition: ncbi_export.h:1033
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
#define M
#define count
static CRef< CUser_object > GetParams()
Parameters of the optimized data structure.
Uint1 roff
Right offset of the hash key in bits.
Uint1 bc
Size of the collisions field in the table in bits.
Uint2 * vt
Secondary counts table.
Uint1 k
The size of the hash key in bits.
Uint4 M
Number of units that have a collision.
Modified on Fri Sep 20 14:57:52 2024 by modify_doxy.py rev. 669887