NCBI C++ ToolKit
seqdboidlist.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef OBJTOOLS_READERS_SEQDB__SEQDBOIDLIST_HPP
2 #define OBJTOOLS_READERS_SEQDB__SEQDBOIDLIST_HPP
3 
4 /* $Id: seqdboidlist.hpp 100101 2023-06-15 14:10:29Z merezhuk $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author: Kevin Bealer
30  *
31  */
32 
33 /// @file seqdboidlist.hpp
34 /// The SeqDB oid filtering layer.
35 ///
36 /// Defines classes:
37 /// CSeqDBOIDList
38 ///
39 /// Implemented for: UNIX, MS-Windows
40 
43 #include "seqdbvolset.hpp"
44 #include "seqdbfilter.hpp"
45 #include "seqdbgilistset.hpp"
46 #include "seqdbbitset.hpp"
47 
49 
50 using namespace ncbi::objects;
51 
52 /// CSeqDBOIDList
53 ///
54 /// This class defines a set of included oids over the entire oid
55 /// range. The underlying implementation is a large bit map. If the
56 /// database has one volume, which uses an OID mask file, this object
57 /// will memory map that file and use it directly. Otherwise, an area
58 /// of memory will be allocated (one bit per OID), and the relevant
59 /// bits will be turned on in that space. This information may come
60 /// from memory mapped oid lists, or it may come from GI lists, which
61 /// are converted to OIDs using ISAM indices. Because of these two
62 /// modes of operation, care must be taken to ensure that the
63 /// placement of the bits exactly corresponds to the layout of the
64 /// memory mappable oid mask files.
65 
66 class CSeqDBOIDList : public CObject {
67 public:
68  /// A large enough type to span all OIDs.
69  typedef int TOID;
70 
71  /// A type which spans possible file offsets.
73 
74  /// Constructor.
75  ///
76  /// All processing to build the oid mask array is done in the
77  /// constructor. The volumes will be queried for information on
78  /// how many and what filter files to apply to each volume, and
79  /// these files will be used to build the oid bit array.
80  ///
81  /// @param atlas
82  /// The CSeqDBAtlas object.
83  /// @param volumes
84  /// The set of database volumes.
85  /// @param filters
86  /// The filtering to apply to the database volumes.
87  /// @param gi_list
88  /// The User GI List (if there is one).
89  /// @param neg_list
90  /// The Negative User GI List (if there is one).
91  /// @param locked
92  /// The lock holder object for this thread.
93  CSeqDBOIDList(CSeqDBAtlas & atlas,
94  const CSeqDBVolSet & volumes,
95  CSeqDB_FilterTree & filters,
96  CRef<CSeqDBGiList> & gi_list,
97  CRef<CSeqDBNegativeList> & neg_list,
98  CSeqDBLockHold & locked,
99  const CSeqDBLMDBSet & lmdb_set);
100 
101  /// Destructor.
102  ///
103  /// All resources will be freed (returned to the atlas). This
104  /// class uses the atlas to get the memory it needs, so the space
105  /// for the oid bit array is counted toward the memory bound.
106  ~CSeqDBOIDList();
107 
108  /// Find an included oid from the specified point.
109  ///
110  /// This call tests whether the specified oid is included in the
111  /// map. If it is, true is returned and the argument is not
112  /// modified. If it is not included, but a subsequent oid is, the
113  /// argument is adjusted to the next included oid, and true is
114  /// returned. If no oids exist from here to the end of the array,
115  /// false is returned.
116  ///
117  /// @param next_oid
118  /// The oid to check, and also the returned oid.
119  /// @return
120  /// True if an oid was found.
121  bool CheckOrFindOID(TOID & next_oid) const
122  {
123  size_t bit = next_oid; // size_t working copy handed to CheckOrFindBit; copied back into next_oid below
124  bool found = m_AllBits->CheckOrFindBit(bit);
125 
126  next_oid = static_cast<int>(bit); // written back unconditionally, i.e. also when found is false
127  _ASSERT(size_t(next_oid) == bit); // checks that narrowing the index to int lost no information
128 
129  return found;
130  }
131 
132  /// Deallocate the memory ranges owned by this object.
133  ///
134  /// This object may hold a lease on a file owned by the atlas. If
135  /// so, this method will release that memory. It should only be
136  /// called during destruction, since this class has no facilities
137  /// for reacquiring the memory lease.
138  void UnLease()
139  {
140  m_Lease.Clear(); // m_Lease is the lease on the mask file (if only one is used); nothing here reacquires it
141  }
142 
143  /// Dump debug information for this object
144  /// @sa CDebugDumpable
145  void DebugDump(CDebugDumpContext ddc, unsigned int depth) const;
146 
147 private:
148  /// Shorthand type to clarify code that iterates over memory.
149  typedef const unsigned char TCUC;
150 
151  /// Shorthand type to clarify code that iterates over memory.
152  typedef unsigned char TUC;
153 
154  /// Check if a bit is set.
155  ///
156  /// Returns true if the specified oid is included.
157  ///
158  /// @param oid
159  /// The oid to check.
160  /// @return
161  /// true if the oid is included.
162  inline bool x_IsSet(TOID oid) const;
163 
164  /// Build an oid mask in memory.
165  ///
166  /// This method allocates an oid bit array which spans the entire
167  /// oid range in use. It then maps all OID mask files and GI list
168  /// files. It copies the bit data from the oid mask files into
169  /// this array, translates all GI lists into OIDs and enables the
170  /// associated bits, and sets all bits to 1 for any "fully
171  /// included" volumes. This up-front work is intended to make
172  /// access to the data as fast as possible later on. In some
173  /// cases, this is not the most efficient way to do this. Faster
174  /// and more efficient storage methods are possible in cases where
175  /// very sparse GI lists are used. More efficient storage is
176  /// possible in cases where small masked databases are mixed with
177  /// large, "fully-in" volumes.
178  ///
179  /// @param volset
180  /// The set of volumes to build an oid mask for.
181  /// @param filters
182  /// The filtering to apply to the database volumes.
183  /// @param gi_list
184  /// Gi list object.
185  /// @param neg_list
186  /// Negative ID list object.
187  /// @param locked
188  /// The lock holder object for this thread.
189  void x_Setup(const CSeqDBVolSet & volset,
190  CSeqDB_FilterTree & filters,
191  CRef<CSeqDBGiList> & gi_list,
192  CRef<CSeqDBNegativeList> & neg_list,
193  CSeqDBLockHold & locked,
194  const CSeqDBLMDBSet & lmdb_set);
195 
196  /// Clear all bits in a range.
197  ///
198  /// This method turns off all bits in the specified oid range. It
199  /// is used after alias file processing to turn off bit ranges
200  /// that are masked by a user specified GI list.
201  ///
202  /// @param oid_start
203  /// The volume's starting oid.
204  /// @param oid_end
205  /// The volume's ending oid.
206  void x_ClearBitRange(int oid_start, int oid_end);
207 
208  /// Compute the oid mask bitset for a database volume.
209  ///
210  /// The filter tree will be specialized to this database volume and
211  /// the OID mask bitset for this volume will be computed.
212  ///
213  /// @param ft The filter tree for all volumes.
214  /// @param vol The volume entry object for this volume.
215  /// @param gis An object that manages the GI lists used here.
216  /// @param locked The lock holder object for this thread.
217  /// @return An OID bitset object.
219  x_ComputeFilters(const CSeqDB_FilterTree & ft,
220  const CSeqDBVolEntry & vol,
221  CSeqDBGiListSet & gis,
222  CSeqDBLockHold & locked,
223  bool isBlastDBv5);
224 
225  /// Load the named OID mask file into a bitset object.
226  ///
227  /// @param fn The filename from which to load the OID mask.
228  /// @param vol_start The first OID included in this volume.
229  /// @param vol_end The first OID after this volume.
231  /// @return An OID bitset object.
233  x_GetOidMask(const CSeqDB_Path & fn,
234  int vol_start,
235  int vol_end);
236 
237 
238  /// Load an ID (GI or TI) list file into a bitset object.
239  ///
240  /// @param ids A set of included GIs or TIs.
241  /// @param vol_start The first OID included in this volume.
242  /// @param vol_end The first OID after this volume.
243  /// @return An OID bitset object.
245  x_IdsToBitSet(const CSeqDBGiList & ids, int vol_start, int vol_end);
246 
247  /// Apply a user GI list to a volume.
248  ///
249  /// This method applies a user-specified filter to the OID list.
250  /// Unlike x_ApplyFilter, which turns on the bits of the filter,
251  /// this method turns OFF the excluded bits. It is therefore
252  /// an AND operation between the user filter and the (already
253  /// applied) alias file filters.
254  ///
255  /// @param gis
256  /// The user gi list to apply to the volumes.
259  void x_ApplyUserGiList(CSeqDBGiList & gis);
260 
261 
262  /// Apply a negative user GI list to a volume.
263  ///
264  /// This method applies a user-specified filter to the OID list.
265  /// It serves the same purpose for negative GI lists that
266  /// x_ApplyUserGiList serves for positive GI lists. The operation
267  /// performed here is an AND operation between the (already
268  /// applied) alias file filters and the negation of the user
269  /// filter.
270  ///
271  /// @param neg
272  /// The negative user gi list to apply to the volumes.
273  /// @param is_v5
274  /// True if db is v5
275  void x_ApplyNegativeList(CSeqDBNegativeList & neg, bool is_v5);
276 
277  bool x_ComputeFilters(const CSeqDBVolSet & volset,
278  const CSeqDB_FilterTree & filters,
279  const CSeqDBLMDBSet & lmdb_set,
280  CSeqDB_BitSet & filter_bit,
281  CRef<CSeqDBGiList> user_list,
282  CRef<CSeqDBNegativeList> neg_user_list);
283 
284  /// The memory management layer object.
286 
287  /// A memory lease which holds the mask file (if only one is used).
289 
290  /// The total number of OIDs represented in the bit set.
292 
293  /// An OID bit set covering all volumes.
295 };
296 
297 inline bool
299 {
300  _ASSERT(m_AllBits.NotEmpty());
301  return (oid < m_NumOIDs) && m_AllBits->GetBit(oid);
302 }
303 
305 
306 #endif // OBJTOOLS_READERS_SEQDB__SEQDBOIDLIST_HPP
307 
CObject –.
Definition: ncbiobj.hpp:180
CSeqDBAtlas class.
Definition: seqdbatlas.hpp:297
CNcbiStreamoff TIndx
The type used for file offsets.
Definition: seqdbatlas.hpp:301
CSeqDBGiListSet class.
CSeqDBGiList.
CSeqDBLMDBSet.
CSeqDBLockHold.
Definition: seqdbatlas.hpp:166
CSeqDBNegativeList.
CSeqDBOIDList.
int TOID
A large enough type to span all OIDs.
bool x_IsSet(TOID oid) const
Check if a bit is set.
unsigned char TUC
Shorthand type to clarify code that iterates over memory.
void UnLease()
Deallocate the memory ranges owned by this object.
CSeqDBFileMemMap m_Lease
A memory lease which holds the mask file (if only one is used).
CSeqDBAtlas & m_Atlas
The memory management layer object.
const unsigned char TCUC
Shorthand type to clarify code that iterates over memory.
CRef< CSeqDB_BitSet > m_AllBits
An OID bit set covering all volumes.
int m_NumOIDs
The total number of OIDs represented in the bit set.
bool CheckOrFindOID(TOID &next_oid) const
Find an included oid from the specified point.
CSeqDBAtlas::TIndx TIndx
A type which spans possible file offsets.
CSeqDBVolEntry.
Definition: seqdbvolset.hpp:59
CSeqDBVolSet.
Bit set class.
Definition: seqdbbitset.hpp:49
Tree of nodes describing filtering of database sequences.
CSeqDB_Path.
static unsigned char depth[2 *(256+1+29)+1]
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
Defines BLAST database access classes.
Implementation for the CSeqDB_BitSet class, a bit vector.
File access objects for CSeqDB.
Implementation for some assorted ID list filtering code.
Defines set of GI lists.
Manages a set of database volumes.
#define _ASSERT
Modified on Fri Sep 20 14:57:50 2024 by modify_doxy.py rev. 669887