NCBI C++ ToolKit
bgzf.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef SRA__READER__BAM__BGZF__HPP
2 #define SRA__READER__BAM__BGZF__HPP
3 /* $Id: bgzf.hpp 100043 2023-06-06 20:11:40Z vasilche $
4  * ===========================================================================
5  *
6  * PUBLIC DOMAIN NOTICE
7  * National Center for Biotechnology Information
8  *
9  * This software/database is a "United States Government Work" under the
10  * terms of the United States Copyright Act. It was written as part of
11  * the author's official duties as a United States Government employee and
12  * thus cannot be copyrighted. This software/database is freely available
13  * to the public for use. The National Library of Medicine and the U.S.
14  * Government have not placed any restriction on its use or reproduction.
15  *
16  * Although all reasonable efforts have been taken to ensure the accuracy
17  * and reliability of the software and data, the NLM and the U.S.
18  * Government do not and cannot warrant the performance or results that
19  * may be obtained by using this software or data. The NLM and the U.S.
20  * Government disclaim all warranties, express or implied, including
21  * warranties of performance, merchantability or fitness for any particular
22  * purpose.
23  *
24  * Please cite the author in any work or product based on this material.
25  *
26  * ===========================================================================
27  *
28  * Authors: Eugene Vasilchenko
29  *
30  * File Description:
31  * Access to BGZF files (block GZip file)
32  *
33  */
34 
35 #include <corelib/ncbistd.hpp>
36 #include <corelib/ncbifile.hpp>
37 #include <util/simple_buffer.hpp>
40 
43 
44 class CSeq_entry;
45 class CPagedFile;
46 class CPagedFilePage;
47 class CBGZFFile;
48 class CBGZFStream;
49 
50 class CPagedFilePage : public CObject
51 {
52 public:
53  typedef Uint8 TFilePos;
54 
57 
59  {
60  return m_FilePos.load(memory_order_acquire);
61  }
62  size_t GetPageSize() const
63  {
64  return m_Size;
65  }
66  const char* GetPagePtr() const
67  {
68  return m_Ptr;
69  }
70 
71  bool Contains(TFilePos file_pos) const
72  {
73  return (file_pos - GetFilePos()) < GetPageSize();
74  }
75 
76 protected:
77  friend class CPagedFile;
78 
79 private:
80  atomic<TFilePos> m_FilePos;
81  size_t m_Size;
82  const char* m_Ptr;
85 };
86 
87 
89 {
90 public:
92 
93  explicit
94  CPagedFile(const string& file_name);
95  ~CPagedFile();
96 
97 #define USE_RANGE_CACHE 1
98 #ifdef USE_RANGE_CACHE
100 #else
102 #endif
104 
105  // return page that contains the file position
106  TPage GetPage(TFilePos pos);
107 
108  pair<Uint8, double> GetReadStatistics() const;
109  void SetPreviousReadStatistics(const pair<Uint8, double>& stats);
110  // estimate best next page size to read using collected statistics
111  size_t GetNextPageSizePow2() const;
112 
113 private:
114  void x_AddReadStatistics(Uint8 bytes, double seconds);
115 
116  void x_ReadPage(CPagedFilePage& page, TFilePos file_pos, size_t size);
117 
119 
120  // three variants: direct file IO, memory mapped file, or VDB KFile
124 
125  // cache for loaded pages
127 
133 };
134 
135 
137 {
138 public:
139  enum EErrCode {
141  eFormatError, ///< includes decompression errors
142  eInvalidArg ///< invalid function argument
143  };
144  virtual const char* GetErrCodeString(void) const override;
146 };
147 
148 
149 struct SBamUtil {
150  // conversion of BAM bytes into larger values - ints and floats
151  // the source data have any alignment
152 
153  static Uint2 MakeUint2(const char* buf)
154  {
155  return Uint2(Uint1(buf[0]))|
156  (Uint2(Uint1(buf[1]))<<8);
157  }
158 
159  static Uint4 MakeUint4(const char* buf)
160  {
161  return Uint4(Uint1(buf[0]))|
162  (Uint4(Uint1(buf[1]))<<8)|
163  (Uint4(Uint1(buf[2]))<<16)|
164  (Uint4(Uint1(buf[3]))<<24);
165  }
166 
167  static Uint8 MakeUint8(const char* buf)
168  {
169  return Uint8(Uint1(buf[0]))|
170  (Uint8(Uint1(buf[1]))<<8)|
171  (Uint8(Uint1(buf[2]))<<16)|
172  (Uint8(Uint1(buf[3]))<<24)|
173  (Uint8(Uint1(buf[4]))<<32)|
174  (Uint8(Uint1(buf[5]))<<40)|
175  (Uint8(Uint1(buf[6]))<<48)|
176  (Uint8(Uint1(buf[7]))<<56);
177  }
178 
179  union UFloatUint4 {
180  float f;
182  };
183  static float MakeFloat(const char* buf)
184  {
185  UFloatUint4 u;
186  u.i = MakeUint4(buf);
187  return u.f;
188  }
189 };
190 
191 
192 class CBGZFPos
193 {
194 public:
195  typedef Uint8 TFileBlockPos; // position of block start in a file
196  typedef Uint4 TByteOffset; // position of byte within block
197  typedef Uint8 TVirtualPos; // virtual position, ordered
198 
199  static const Uint4 kMaxBlockSize = 1<<16;
200 
202  : m_VirtualPos(0)
203  {
204  }
205  explicit
207  : m_VirtualPos(pos)
208  {
209  }
210  CBGZFPos(TFileBlockPos block_pos, TByteOffset byte_offset)
211  : m_VirtualPos((block_pos<<16)+byte_offset)
212  {
213  }
214 
216  {
217  return m_VirtualPos;
218  }
219 
221  {
222  return m_VirtualPos >> 16;
223  }
225  {
226  return TByteOffset(m_VirtualPos&(0xffff));
227  }
228 
229  bool operator==(const CBGZFPos& b) const
230  {
231  return m_VirtualPos == b.m_VirtualPos;
232  }
233  bool operator!=(const CBGZFPos& b) const
234  {
235  return m_VirtualPos != b.m_VirtualPos;
236  }
237  bool operator<(const CBGZFPos& b) const
238  {
239  return m_VirtualPos < b.m_VirtualPos;
240  }
241  bool operator>(const CBGZFPos& b) const
242  {
243  return m_VirtualPos > b.m_VirtualPos;
244  }
245  bool operator<=(const CBGZFPos& b) const
246  {
247  return m_VirtualPos <= b.m_VirtualPos;
248  }
249  bool operator>=(const CBGZFPos& b) const
250  {
251  return m_VirtualPos >= b.m_VirtualPos;
252  }
253 
255  {
256  return CBGZFPos(TVirtualPos(-1));
257  }
258  bool IsInvalid() const
259  {
260  return GetVirtualPos() == TVirtualPos(-1);
261  }
262 
264 
265 private:
267 
268 };
270 ostream& operator<<(ostream& out, const CBGZFPos& p);
271 
272 typedef pair<CBGZFPos, CBGZFPos> CBGZFRange;
274 ostream& operator<<(ostream& out, const CBGZFRange& r);
275 
277 {
278 public:
279  typedef Uint8 TFileBlockPos; // position of block start in a file
280  typedef Uint4 TFileBlockSize; // size of block in a file
281  typedef Uint4 TDataSize; // size of uncompressed data
282  typedef Uint4 TCRC32;
283 
284  CBGZFBlock();
285  ~CBGZFBlock();
286 
287 
289  {
290  return m_FileBlockPos.load(memory_order_acquire);
291  }
293  {
294  return m_FileBlockSize;
295  }
297  {
298  return GetFileBlockPos() + GetFileBlockSize();
299  }
301  {
302  return m_DataSize;
303  }
304 
305  static const TFileBlockSize kMaxFileBlockSize = 1<<16;
306  static const TDataSize kMaxDataSize = 1<<16;
307 
308 protected:
309  friend class CBGZFFile;
310  friend class CBGZFStream;
311 
312 private:
313  atomic<TFileBlockPos> m_FileBlockPos;
317 };
318 
319 
321 {
322 public:
323  explicit
324  CBGZFFile(const string& file_name);
325  ~CBGZFFile();
326 
327  pair<Uint8, double> GetReadStatistics() const
328  {
329  return m_File->GetReadStatistics();
330  }
331  void SetPreviousReadStatistics(const pair<Uint8, double>& stats)
332  {
333  m_File->SetPreviousReadStatistics(stats);
334  }
335 
336  pair<Uint8, double> GetUncompressStatistics() const;
337 
338 protected:
339  friend class CBGZFStream;
340 
341  void x_AddUncompressStatistics(Uint8 bytes, double seconds);
342 
346 
347  TBlock GetBlock(TFileBlockPos file_pos,
348  CPagedFile::TPage& page,
350 
351  bool x_ReadBlock(CBGZFBlock& block,
352  TFileBlockPos file_pos,
353  CPagedFile::TPage& page,
355 
356 private:
359 
363 };
364 
365 
367 {
368 public:
369  CBGZFStream();
370  explicit
372  ~CBGZFStream();
373 
374  void Close();
375  void Open(CBGZFFile& file);
376 
378  {
379  return m_Block? m_Block->GetDataSize(): 0;
380  }
382  {
383  return m_Block? m_Block->GetFileBlockPos(): 0;
384  }
386  {
387  return m_Block? m_Block->GetNextFileBlockPos(): 0;
388  }
389  bool HaveBytesInBlock() const
390  {
391  return m_ReadPos < GetBlockDataSize();
392  }
393 
394  CBGZFPos GetPos() const
395  {
396  return CBGZFPos(GetBlockFilePos(), m_ReadPos);
397  }
399  {
400  return CBGZFPos(GetNextBlockFilePos(), 0);
401  }
403  {
404  if ( HaveBytesInBlock() ) {
405  return GetPos();
406  }
407  else {
408  return GetNextBlockPos();
409  }
410  }
412  {
413  return m_EndPos;
414  }
415  // seek to position to read till end_pos, or EOF if end_pos is invalid
416  void Seek(CBGZFPos pos, CBGZFPos end_pos = CBGZFPos::GetInvalid());
417 
418  // return non-zero number of available bytes in current decompressed buffer
419  size_t GetNextAvailableBytes();
420  // return true if there are more bytes before this position
422  {
423  if ( HaveBytesInBlock() ) {
424  return GetPos() < m_EndPos;
425  }
426  return HaveNextDataBlock();
427  }
428  // return true if there are more data blocks before this position
429  // current buffer must be read till the end
430  bool HaveNextDataBlock();
431 
432  // read up to count bytes into a buffer, may return smaller number
433  size_t Read(char* buf, size_t count);
434 
435  // read count bytes and return pointer to read data
436  // the pointer is either into decompressed buffer or into temporary buffer
437  // the returned pointer is guaranteed to be valid until next read or seek
438  const char* Read(size_t count);
439 
440 private:
441  bool x_NextBlock();
442 
443  const char* x_Read(CBGZFPos::TFileBlockPos file_pos, size_t size, char* buffer);
444 
445  // returns false if m_EndPos is invalid and EOF happened
447 
455 };
456 
457 
460 
461 #endif // SRA__READER__BAM__BGZF__HPP
ostream & operator<<(ostream &out, const CBGZFPos &p)
Definition: bgzf.cpp:337
pair< CBGZFPos, CBGZFPos > CBGZFRange
Definition: bgzf.hpp:272
Uint4 TFileBlockSize
Definition: bgzf.hpp:280
Uint8 TFileBlockPos
Definition: bgzf.hpp:279
Uint4 TCRC32
Definition: bgzf.hpp:282
TFileBlockPos GetFileBlockPos() const
Definition: bgzf.hpp:288
TFileBlockSize GetFileBlockSize() const
Definition: bgzf.hpp:292
AutoArray< char > m_Data
Definition: bgzf.hpp:316
static const TDataSize kMaxDataSize
Definition: bgzf.hpp:306
TDataSize m_DataSize
Definition: bgzf.hpp:315
atomic< TFileBlockPos > m_FileBlockPos
Definition: bgzf.hpp:313
static const TFileBlockSize kMaxFileBlockSize
Definition: bgzf.hpp:305
Uint4 TDataSize
Definition: bgzf.hpp:281
CBGZFBlock()
Definition: bgzf.cpp:349
TFileBlockSize m_FileBlockSize
Definition: bgzf.hpp:314
~CBGZFBlock()
Definition: bgzf.cpp:358
TFileBlockPos GetNextFileBlockPos() const
Definition: bgzf.hpp:296
TDataSize GetDataSize() const
Definition: bgzf.hpp:300
@ eFormatError
includes decompression errors
Definition: bgzf.hpp:141
NCBI_EXCEPTION_DEFAULT(CBGZFException, CException)
void SetPreviousReadStatistics(const pair< Uint8, double > &stats)
Definition: bgzf.hpp:331
TBlockCache::CLock TBlock
Definition: bgzf.hpp:345
pair< Uint8, double > GetReadStatistics() const
Definition: bgzf.hpp:327
CBGZFPos::TFileBlockPos TFileBlockPos
Definition: bgzf.hpp:343
CFastMutex m_StatMutex
Definition: bgzf.hpp:360
Uint8 m_TotalUncompressBytes
Definition: bgzf.hpp:361
CCacheWithLock< TFileBlockPos, CBGZFBlock > TBlockCache
Definition: bgzf.hpp:344
CRef< CPagedFile > m_File
Definition: bgzf.hpp:357
double m_TotalUncompressSeconds
Definition: bgzf.hpp:362
CRef< TBlockCache > m_BlockCache
Definition: bgzf.hpp:358
bool operator<=(const CBGZFPos &b) const
Definition: bgzf.hpp:245
DECLARE_OPERATOR_BOOL(m_VirtualPos !=0)
static const Uint4 kMaxBlockSize
Definition: bgzf.hpp:199
TByteOffset GetByteOffset() const
Definition: bgzf.hpp:224
bool operator!=(const CBGZFPos &b) const
Definition: bgzf.hpp:233
Uint8 TVirtualPos
Definition: bgzf.hpp:197
bool operator>(const CBGZFPos &b) const
Definition: bgzf.hpp:241
Uint4 TByteOffset
Definition: bgzf.hpp:196
bool operator>=(const CBGZFPos &b) const
Definition: bgzf.hpp:249
CBGZFPos()
Definition: bgzf.hpp:201
CBGZFPos(TVirtualPos pos)
Definition: bgzf.hpp:206
TVirtualPos GetVirtualPos() const
Definition: bgzf.hpp:215
TVirtualPos m_VirtualPos
Definition: bgzf.hpp:266
CBGZFPos(TFileBlockPos block_pos, TByteOffset byte_offset)
Definition: bgzf.hpp:210
Uint8 TFileBlockPos
Definition: bgzf.hpp:195
TFileBlockPos GetFileBlockPos() const
Definition: bgzf.hpp:220
static CBGZFPos GetInvalid()
Definition: bgzf.hpp:254
bool operator<(const CBGZFPos &b) const
Definition: bgzf.hpp:237
bool operator==(const CBGZFPos &b) const
Definition: bgzf.hpp:229
bool IsInvalid() const
Definition: bgzf.hpp:258
CBGZFPos GetNextBlockPos() const
Definition: bgzf.hpp:398
CBGZFPos GetSeekPos() const
Definition: bgzf.hpp:402
CPagedFile::TPage m_Page
Definition: bgzf.hpp:449
CBGZFBlock::TFileBlockPos GetNextBlockFilePos() const
Definition: bgzf.hpp:385
CBGZFBlock::TDataSize GetBlockDataSize() const
Definition: bgzf.hpp:377
bool x_ReadBlock(CBGZFPos::TFileBlockPos file_pos)
CBGZFPos GetEndPos() const
Definition: bgzf.hpp:411
bool HaveNextAvailableBytes()
Definition: bgzf.hpp:421
const char * x_Read(CBGZFPos::TFileBlockPos file_pos, size_t size, char *buffer)
CBGZFPos m_EndPos
Definition: bgzf.hpp:454
bool HaveBytesInBlock() const
Definition: bgzf.hpp:389
CBGZFPos::TByteOffset m_ReadPos
Definition: bgzf.hpp:451
CBGZFFile::TBlock m_Block
Definition: bgzf.hpp:450
CSimpleBufferT< char > m_OutReadBuffer
Definition: bgzf.hpp:453
CBGZFPos GetPos() const
Definition: bgzf.hpp:394
CBGZFBlock::TFileBlockPos GetBlockFilePos() const
Definition: bgzf.hpp:381
CSimpleBufferT< char > m_InReadBuffer
Definition: bgzf.hpp:452
CRef< CBGZFFile > m_File
Definition: bgzf.hpp:448
CFastMutex –.
Definition: ncbimtx.hpp:667
Class for support low level input/output for files.
Definition: ncbifile.hpp:3476
CMemoryFileMap –.
Definition: ncbifile.hpp:2669
CObject –.
Definition: ncbiobj.hpp:180
size_t GetPageSize() const
Definition: bgzf.hpp:62
size_t m_Size
Definition: bgzf.hpp:81
atomic< TFilePos > m_FilePos
Definition: bgzf.hpp:80
const char * GetPagePtr() const
Definition: bgzf.hpp:66
~CPagedFilePage()
Definition: bgzf.cpp:104
TFilePos GetFilePos() const
Definition: bgzf.hpp:58
CMemoryFileMap * m_MemFile
Definition: bgzf.hpp:84
const char * m_Ptr
Definition: bgzf.hpp:82
CSimpleBufferT< char > m_Buffer
Definition: bgzf.hpp:83
Uint8 TFilePos
Definition: bgzf.hpp:53
bool Contains(TFilePos file_pos) const
Definition: bgzf.hpp:71
CPagedFilePage()
Definition: bgzf.cpp:95
double m_PreviousReadSeconds
Definition: bgzf.hpp:132
CFastMutex m_StatMutex
Definition: bgzf.hpp:128
CFastMutex m_Mutex
Definition: bgzf.hpp:118
CBinaryRangeCacheWithLock< TFilePos, CPagedFilePage > TPageCache
Definition: bgzf.hpp:99
CFileIO m_File
Definition: bgzf.hpp:121
AutoPtr< CMemoryFileMap > m_MemFile
Definition: bgzf.hpp:122
CPagedFilePage::TFilePos TFilePos
Definition: bgzf.hpp:91
CBamVDBFile m_VDBFile
Definition: bgzf.hpp:123
Uint8 m_TotalReadBytes
Definition: bgzf.hpp:129
Uint8 m_PreviousReadBytes
Definition: bgzf.hpp:131
double m_TotalReadSeconds
Definition: bgzf.hpp:130
TPageCache::CLock TPage
Definition: bgzf.hpp:103
CRef< TPageCache > m_PageCache
Definition: bgzf.hpp:126
CRef –.
Definition: ncbiobj.hpp:618
Definition: Seq_entry.hpp:56
Include a standard set of the NCBI C++ Toolkit most basic headers.
const char * file_name[]
std::ofstream out("events_result.xml")
main entry point for tests
EErrCode
Error types that an application can generate.
Definition: ncbiexpt.hpp:884
void Read(CObjectIStream &in, TObjectPtr object, const CTypeRef &type)
Definition: serial.cpp:60
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
uint16_t Uint2
2-byte (16-bit) unsigned integer
Definition: ncbitype.h:101
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define NCBI_BAMREAD_EXPORT
Definition: ncbi_export.h:1235
FILE * file
char * buf
const struct ncbi::grid::netcache::search::fields::SIZE size
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
#define count
static uint8_t * buffer
Definition: pcre2test.c:1016
static Uint8 MakeUint8(const char *buf)
Definition: bgzf.hpp:167
static Uint4 MakeUint4(const char *buf)
Definition: bgzf.hpp:159
static Uint2 MakeUint2(const char *buf)
Definition: bgzf.hpp:153
static float MakeFloat(const char *buf)
Definition: bgzf.hpp:183
Modified on Fri Sep 20 14:57:26 2024 by modify_doxy.py rev. 669887