NCBI C++ ToolKit
writedb_files.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef OBJTOOLS_WRITERS_WRITEDB__WRITEDB_FILES_HPP
2 #define OBJTOOLS_WRITERS_WRITEDB__WRITEDB_FILES_HPP
3 
4 /* $Id: writedb_files.hpp 96690 2022-04-28 11:08:55Z fongah2 $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author: Kevin Bealer
30  *
31  */
32 
33 /// @file writedb_files.hpp
34 /// Code for database files construction.
35 ///
36 /// Defines classes:
37 /// CWriteDBHeader
38 ///
39 /// Implemented for: UNIX, MS-Windows
40 
44 #include <objects/seq/seq__.hpp>
45 #include <corelib/ncbistre.hpp>
46 #include <corelib/ncbifile.hpp>
47 
49 
50 /// Import definitions from the objects namespace.
52 
53 /// CWriteDB_IndexFile class
54 ///
55 /// This manufactures blast database index files from input data.
56 
58 public:
59  // Setup and control
60 
61  /// Constructor.
62  ///
63  /// The filename is constructed from basename, extension, and
64  /// index, but might be changed if the RenameSingle() method is
65  /// called. If zero is specified for maximum file size, a default
66  /// size is provided by this class. The maximum file size is not
67  /// enforced by this class, instead each derived class must do its
68  /// own enforcement.
69  ///
70  /// @param basename Database base name, shared by all files. [in]
71  /// @param extension File name extension for this file. [in]
72  /// @param index Volume index used in filename. [in]
73  /// @param max_file_size File size limit (in bytes). [in]
74  /// @param always_create If true the file will be created now. [in]
75  CWriteDB_File(const string & basename,
76  const string & extension,
77  int index,
78  Uint8 max_file_size,
79  bool always_create);
80 
81  /// Create and open the file.
82  ///
83  /// This method must be called before the first time that data is
84  /// written to the file. If the constructor is passed 'true' for
85  /// always_create, this method will be called during construction.
86  /// It is an error to call this method more than once (including
87  /// via the constructor) or to not call it but to call Write. The
88  /// rationale for making this explicit is to permit some files to
89  /// be created optionally, such as ISAM files, which should only
90  /// be created if the corresponding ID types are found.
91  void Create();
92 
93  /// Write contents of a string to the file.
94  /// @param data Data to write.
95  /// @return File offset after write.
96  unsigned int Write(const CTempString & data);
97 
98  unsigned int Write(const char * data, int length);
99 
100  /// Write an Int4 (in bigendian order) to the file.
101  /// @param data Integer value to write.
102  /// @return File offset after write.
103  unsigned int WriteInt4(int data)
104  {
105  s_WriteInt4(m_RealFile, data); // emit the value to the underlying output stream
106  m_Offset += 4; // advance the tracked stream position by the four bytes just written
107  return m_Offset;
108  }
109 
110  /// Write an Int8 (in bigendian order) to the file.
111  /// @param data Integer value to write.
112  /// @return File offset after write.
113  unsigned int WriteInt8(Int8 data)
114  {
115  s_WriteInt8BE(m_RealFile, data); // helper takes a Uint8, so the signed value is converted implicitly
116  m_Offset += 8; // advance the tracked stream position by the eight bytes just written
117  return m_Offset;
118  }
119 
120  /// Write contents of a string to the file, appending a NUL.
121  /// @param data String to write.
122  /// @return File offset after write.
123  unsigned int WriteWithNull(const CTempString & data)
124  {
125  Write(data); // payload first; its returned offset is superseded by the next call
126  return Write(m_Nul); // then the one-character NUL string; the result is the offset after it
127  }
128 
129  /// Close the file, flushing any remaining data to disk.
130  void Close();
131 
132  /// Rename this file, omitting the volume index.
133  virtual void RenameSingle();
134 
135  virtual void RenameFileIndex(unsigned int num_digits);
136 
137  /// Construct the short name for a volume.
138  ///
139  /// Volume names consist of the database base name, ".", and the
140  /// volume index in decimal. The volume index is normally two
141  /// digits, but if more than 100 volumes are needed, the filename
142  /// will use three or more index digits as needed.
143  ///
144  /// @param base Base name to use.
145  /// @param index Volume index.
146  /// @return A short name.
147  static string MakeShortName(const string & base, int index);
148 
149  /// Get the current filename for this file.
150  ///
151  /// The filename is returned. The data returned by this method
152  /// reflects changes made by RenameSingle(), so it is probably
153  /// best to call it after that method has been called (if it will
154  /// be called).
155  ///
156  /// @return The filename.
157  const string & GetFilename() const
158  {
159  return m_Fname; // returned by reference to the member; no copy is made
160  }
161 
162 protected:
163  /// True if the file has already been opened.
164  bool m_Created;
165 
166  /// Underlying 'output file' type used here.
167  typedef ofstream TFile;
168 
169  /// For convenience, a string containing one NUL character.
170  string m_Nul; // init me
171 
172  /// The default value for max_file_size.
173  /// @return The max file size used if otherwise unspecified.
175  {
176  // 1 gb (marketing version) - 1; about a billion
177  return 1000*1000*1000 - 1;
178  }
179 
180  /// This should flush any unwritten data to disk.
181  ///
182  /// This method must be implemented by derived classes to flush
183  /// any unwritten data to disk. In the cases of sequence and
184  /// header files, it will normally do nothing, because such files
185  /// are written as the data is available. For index (pin/nin) and
186  /// ISAM files, this method does most of the disk I/O.
187  virtual void x_Flush() = 0;
188 
189  /// Build the filename for this file.
190  void x_MakeFileName();
191 
192  // Configuration
193 
194  string m_BaseName; ///< Database base name for all files.
195  string m_Extension; ///< File extension for this file.
196  int m_Index; ///< Volume index.
197  unsigned int m_Offset; ///< Stream position.
198  Uint8 m_MaxFileSize; ///< Maximum file size in bytes.
199 
200  // The file
201 
202  bool m_UseIndex; ///< True if filenames should use volume index.
203  string m_Fname; ///< Current filename for output file.
204  TFile m_RealFile; ///< Actual stream implementing the output file.
205 };
206 
207 // For index file format, see .cpp file.
208 
209 /// This class builds the volume index file (pin or nin).
211 public:
212  /// Constructor.
213  /// @param dbname Database base name.
214  /// @param protein True for protein volumes.
215  /// @param title Database title string.
216  /// @param date Timestamp of database construction start.
217  /// @param index Index of this volume.
218  /// @param max_file_size Maximum file size in bytes (or zero).
219  CWriteDB_IndexFile(const string & dbname,
220  bool protein,
221  const string & title,
222  const string & date,
223  int index,
224  Uint8 max_file_size,
226 
227  /// Returns true if another sequence can fit into the file.
228  bool CanFit()
229  {
230  _ASSERT(m_MaxFileSize > 1024UL); // also guarantees the unsigned subtraction below cannot wrap
231 
232  if (m_OIDs == 0) // nothing added yet: the first sequence is always accepted
233  return true;
234 
235  return m_DataSize < (m_MaxFileSize - 12UL); // 12 equals the largest per-sequence growth of m_DataSize (four-argument AddSequence)
236  }
237 
238  /// Add a sequence to a protein index file (pin).
239  ///
240  /// The index file does not need sequence data, so this method
241  /// only needs offsets of the data in other files.
242  ///
243  /// @param length Sequence length in letters.
244  /// @param hdr Length of binary ASN.1 header data.
245  /// @param seq Length in bytes of sequence data.
246  void AddSequence(int length, unsigned int hdr, unsigned int seq)
247  {
248  if (length > m_MaxLength) { // keep track of the longest sequence seen so far
249  m_MaxLength = length;
250  }
251 
252  m_OIDs++;
253  m_Letters += length;
254  m_DataSize += 8; // presumably one 4-byte entry each for m_Hdr and m_Seq - confirm against x_Flush
255 
256  m_Hdr.push_back(hdr);
257  m_Seq.push_back(seq);
258  }
259 
260  /// Add a sequence to a nucleotide index file (nin).
261  ///
262  /// The index file does not need sequence data, so this method
263  /// only needs offsets of the data in other files.
264  ///
265  /// @param length Sequence length in letters.
266  /// @param hdr Length of binary ASN.1 header data.
267  /// @param seq Length in bytes of packed sequence data.
268  /// @param amb Length in bytes of packed ambiguity data.
269  void AddSequence(int length, unsigned int hdr, unsigned int seq, unsigned int amb)
270  {
271  if (length > m_MaxLength) { // keep track of the longest sequence seen so far
272  m_MaxLength = length;
273  }
274 
275  m_OIDs++;
276  m_Letters += length;
277 
278  m_DataSize += 12; // presumably one 4-byte entry each for m_Hdr, m_Seq and m_Amb - confirm against x_Flush
279  m_Hdr.push_back(hdr);
280  m_Seq.push_back(amb); // Not a bug. NOTE(review): presumably amb is the end of this OID's ambiguity data, i.e. where the next sequence starts - confirm against caller
281  m_Amb.push_back(seq); // Also not a bug. NOTE(review): presumably seq is the end of the sequence data, i.e. where this OID's ambiguity data starts - confirm against caller
282  }
283 
284 private:
285  /// Compute index file overhead. This is the overhead used by all
286  /// fields of the index file, and does account for padding.
287  /// (version 5)
288  ///
289  /// @param T Title string.
290  /// @param lmdbName LMDB file name string.
291  /// @param D Create time string.
292  /// @return Combined size of all meta-data fields in nin/pin file.
293  int x_Overhead(const string & T, const string & lmdbName, const string & D);
294 
295  /// Compute index file overhead. This is the overhead used by all
296  /// fields of the index file, and does account for padding.
297  /// (version 4)
298  ///
299  /// @param T Title string.
300  /// @param D Create time string.
301  /// @return Combined size of all meta-data fields in nin/pin file.
302  int x_Overhead(const string & T, const string & D);
303 
304  /// Flush index data to disk.
305  virtual void x_Flush();
306 
307  /// Form name of LMDB database file.
308  const string x_MakeLmdbName();
309 
310  bool m_Protein; ///< True if this is a protein database.
311  string m_Title; ///< Title string for all database volumes.
312  string m_Date; ///< Database creation time stamp.
313  int m_OIDs; ///< OIDs added to database so far.
314  int m_Overhead; ///< Amount of file used by metadata.
315  Uint8 m_DataSize; ///< Required space for data once written to disk.
316  Uint8 m_Letters; ///< Letters of sequence data accumulated so far.
317  int m_MaxLength; ///< Length of longest sequence.
318 
319  // Because the lengths are found via "next offset - this offset",
320  // each array has an extra element. (This is not necessary in the
321  // case of m_Amb; the last element is never examined because of
322  // the alternation of sequences and ambiguities.)
323 
324  /// Start offset in header file of each OID's headers.
325  ///
326  /// The end offset is given by the start offset of the following
327  /// OID's headers.
328  vector<unsigned int> m_Hdr;
329 
330  /// Offset in sequence file of each OID's sequence data.
331  ///
332  /// The end of the sequence data is given by the start offset of
333  /// the ambiguity data for the same OID.
334  vector<unsigned int> m_Seq;
335 
336  /// Offset in sequence file of each OID's ambiguity data.
337  ///
338  /// The end of the ambiguity data is given by the start offset of
339  /// the sequence data for the next OID.
340  vector<unsigned int> m_Amb;
341 
342  EBlastDbVersion m_Version; ///< BLASTDB version (4 or 5).
343 };
344 
345 /// This class builds the volume header file (phr or nhr).
347 public:
348  /// Constructor.
349  /// @param dbname Database base name.
350  /// @param protein True for protein volumes.
351  /// @param index Index of this volume.
352  /// @param max_file_size Maximum file size in bytes (or zero).
353  CWriteDB_HeaderFile(const string & dbname,
354  bool protein,
355  int index,
356  Uint8 max_file_size);
357 
358  /// Returns true if the specified amount of data would fit.
359  ///
360  /// If the specified amount of data (in bytes) would fit in the
361  /// file without exceeding the max_file_size, this method returns
362  /// true.
363  ///
364  /// @param size Size of new data in bytes.
365  bool CanFit(int size)
366  {
367  _ASSERT(size >= 0);
368 
369  if (m_DataSize == 0UL) { // nothing recorded yet: the first entry is always accepted
370  return true;
371  }
372 
373  return (m_DataSize + (Uint8) size) < m_MaxFileSize; // size is widened so the sum is computed in 64 bits
374  }
375 
376  /// Add binary header data to this file.
377  /// @param binhdr Binary ASN.1 version of header data. [in]
378  /// @param offset Offset of end of header data. [out]
379  void AddSequence(const string & binhdr, unsigned int & offset)
380  {
381  m_DataSize = offset = Write(binhdr); // Write returns the offset after the write; it is stored in both the out-parameter and m_DataSize
382  }
383 
384 private:
385  /// Flush unwritten data to the output file.
386  virtual void x_Flush()
387  {
388  // Intentionally empty: header data is written as soon as it is
389  // added, so nothing is left to flush.
390  }
391 
392  /// Amount of data written so far.
394 };
395 
397 public:
398  /// Constructor.
399  /// @param dbname Database base name.
400  /// @param protein True for protein volumes.
401  /// @param index Index of this volume.
402  /// @param max_file_size Maximum file size in bytes (or zero).
403  /// @param max_letters Maximum sequence letters per volume (or zero).
404  CWriteDB_SequenceFile(const string & dbname,
405  bool protein,
406  int index,
407  Uint8 max_file_size,
408  Uint8 max_letters);
409 
410  /// Returns true if the specified amount of data would fit.
411  ///
412  /// If the specified amount of data (in bytes) would fit in the
413  /// file without exceeding the max_file_size, and the specified
414  /// number of letters would fit without exceeding the maximum
415  /// letters limit, this method returns true.
416  ///
417  /// @param size Size of new data in bytes.
418  /// @param letters Number of sequence letters in new data.
419  bool CanFit(int size, int letters)
420  {
421  _ASSERT(size >= 0);
422  _ASSERT(letters >= 0);
423 
424  if (m_Offset <= 1) { // at most one byte written so far: the first sequence is always accepted
425  return true;
426  }
427 
428  if ((m_BaseLimit != 0) && // a zero limit disables the letter check
429  ((m_Letters + (Uint8) letters) > m_BaseLimit)) {
430  return false;
431  }
432 
433  return (((Uint8) m_Offset + (Uint8) size) < m_MaxFileSize); // widen both operands before adding so the sum cannot wrap at 32 bits and falsely report a fit
434  }
435 
436  /// Add a protein sequence to this file.
437  ///
438  /// This method should only be called in the protein case.
439  ///
440  /// @param sequence Packed sequence data. [in]
441  /// @param offset Offset of the end of the sequence data. [out]
442  /// @param length Length of the sequence in letters. [in]
443  void AddSequence(const string & sequence,
444  unsigned int & offset,
445  int length)
446  {
447 #ifdef _DEBUG
449 #endif // NOTE(review): listing line 448 is absent from this view, so the debug-only section appears empty here
450  offset = WriteWithNull(sequence); // sequence bytes followed by a NUL; offset is the position after the NUL
451  m_Letters += length; // running total that CanFit compares against m_BaseLimit
452  }
453 
454  /// Add a nucleotide sequence to this file.
455  ///
456  /// This method should only be called in the nucleotide case.
457  ///
458  /// @param sequence Packed sequence data. [in]
459  /// @param ambig Packed ambiguity data. [in]
460  /// @param off_seq Offset of the end of the sequence data. [out]
461  /// @param off_amb Offset of the end of the ambiguity data. [out]
462  /// @param length Length of the sequence in letters. [in]
463  void AddSequence(const string & sequence,
464  const string & ambig,
465  unsigned int & off_seq,
466  unsigned int & off_amb,
467  int length)
468  {
469 #ifdef _DEBUG
470  _ASSERT(! m_Protein); // m_Protein is only declared under _DEBUG, hence the guard
471 #endif
472  off_seq = Write(sequence); // offset after the sequence data, which is where the ambiguity data begins
473  off_amb = Write(ambig); // offset after the ambiguity data
474  m_Letters += length; // running total that CanFit compares against m_BaseLimit
475  }
476 
477 private:
478  /// Flush unwritten data to the output file.
479  virtual void x_Flush()
480  {
481  // Intentionally empty: sequence data is written as soon as it is
482  // added, so nothing is left to flush.
483  }
484 
485  Uint8 m_Letters; ///< Letters of sequence data added so far.
486  Uint8 m_BaseLimit; ///< Limit on letters of sequence data.
487 #ifdef _DEBUG
488  bool m_Protein; ///< True if this is a protein database.
489 #endif
490 };
491 
493 
494 
495 #endif // OBJTOOLS_WRITERS_WRITEDB__WRITEDB_FILES_HPP
496 
CObject –.
Definition: ncbiobj.hpp:180
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
CWriteDB_IndexFile class.
int m_Index
Volume index.
Uint8 m_MaxFileSize
Maximum file size in bytes.
TFile m_RealFile
Actual stream implementing the output file.
unsigned int WriteInt4(int data)
Write an Int4 (in bigendian order) to the file.
bool m_Created
True if the file has already been opened.
string m_Fname
Current filename for output file.
ofstream TFile
Underlying 'output file' type used here.
const string & GetFilename() const
Get the current filename for this file.
Uint8 x_DefaultByteLimit()
The default value for max_file_size.
string m_BaseName
Database base name for all files.
virtual void x_Flush()=0
This should flush any unwritten data to disk.
unsigned int Write(const CTempString &data)
Write contents of a string to the file.
string m_Nul
For convenience, a string containing one NUL character.
unsigned int m_Offset
Stream position.
unsigned int WriteWithNull(const CTempString &data)
Write contents of a string to the file, appending a NUL.
unsigned int WriteInt8(Int8 data)
Write an Int8 (in bigendian order) to the file.
bool m_UseIndex
True if filenames should use volume index.
string m_Extension
File extension for this file.
This class builds the volume header file (phr or nhr).
CWriteDB_HeaderFile(const string &dbname, bool protein, int index, Uint8 max_file_size)
Constructor.
Uint8 m_DataSize
Amount of data written so far.
void AddSequence(const string &binhdr, unsigned int &offset)
Add binary header data to this file.
bool CanFit(int size)
Returns true if the specified amount of data would fit.
virtual void x_Flush()
Flush unwritten data to the output file.
This class builds the volume index file (pin or nin).
void AddSequence(int length, unsigned int hdr, unsigned int seq, unsigned int amb)
Add a sequence to a nucleotide index file (nin).
Uint8 m_DataSize
Required space for data once written to disk.
CWriteDB_IndexFile(const string &dbname, bool protein, const string &title, const string &date, int index, Uint8 max_file_size, EBlastDbVersion dbver=eBDB_Version4)
Constructor.
int m_Overhead
Amount of file used by metadata.
int m_OIDs
OIDs added to database so far.
EBlastDbVersion m_Version
BLASTDB version (4 or 5).
bool m_Protein
True if this is a protein database.
virtual void x_Flush()
Flush index data to disk.
void AddSequence(int length, unsigned int hdr, unsigned int seq)
Add a sequence to a protein index file (pin).
string m_Title
Title string for all database volumes.
vector< unsigned int > m_Amb
Offset in sequence file of each OID's ambiguity data.
int x_Overhead(const string &T, const string &lmdbName, const string &D)
Compute index file overhead.
vector< unsigned int > m_Hdr
Start offset in header file of each OID's headers.
const string x_MakeLmdbName()
Form name of LMDB database file.
bool CanFit()
Returns true if another sequence can fit into the file.
Uint8 m_Letters
Letters of sequence data accumulated so far.
int m_MaxLength
Length of longest sequence.
string m_Date
Database creation time stamp.
vector< unsigned int > m_Seq
Offset in sequence file of each OID's sequence data.
bool m_Protein
True if this is a protein database.
virtual void x_Flush()
Flush unwritten data to the output file.
CWriteDB_SequenceFile(const string &dbname, bool protein, int index, Uint8 max_file_size, Uint8 max_letters)
Constructor.
bool CanFit(int size, int letters)
Returns true if the specified amount of data would fit.
Uint8 m_Letters
Letters of sequence data added so far.
Uint8 m_BaseLimit
Limit on letters of sequence data.
void AddSequence(const string &sequence, const string &ambig, unsigned int &off_seq, unsigned int &off_amb, int length)
Add a nucleotide sequence to this file.
void AddSequence(const string &sequence, unsigned int &offset, int length)
Add a protein sequence to this file.
#define T(s)
Definition: common.h:230
#define basename(path)
Definition: replacements.h:116
int offset
Definition: replacements.h:160
char data[12]
Definition: iconv.c:80
void Write(CObjectOStream &out, TConstObjectPtr object, const CTypeRef &type)
Definition: serial.cpp:55
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define NCBI_XOBJWRITE_EXPORT
Definition: ncbi_export.h:1347
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
const struct ncbi::grid::netcache::search::fields::SIZE size
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
Defines exception class and several constants for SeqDB.
EBlastDbVersion
BLAST database version.
Definition: seqdbcommon.hpp:51
@ eBDB_Version4
Definition: seqdbcommon.hpp:52
#define D(d)
#define _ASSERT
static bool ambig(char c)
Data conversion tools for CWriteDB and associated code.
void s_WriteInt8BE(ostream &str, Uint8 x)
Write an eight byte integer to a stream in big-endian format.
void s_WriteInt4(ostream &str, int x)
Write a four byte integer to a stream in big endian format.
USING_SCOPE(objects)
Import definitions from the objects namespace.
Implementation for general purpose utilities for WriteDB.
Modified on Fri Sep 20 14:57:38 2024 by modify_doxy.py rev. 669887