NCBI C++ ToolKit
writedb_column.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: writedb_column.cpp 96485 2022-04-06 15:30:07Z fongah2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Kevin Bealer
27  *
28  */
29 
30 /// @file writedb_column.cpp
31 /// Implementation for the CWriteDB_Column and related classes.
32 #include <ncbi_pch.hpp>
34 #include "writedb_column.hpp"
35 
37 
38 /// Import C++ std namespace.
40 
41 // CWriteDB_Column
42 
43 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
44  (!defined(NCBI_COMPILER_MIPSPRO)) )
46  const string & extn1,
47  const string & extn2,
48  int index,
49  const string & title,
50  const TColumnMeta & meta,
51  Uint8 max_file_size)
52  : m_UseBothByteOrder(false)
53 {
55  extn2,
56  index,
57  max_file_size));
58 
60  extn1,
61  index,
62  *m_DFile,
63  title,
64  meta,
65  max_file_size));
66 }
67 
69  const string & extn,
70  int index,
71  Uint8 max_file_size)
72 {
73  m_UseBothByteOrder = true;
75  extn,
76  index,
77  max_file_size));
78 }
79 
81 {
82 }
83 
84 void CWriteDB_Column::ListFiles(vector<string> & files, bool skip_empty) const
85 {
86  if (! (skip_empty && m_DFile->Empty())) {
87  files.push_back(m_IFile->GetFilename());
88  files.push_back(m_DFile->GetFilename());
89  if (m_UseBothByteOrder) files.push_back(m_DFile2->GetFilename());
90  }
91 }
92 
94 {
95  // Note that data size is the size *after* the blob has been
96  // written. The initial (zero) offset is written during file
97  // creation.
98 
99  Int8 data_size = m_DFile->WriteBlob(blob);
100  m_IFile->WriteBlobIndex(data_size);
101 }
102 
103 void CWriteDB_Column::AddBlob(const CBlastDbBlob & blob, const CBlastDbBlob & blob2)
104 {
105  AddBlob(blob);
107 }
108 
110 {
111  m_IFile->Close();
112  m_DFile->Close();
114 }
115 
117 {
118  return m_IFile->CanFit() && m_DFile->CanFit(size);
119 }
120 
122 {
126 }
127 
128 void CWriteDB_Column::RenameFileIndex(unsigned int num_digits)
129 {
130  m_IFile->RenameFileIndex(num_digits);
131  m_DFile->RenameFileIndex(num_digits);
132  if (m_UseBothByteOrder) m_DFile2->RenameFileIndex(num_digits);
133 }
134 
135 void CWriteDB_Column::AddMetaData(const string & key, const string & value)
136 {
137  return m_IFile->AddMetaData(key, value);
138 }
139 
140 
141 // CWriteDB_ColumnIndex
142 
143 // Format (see BlastDb .pin file for comparison)
144 //
145 // Notes:
146 // A. Fixed width stuff at top.
147 // B. Strings use prefixed lengths.
148 // C. Padding is done explicitly.
149 // D. Each of these is 4 bytes unless indicated.
150 //
151 // 0: Format version. (always 1 for now)
152 // 4: Column type. (always 1 (blob) for now)
153 // 8: Size of offsets (always 4 for now)
154 // 12: OID count.
155 // 16: Data file length (8).
156 // 24: Offset of meta data (4).
157 // 28: Offset of offset array (4). (#O).
158 // 32: Title (identifies this column file) (varies).
159 // ??: Create date (varies).
160 // ??: Metadata count (4).
161 // ??: Meta-data (varies:)
162 //
163 // (For each meta data element:)
164 // Key (varies)
165 // Value (varies)
166 //
167 // Pad string (varies from 0 to 8 bytes).
168 //
169 // Offset #0 (4)
170 // Offset #1 (4)
171 // ...
172 //
173 // The rule of thumb is that integers which appear in fixed or aligned
174 // positions in the data stream are encoded as fixed width values, and
175 // other integers are packed as variable width values. This permits
176 
177 // rapid access to fixed width data without considering nearby values.
178 //
179 // I'm not sure that this argument is compelling in this case (as the
180 // total number of bytes here is small), but it may be important for
181 // other cases such as building large arrays of similar structures for
182 // fixed-width column data.
183 
185 CWriteDB_ColumnIndex(const string & dbname,
186  const string & extn,
187  int index,
188  CWriteDB_ColumnData & datafile,
189  const string & title,
190  const TColumnMeta & meta,
191  Uint8 max_file_size)
192  : CWriteDB_File (dbname, extn, index, max_file_size, false),
193  m_DataFile (& datafile),
194  m_MetaData (meta),
195  m_Title (title),
196  m_OIDs (0),
197  m_DataLength (0)
198 {
200 }
201 
203 {
204 }
205 
207 {
208  _ASSERT(0 == (offset >> 32));
209 
210  if (m_Header.Empty()) {
211  m_Header.Reset(new CBlastDbBlob(256));
212  m_Offsets.Reset(new CBlastDbBlob(4096));
213 
214  // We build these now so that m_DataLength is accurate. They
215  // will be rebuilt just before they are written, when the file
216  // is closed.
217 
220 
221  // Offset of first data element (always zero).
222  m_Offsets->WriteInt4(0);
223 
225  }
226 
228  m_OIDs ++;
229 }
230 
232 {
233  // The Blob type makes a great binary data stream type.
234 
235  const int kFormatVersion = 1; // SeqDB has one of these.
236  const int kColumnType = 1; // Blob (only choice right now)
237  const int kOffsetSize = 4; // Data file offset size (always 4)
238 
239  m_Header->SeekWrite(0);
240  m_Header->WriteInt4(kFormatVersion);
241  m_Header->WriteInt4(kColumnType);
242  m_Header->WriteInt4(kOffsetSize);
245 }
246 
248 {
249  // The write offset (in m_Header) when calling this function
250  // should be immediately after the fixed-size header fields
251  // written by BuildHeaderFields.
252 
253  int meta_data_p = m_Header->GetWriteOffset();
254  m_Header->WriteInt4(0); // metadata start
255 
256  int array_offset_p = m_Header->GetWriteOffset();
257  m_Header->WriteInt4(0); // offset array start
258 
261 
262  int meta_off = m_Header->GetWriteOffset();
263  m_Header->WriteInt4(meta_off, meta_data_p);
264 
265  x_BuildMetaData();
266 
267  // Align to an 8 byte multiple; eString means that we can change
268  // the alignment of this field without losing compatibility.
270 
271  int array_off = m_Header->GetWriteOffset();
272  m_Header->WriteInt4(array_off, array_offset_p);
273 
274  _ASSERT((array_off & 0x7) == 0);
275 }
276 
278 {
280 
282 
283  ITERATE(TColumnMeta, iter, m_MetaData) {
284  CTempString key = iter->first, value = iter->second;
287  }
288 }
289 
291 {
292  if (! m_DataFile->Empty()) {
293  if (! m_Created) {
294  Create();
295  }
296 
297  // These need to be rebuilt to write the correct values for
298  // OID count, total length, and possibly meta data.
299 
302 
303  Write(m_Header->Str());
304  Write(m_Offsets->Str());
305 
306  // We're done with these now, so free up the memory.
307 
308  m_Header.Reset();
309  m_Offsets.Reset();
310  }
311 }
312 
314 {
316 }
317 
318 void CWriteDB_ColumnIndex::AddMetaData(const string & key, const string & value)
319 {
320  m_DataLength += (key.size() + CBlastDbBlob::VarIntSize(key.size()) +
321  value.size() + CBlastDbBlob::VarIntSize(value.size()));
322 
323  m_MetaData[key] = value;
324 }
325 
326 
327 // CWriteDB_ColumnData
328 
330  const string & extn,
331  int index,
332  Uint8 max_file_size)
333  : CWriteDB_File (dbname, extn, index, max_file_size, false),
334  m_DataLength (0)
335 {
336 }
337 
339 {
340 }
341 
343 {
344  if (! blob.Size()) {
345  return m_DataLength;
346  }
347 
348  if (! m_Created) {
349  Create();
350  }
351 
352  return m_DataLength = Write(blob.Str());
353 }
354 
356 {
357  if ((! m_Created) && (m_DataLength != 0)) {
358  Create();
359  }
360 }
361 
363 {
364  return Uint8(m_DataLength + size) < m_MaxFileSize;
365 }
366 
367 
368 // CWriteDB_ColumnBuilder
369 
371 CWriteDB_ColumnBuilder(const string & title,
372  const string & basename,
373  char file_id)
374  : m_Impl(NULL)
375 {
376  _ASSERT(isalnum(file_id));
377 
378  string index_extn = "x_a";
379  index_extn[1] = file_id;
380 
381  string data_extn = index_extn;
382  data_extn[2] = 'b';
383 
384  map<string,string> meta;
385 
387  index_extn,
388  data_extn,
389  0,
390  title,
391  meta,
392  0);
393 }
394 
396 {
397  delete m_Impl;
398 }
399 
400 void CWriteDB_ColumnBuilder::ListFiles(vector<string> & files) const
401 {
402  m_Impl->ListFiles(files, false);
403 }
404 
406 {
407  return m_Impl->AddBlob(blob);
408 }
409 
410 void CWriteDB_ColumnBuilder::AddMetaData(const string & key, const string & value)
411 {
412  return m_Impl->AddMetaData(key, value);
413 }
414 
416 {
417  m_Impl->RenameSingle();
418  m_Impl->Close();
419 }
420 #endif
421 
423 
`Blob' Class for SeqDB (and WriteDB).
Definition: seqdbblob.hpp:56
@ eString
Pad using NUL terminated string of '#' bytes.
Definition: seqdbblob.hpp:272
static int VarIntSize(Int8 x)
Compute bytes used for a variable length integer.
Definition: seqdbblob.cpp:285
int Size() const
Get size of blob contents.
Definition: seqdbblob.cpp:518
int GetWriteOffset() const
Get the current write pointer offset.
Definition: seqdbblob.cpp:552
void WritePadBytes(int align, EPadding fmt)
Align the offset by writing pad bytes.
Definition: seqdbblob.cpp:562
void WriteInt4(Int4 x)
Write a 4 byte integer to the blob.
Definition: seqdbblob.cpp:323
void WriteInt8(Int8 x)
Write an 8 byte integer to the blob.
Definition: seqdbblob.cpp:333
CTempString Str() const
Get blob contents as a CTempString.
Definition: seqdbblob.cpp:526
int WriteString(CTempString str, EStringFormat fmt)
Write string data to the blob.
Definition: seqdbblob.cpp:383
void SeekWrite(int offset)
Seek write pointer to a specific location.
Definition: seqdbblob.cpp:542
int WriteVarInt(Int8 x)
Write a variable length integer to the blob.
Definition: seqdbblob.cpp:243
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
CTime –.
Definition: ncbitime.hpp:296
void Close()
Complete and close the column files.
void AddMetaData(const string &key, const string &value)
Add meta data to the column.
class CWriteDB_Column * m_Impl
Implementation object.
Definition: writedb.hpp:605
void ListFiles(vector< string > &files) const
List Filenames.
CWriteDB_ColumnBuilder(const string &title, const string &basename, char file_id='a')
Construct a BlastDb format column.
void AddBlob(const CBlastDbBlob &blob)
Add a blob to the column.
~CWriteDB_ColumnBuilder()
Destructor.
CWriteDB_ColumnData class.
~CWriteDB_ColumnData()
Destructor.
Int8 GetDataLength() const
Get size of data file (so far).
bool Empty() const
Tests whether the data file is empty.
CWriteDB_ColumnData(const string &dbname, const string &extn, int index, Uint8 max_file_size)
Constructor for an column data file.
Int8 WriteBlob(const CBlastDbBlob &blob)
Write a new data blob.
void x_Flush()
Flush any stored data.
bool CanFit(int size) const
Tests whether there is room for some number of bytes.
Uint8 m_DataLength
Length of data written so far.
CWriteDB_ColumnIndex class.
string m_Title
Title of this column.
void x_BuildMetaData()
Serialize meta data strings into header object.
CRef< CBlastDbBlob > m_Offsets
Offsets of sequences in the data file.
CRef< CBlastDbBlob > m_Header
Header data.
string m_Date
Creation timestamp for this column.
static const CBlastDbBlob::EStringFormat kStringFmt
String format used by column files.
static const int kEntrySize
Size of an entry in the index file.
~CWriteDB_ColumnIndex()
Destructor.
CRef< CWriteDB_ColumnData > m_DataFile
The data file associated with this index file.
void x_BuildHeaderStrings()
Build header string data section.
void x_Flush()
Flush index data in preparation for Close().
TColumnMeta m_MetaData
Column meta data.
CWriteDB_ColumnIndex(const string &dbname, const string &extn, int index, CWriteDB_ColumnData &datafile, const string &title, const TColumnMeta &meta, Uint8 max_file_size)
Constructor for column index file.
void x_BuildHeaderFields()
Build fixed length header fields.
void AddMetaData(const string &key, const string &value)
Add meta data to the column.
bool CanFit() const
Tests whether there is room for another entry.
Uint8 m_DataLength
Length of data accounted for so far.
void WriteBlobIndex(Int8 offset)
Write the offset of a new data blob.
CWriteDB_Column class.
void RenameFileIndex(unsigned int num_digits)
void AddBlob(const CBlastDbBlob &blob)
Add a blob to the column.
CWriteDB_Column(const string &dbname, const string &extn1, const string &extn2, int index, const string &title, const TColumnMeta &meta, Uint8 max_file_size)
Construct WriteDB style database column.
void Close()
Flush data to disk and close all associated files.
void AddMetaData(const string &key, const string &value)
Add meta data to the column.
~CWriteDB_Column()
Destructor.
bool m_UseBothByteOrder
Support for multiple byte order.
bool CanFit(int bytes) const
Tests whether there is room for a given blob.
CRef< CWriteDB_ColumnData > m_DFile2
void RenameSingle()
Rename files to single-volume names.
void ListFiles(vector< string > &files, bool skip_empty) const
List Filenames.
CRef< CWriteDB_ColumnData > m_DFile
Data file, contains one record for each key/oid pair, in big and small endian.
CRef< CWriteDB_ColumnIndex > m_IFile
Index file, contains meta data and samples of the key/oid pairs.
void AddByteOrder(const string &dbname, const string &extn, int index, Uint8 max_file_size)
Add support for multiple byte order.
CWriteDB_IndexFile class.
Uint8 m_MaxFileSize
Maximum file size in bytes.
bool m_Created
True if the file has already been opened.
const string & GetFilename() const
Get the current filename for this file.
virtual void RenameFileIndex(unsigned int num_digits)
void Create()
Create and open the file.
void Close()
Close the file, flushing any remaining data to disk.
unsigned int Write(const CTempString &data)
Write contents of a string to the file.
virtual void RenameSingle()
Rename this file, disincluding the volume index.
size_type size() const
Definition: map.hpp:148
#define false
Definition: bool.h:36
#define basename(path)
Definition: replacements.h:116
int offset
Definition: replacements.h:160
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NULL
Definition: ncbistd.hpp:225
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
string AsString(const CTimeFormat &format=kEmptyStr, TSeconds out_tz=eCurrentTimeZone) const
Transform time to string.
Definition: ncbitime.cpp:1512
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
const struct ncbi::grid::netcache::search::fields::SIZE size
const struct ncbi::grid::netcache::search::fields::KEY key
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
int isalnum(Uchar c)
Definition: ncbictype.hpp:62
#define _ASSERT
Defines BLAST database construction classes.
USING_SCOPE(std)
Import C++ std namespace.
Code for arbitrary data `column' file construction.
Modified on Sat Jun 29 13:54:55 2024 by modify_doxy.py rev. 669887