NCBI C++ ToolKit
seqdbcol.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seqdbcol.cpp 92678 2021-02-05 18:10:16Z fongah2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Kevin Bealer
27  *
28  */
29 
30 /// @file seqdbcol.cpp
31 /// This is the implementation file for the CSeqDB_ColumnReader,
32 /// CSeqDBColumn, and CSeqDB_ColumnEntry classes,
33 /// which support read operations on BlastDb format database columns.
34 #include <ncbi_pch.hpp>
38 
40 
41 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
42  (!defined(NCBI_COMPILER_MIPSPRO)) )
43 // CSeqDB_ColumnReader
44 
/// Build a column reader for the column files of one volume.
///
/// The index and data file extensions are derived from the template
/// "x_a": the middle character is replaced by `file_id`, and the data
/// extension is the same string with its last character set to 'b'.
/// The resulting CSeqDBColumn is stored in m_Impl.
///
/// @param volname  Base name handed to CSeqDBColumn as the file base name.
/// @param file_id  Character identifying the column; checked with
///                 _ASSERT(isalnum(file_id)).
///
/// NOTE(review): the CSeqDBColumn constructor in this file throws
/// CSeqDBException (eFileErr) when either file cannot be opened, so
/// construction of the reader can fail the same way.
46 CSeqDB_ColumnReader(const string & volname, char file_id)
47  : m_Impl(NULL)
48 {
49  _ASSERT(isalnum(file_id));
50 
// Index extension: the '_' placeholder becomes the column's file id.
51  string index_extn = "x_a";
52  index_extn[1] = file_id;
53 
// Data extension: identical to the index extension except for the final 'b'.
54  string data_extn = index_extn;
55  data_extn[2] = 'b';
56 
57  // Create the actual column object.
// NULL lock holder: CSeqDBColumn then uses a lock holder of its own.
58  m_Impl = new CSeqDBColumn(volname, index_extn, data_extn, NULL);
59 }
60 
62 {
63  delete m_Impl;
64 }
65 
/// Get the column title.
///
/// Forwards to the implementation object (m_Impl).
///
/// @return Reference to the title string returned by CSeqDBColumn::GetTitle().
66 const string & CSeqDB_ColumnReader::GetTitle() const
67 {
68  return m_Impl->GetTitle();
69 }
70 
72 {
73  return m_Impl->GetMetaData();
74 }
75 
/// Look up one metadata value by key.
///
/// @param key  Key to search for in the map returned by GetMetaData().
/// @return     Whatever SeqDB_MapFind() yields: the value stored for `key`,
///             or the supplied default, which here is a function-local
///             static string that is never assigned in this function.
76 const string & CSeqDB_ColumnReader::GetValue(const string & key)
77 {
// The default is static because a reference to it may be returned to the caller.
78  static string mt;
79  return SeqDB_MapFind(GetMetaData(), key, mt);
80 }
81 
83 {
84  return m_Impl->GetNumOIDs();
85 }
86 
88  CBlastDbBlob & blob)
89 {
90  // The blob Clear() must be done in a path where this thread does
91  // *not* hold the atlas lock, otherwise the destructor for the
92  // blob's 'lifetime' object might try to get the same lock and the
93  // thread would self-deadlock.
94 
95  blob.Clear();
96  return m_Impl->GetBlob(oid, blob, true, NULL);
97 }
98 
99 
100 // CSeqDBColumn
101 
103  const string & index_extn,
104  const string & data_extn,
105  CSeqDBLockHold * lockedp)
106  : m_AtlasHolder (lockedp, true),
107  m_Atlas (m_AtlasHolder.Get()),
108  m_IndexFile (m_Atlas),
109  m_DataFile (m_Atlas),
110  m_IndexLease (m_Atlas),
111  m_DataLease (m_Atlas),
112  m_NumOIDs (0),
113  m_DataLength (0),
114  m_MetaDataStart (0),
115  m_OffsetArrayStart (0)
116 {
117  CSeqDBLockHold locked2(m_Atlas);
118 
119  if (lockedp == NULL) {
120  lockedp = & locked2;
121  }
122 
123  m_Atlas.Lock(*lockedp);
124 
125  try {
126  CSeqDB_Path fn1(basename + "." + index_extn);
127  CSeqDB_Path fn2(basename + "." + data_extn);
128 
129  bool found1 = m_IndexFile.Open(fn1);
130  bool found2 = m_DataFile.Open(fn2);
131 
132  if (! (found1 && found2)) {
133  NCBI_THROW(CSeqDBException, eFileErr,
134  "Could not open database column files.");
135  }
136 
137  x_ReadFields(*lockedp);
138  x_ReadMetaData(*lockedp);
139  }
140  catch(...) {
141  m_Atlas.Unlock(*lockedp);
142  throw;
143  }
144 
145 }
146 
148 {
149  CSeqDBLockHold locked(m_Atlas);
150  m_Atlas.Lock(locked);
151 
152  Flush();
153 }
154 
156  const string & extn,
157  CSeqDBAtlas & atlas)
158 {
159  string fn(basename + "." + extn);
160 
161  return ( atlas.DoesFileExist(fn));
162 }
163 
/// Get the column title.
///
/// Checks with _ASSERT that the stored title is non-empty before
/// returning it.
///
/// @return Reference to the m_Title member.
164 const string & CSeqDBColumn::GetTitle() const
165 {
166  _ASSERT(m_Title.length());
167  return m_Title;
168 }
169 
171 {
172  return m_NumOIDs;
173 }
174 
176 {
178  m_DataLease.Clear();
179 }
180 
182  TIndx end,
183  ESelectFile select_file,
184  bool lifetime,
185  CBlastDbBlob & blob,
186  CSeqDBLockHold & locked)
187 {
188  bool index = (select_file == e_Index);
189  _ASSERT(index || (select_file == e_Data));
190 
192  CSeqDBFileMemMap & lease = index ? m_IndexLease : m_DataLease;
193 
194  const char * ptr = file.GetFileDataPtr(lease, begin, end);
195 
196  CTempString data(ptr, end-begin);
197 
198  if (lifetime) {
200  blob.ReferTo(data, hold);
201  } else {
202  blob.ReferTo(data);
203  }
204 }
205 
207 {
208  const int kFixedFieldBytes = 32;
209 
210  m_Atlas.Lock(locked);
211 
212  // First, get the 32 bytes of fields that we know exist.
213 
214  CBlastDbBlob header;
215  x_GetFileRange(0, kFixedFieldBytes, e_Index, false, header, locked);
216 
217  int fmt_version = header.ReadInt4();
218 
219  if (fmt_version != 1) {
221  eFileErr,
222  "Column file uses unknown format_version.");
223  }
224 
225  int column_type = header.ReadInt4();
226 
227  if (column_type != 1) {
229  eFileErr,
230  "Column file uses unknown data type.");
231  }
232 
233  int offset_size = header.ReadInt4();
234 
235  if (offset_size != 4) {
237  eFileErr,
238  "Column file uses unsupported offset size.");
239  }
240 
241  m_NumOIDs = header.ReadInt4();
242  m_DataLength = header.ReadInt8();
243  m_MetaDataStart = header.ReadInt4();
244  m_OffsetArrayStart = header.ReadInt4();
245 
247 
251 
252  // Now we know how long the header actually is, so expand the blob
253  // to reference the whole thing. (The memory lease should already
254  // hold the data, so this will just adjust a few integer fields.)
255 
256  x_GetFileRange(0, m_MetaDataStart, e_Index, false, header, locked);
257 
258  // Get string type header fields.
259 
260  m_Title = header.ReadString (kStringFmt);
261  m_Date = header.ReadString (kStringFmt);
262 
263  SEQDB_FILE_ASSERT(m_Title.size());
264  SEQDB_FILE_ASSERT(m_Date.size());
265 
266  if (header.GetReadOffset() != m_MetaDataStart) {
268  eFileErr,
269  "CSeqDBColumn: File format error.");
270  }
271 }
272 
274 {
275  m_Atlas.Lock(locked);
276 
277  int begin = m_MetaDataStart;
278  int end = m_OffsetArrayStart;
279 
280  _ASSERT(begin > 0 && end > begin);
281 
282  CBlastDbBlob metadata;
283  x_GetFileRange(begin, end, e_Index, false, metadata, locked);
284 
285  Int8 count8 = metadata.ReadVarInt();
286 
287  if (count8 >> 31) {
289  eFileErr,
290  "CSeqDBColumn: File format error.");
291  }
292 
293  int count = (int) count8;
294 
295  for(int j = 0; j < count; j++) {
296  string key = metadata.ReadString(kStringFmt);
297  string value = metadata.ReadString(kStringFmt);
298 
299  if (m_MetaData.find(key) != m_MetaData.end()) {
301  eFileErr,
302  "CSeqDBColumn: Error; duplicate metadata key.");
303  }
304 
305  m_MetaData[key] = value;
306  }
307 
308  // Align to an 8 byte multiple; eString means that we can change
309  // the alignment of this field without losing compatibility.
310 
311  metadata.SkipPadBytes(8, CBlastDbBlob::eString);
312 
313  int header_bytes = m_OffsetArrayStart - m_MetaDataStart;
314 
315  if (metadata.GetReadOffset() != header_bytes) {
317  eFileErr,
318  "CSeqDBColumn: File format error.");
319  }
320 }
321 
323  CBlastDbBlob & blob,
324  bool keep,
325  CSeqDBLockHold * lockedp)
326 {
327  _ASSERT(0 == blob.Size());
328 
329  CSeqDBLockHold locked2(m_Atlas);
330 
331  if (lockedp == NULL) {
332  lockedp = & locked2;
333  }
334 
335  int item_size = 4;
336  int istart = m_OffsetArrayStart + item_size*oid;
337  int iend = istart + (2 * item_size);
338 
339  CBlastDbBlob offsets;
340  x_GetFileRange(istart, iend, e_Index, false, offsets, *lockedp);
341 
342  int dstart = offsets.ReadInt4();
343  int dend = offsets.ReadInt4();
344 
345  SEQDB_FILE_ASSERT(dend >= dstart);
346 
347  if (dend > dstart) {
348  x_GetFileRange(dstart, dend, e_Data, keep, blob, *lockedp);
349  } else {
350  _ASSERT(! blob.Size());
351  }
352 }
353 
354 
356 {
357  return m_MetaData;
358 }
359 
360 
361 // CSeqDB_ColumnEntry
362 
/// Construct a column entry.
///
/// @param indices  Copied into m_VolIndices.
///                 NOTE(review): presumably one column index per volume --
///                 confirm against the class declaration.
///
/// m_HaveMap is initialised to false.
363 CSeqDB_ColumnEntry::CSeqDB_ColumnEntry(const vector<int> & indices)
364  : m_VolIndices(indices), m_HaveMap(false)
365 {
366 }
367 
/// Add a metadata key/value association to m_Map.
///
/// The first value stored for a key wins: if `k` is already present,
/// the call leaves the map unchanged.
///
/// @param k  Metadata key.
/// @param v  Value to associate with `k` when `k` is not yet in the map.
368 void CSeqDB_ColumnEntry::SetMapValue(const string & k, const string & v)
369 {
370  // Store a map value, but only if this key's value has not been set.
371 
372  if (m_Map.find(k) == m_Map.end()) {
373  m_Map[k] = v;
374  }
375 }
376 #endif
377 
379 
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
`Blob' Class for SeqDB (and WriteDB).
Definition: seqdbblob.hpp:56
void SkipPadBytes(int align, EPadding fmt)
Align the offset by skipping bytes.
Definition: seqdbblob.cpp:590
int GetReadOffset() const
Get the current read pointer offset.
Definition: seqdbblob.cpp:557
Int4 ReadInt4()
Read a 4 byte integer at the pointer (and move the pointer).
Definition: seqdbblob.cpp:139
@ eString
Pad using NUL terminated string of '#' bytes.
Definition: seqdbblob.hpp:272
int Size() const
Get size of blob contents.
Definition: seqdbblob.cpp:518
void ReferTo(CTempString data)
Refer to an existing memory area.
Definition: seqdbblob.cpp:68
CTempString ReadString(EStringFormat fmt)
Read string data from the blob (moving the read pointer).
Definition: seqdbblob.cpp:159
void Clear()
Clear all owned data and reference an empty string.
Definition: seqdbblob.cpp:58
Int8 ReadInt8()
Read an 8 byte integer at the pointer (and move the pointer).
Definition: seqdbblob.cpp:149
Int8 ReadVarInt()
Read a variable length integer from the blob.
Definition: seqdbblob.cpp:82
CSeqDBAtlas class.
Definition: seqdbatlas.hpp:298
bool DoesFileExist(const string &fname)
Check if file exists.
Definition: seqdbatlas.cpp:148
void Lock(CSeqDBLockHold &locked)
Lock the atlas.
Definition: seqdbatlas.hpp:463
void Unlock(CSeqDBLockHold &locked)
Unlock the atlas.
Definition: seqdbatlas.hpp:480
CSeqDBColumn class.
Definition: seqdbcol.hpp:59
string m_Title
The title identifies this column's purpose.
Definition: seqdbcol.hpp:229
Int4 m_NumOIDs
Number of OIDs (Blobs) in this column.
Definition: seqdbcol.hpp:217
CSeqDBColumn(const string &basename, const string &index_extn, const string &data_extn, CSeqDBLockHold *lockedp)
Constructor.
Definition: seqdbcol.cpp:102
static bool ColumnExists(const string &basename, const string &extn, CSeqDBAtlas &atlas)
Determine if the column exists.
Definition: seqdbcol.cpp:155
void x_ReadMetaData(CSeqDBLockHold &locked)
Read the key/value metadata section of the index file into the metadata map.
Definition: seqdbcol.cpp:273
void Flush()
Flush any held memory.
Definition: seqdbcol.cpp:175
ESelectFile
Which file to access.
Definition: seqdbcol.hpp:167
@ e_Index
Use index file.
Definition: seqdbcol.hpp:168
@ e_Data
Use data file.
Definition: seqdbcol.hpp:169
Int8 m_DataLength
Total length of data stored in the data file.
Definition: seqdbcol.hpp:220
Int4 m_OffsetArrayStart
Start offset (in the index file) of the offset array.
Definition: seqdbcol.hpp:226
int GetNumOIDs() const
Get the number of OIDs stored here.
Definition: seqdbcol.cpp:170
CSeqDBRawFile m_DataFile
Data file.
Definition: seqdbcol.hpp:208
map< string, string > m_MetaData
All key/value metadata for this column.
Definition: seqdbcol.hpp:235
const map< string, string > & GetMetaData()
Get the column's Key/Value meta data.
Definition: seqdbcol.cpp:355
CSeqDBFileMemMap m_DataLease
Data file lease.
Definition: seqdbcol.hpp:214
void x_GetFileRange(TIndx begin, TIndx end, ESelectFile select_file, bool lifetime, CBlastDbBlob &blob, CSeqDBLockHold &locked)
Get a range of the index or data file.
Definition: seqdbcol.cpp:181
static const CBlastDbBlob::EStringFormat kStringFmt
String format used by column files.
Definition: seqdbcol.hpp:147
void GetBlob(int oid, CBlastDbBlob &blob, bool keep, CSeqDBLockHold *lockedp)
Fetch the data blob for the given oid.
Definition: seqdbcol.cpp:322
string m_Date
The create date of the column files.
Definition: seqdbcol.hpp:232
CSeqDBAtlas & m_Atlas
Reference to the atlas.
Definition: seqdbcol.hpp:202
const string & GetTitle() const
Get the column title.
Definition: seqdbcol.cpp:164
CSeqDBRawFile m_IndexFile
Index file.
Definition: seqdbcol.hpp:205
Int4 m_MetaDataStart
Start offset (in the index file) of the metadata section.
Definition: seqdbcol.hpp:223
CSeqDBFileMemMap m_IndexLease
Index file lease.
Definition: seqdbcol.hpp:211
~CSeqDBColumn()
Destructor.
Definition: seqdbcol.cpp:147
void x_ReadFields(CSeqDBLockHold &locked)
Read and validate the fixed header fields, title, and date from the index file.
Definition: seqdbcol.cpp:206
CSeqDBAtlas::TIndx TIndx
File offset type.
Definition: seqdbcol.hpp:150
CSeqDBException.
Definition: seqdbcommon.hpp:73
void Clear()
Clears the memory map object.
Definition: seqdbatlas.hpp:734
CSeqDBLockHold.
Definition: seqdbatlas.hpp:167
Raw file.
Definition: seqdbfile.hpp:64
TIndx GetFileLength() const
Get the length of the file.
Definition: seqdbfile.hpp:143
bool Open(const CSeqDB_Path &name)
MMap or Open a file.
Definition: seqdbfile.hpp:93
Hold a memory region refcount, return to atlas when destroyed.
Definition: seqdbatlas.hpp:266
CSeqDB_ColumnEntry(const vector< int > &indices)
Constructor.
Definition: seqdbcol.cpp:363
map< string, string > m_Map
The combined metadata map for this column.
Definition: seqdbcol.hpp:306
void SetMapValue(const string &k, const string &v)
Add a meta-data key/value association.
Definition: seqdbcol.cpp:368
void GetBlob(int oid, CBlastDbBlob &blob)
Fetch the data blob for the given oid.
Definition: seqdbcol.cpp:87
const map< string, string > & GetMetaData()
Get the column's key/value meta data.
Definition: seqdbcol.cpp:71
CSeqDB_ColumnReader(const string &basename, char file_id='a')
Read a BlastDb format column.
Definition: seqdbcol.cpp:46
const string & GetTitle() const
Get the column title.
Definition: seqdbcol.cpp:66
int GetNumOIDs() const
Get the number of rows stored in this column.
Definition: seqdbcol.cpp:82
class CSeqDBColumn * m_Impl
Implementation object.
~CSeqDB_ColumnReader()
Destructor.
Definition: seqdbcol.cpp:61
const string & GetValue(const string &key)
Look up one metadata value.
Definition: seqdbcol.cpp:76
CSeqDB_Path.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Defines column reader class for SeqDB.
char value[7]
Definition: config.c:431
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
FILE * file
const TYPE & Get(const CNamedParameterList *param)
const struct ncbi::grid::netcache::search::fields::KEY key
int isalnum(Uchar c)
Definition: ncbictype.hpp:62
#define basename(path)
Definition: replacements.h:116
Defines database column access classes.
This file defines several SeqDB utility functions related to byte order and file system portability.
const U & SeqDB_MapFind(const std::map< T, U > &m, const T &k, const U &dflt)
Find a map value or return a default.
#define SEQDB_FILE_ASSERT(YESNO)
#define _ASSERT
Modified on Mon Dec 11 02:40:44 2023 by modify_doxy.py rev. 669887