NCBI C++ ToolKit
lds2_db.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef LDS2_DB_HPP__
2 #define LDS2_DB_HPP__
3 /* $Id: lds2_db.hpp 91830 2020-12-14 18:27:21Z grichenk $
4  * ===========================================================================
5  *
6  * PUBLIC DOMAIN NOTICE
7  * National Center for Biotechnology Information
8  *
9  * This software/database is a "United States Government Work" under the
10  * terms of the United States Copyright Act. It was written as part of
11  * the author's official duties as a United States Government employee and
12  * thus cannot be copyrighted. This software/database is freely available
13  * to the public for use. The National Library of Medicine and the U.S.
14  * Government have not placed any restriction on its use or reproduction.
15  *
16  * Although all reasonable efforts have been taken to ensure the accuracy
17  * and reliability of the software and data, the NLM and the U.S.
18  * Government do not and cannot warrant the performance or results that
19  * may be obtained by using this software or data. The NLM and the U.S.
20  * Government disclaim all warranties, express or implied, including
21  * warranties of performance, merchantability or fitness for any particular
22  * purpose.
23  *
24  * Please cite the author in any work or product based on this material.
25  *
26  * ===========================================================================
27  *
28  * Author: Aleksey Grichenko
29  *
30  * File Description: Local data storage v.2, database access.
31  *
32  */
33 
34 #include <corelib/ncbiobj.hpp>
35 #include <corelib/ncbimtx.hpp>
36 #include <util/format_guess.hpp>
38 #include <util/range.hpp>
39 #include <set>
40 
42 
43 // Forward declarations
44 class CSQLITE_Connection;
45 class CSQLITE_Statement;
46 
48 
49 
50 //////////////////////////////////////////////////////////////////
51 ///
52 /// LDS2 database.
53 ///
54 
55 /// File info structure
56 struct SLDS2_File
57 {
59 
 /// File name; the single-argument constructor copies its argument here.
61  string name;
63  string handler;
67 
 /// Create a record whose fields are initialized by Reset().
68  SLDS2_File(void) { Reset(); }
69 
 /// Create a record initialized by Reset(), then set its name to
 /// file_name.
70  SLDS2_File(const string& file_name) {
71  Reset();
72  name = file_name;
73  }
74 
 /// Restore default values: id, time and crc become 0, name becomes
 /// empty and size becomes -1. A negative size is what exists() reports
 /// as a non-existent file.
75  void Reset(void) {
76  id = 0;
77  name = kEmptyStr;
80  size = -1;
81  time = 0;
82  crc = 0;
83  }
84 
 /// Two records are equal only when id, name, format, handler, size,
 /// time and crc all match.
85  bool operator==(const SLDS2_File& f) const {
86  return id == f.id &&
87  name == f.name &&
88  format == f.format &&
89  handler == f.handler &&
90  size == f.size &&
91  time == f.time &&
92  crc == f.crc;
93  }
94 
 /// Negation of operator==.
95  bool operator!=(const SLDS2_File& f) const {
96  return !(*this == f);
97  }
98 
 /// Return true when the record describes an existing file, i.e. its
 /// size is not negative.
99  bool exists(void) const {
100  // non-existent files are indicated by negative size
101  return size >= 0;
102  }
103 };
104 
105 
106 /// Chunk info
107 struct SLDS2_Chunk {
108  /// Chunk position in the raw file
110  /// Chunk position in the processed (e.g. unzipped) stream
112  /// Extra data required to load the chunk. The owner of the
113  /// pointer is responsible for deleting the buffer.
114  void* data;
115  /// Extra data size
116  size_t data_size;
117 
 // Initializer list: both positions start at 0 and there is no extra
 // data buffer (data is NULL, data_size is 0).
119  : raw_pos(0),
120  stream_pos(0),
121  data(NULL),
122  data_size(0)
123  {}
124 
 // Initializer list: positions are taken from 'raw' and 'str'; there is
 // no extra data buffer (data is NULL, data_size is 0).
126  : raw_pos(raw),
127  stream_pos(str),
128  data(NULL),
129  data_size(0)
130  {}
131 
 // This body releases the extra data buffer through DeleteData().
133  {
134  DeleteData();
135  }
136 
 /// Free the extra data buffer, if there is one, and reset data to NULL
 /// and data_size to 0. The buffer is released with delete[] as an
 /// unsigned char array, matching the allocation made in InitData().
137  void DeleteData(void)
138  {
139  if ( data ) {
140  delete[] (unsigned char*)data;
141  data = NULL;
142  }
143  data_size = 0;
144  }
145 
 /// Discard any existing buffer and allocate a new one of sz bytes.
 /// When sz is 0 nothing is allocated and data is left NULL. The
 /// contents of a newly allocated buffer are not initialized here.
146  void InitData(size_t sz)
147  {
148  if ( data ) {
149  DeleteData();
150  }
151  data_size = sz;
152  if ( data_size ) {
153  data = new unsigned char[data_size];
154  }
155  }
156 
157 private:
158  // Prohibit copy operations.
161 };
162 
163 
164 /// Top level object info
166 {
167  /// Top-level object types
168  enum EBlobType {
169  eUnknown = 0,
171  eBioseq = 2,
173  /// Used for indexing individual seq-entries from a top-level
174  /// bioseq-set.
179  eSeq_submit = 8
180  };
181 
186 
188  : id(0),
189  type(eUnknown),
190  file_id(0),
191  file_pos(-1)
192  {}
193 };
194 
195 
196 /// Info about seq-id used in an annotation.
198 {
200 
201  bool external;
203 
205  : external(true),
206  range(TRange::GetEmpty())
207  {}
208 };
209 
210 
211 /// Annotation info.
213 {
215 
216  /// Annotation type
217  enum EType {
218  eUnknown = 0,
221  eSeq_graph = 3
222  };
223 
227  bool is_named;
228  string name;
230 
232  : id(-1),
233  type(eUnknown),
234  blob_id(-1),
235  is_named(false)
236  {}
237 };
238 
239 
241 {
242 public:
243  /// Database access mode flags.
244  enum EAccessMode {
245  eRead, ///< Read-only access.
246  eWrite, ///< Read/write access.
247  eMemory ///< Copy db to memory and open read-only.
248  };
249 
250  CLDS2_Database(const string& db_file, EAccessMode mode = eWrite);
251 
252  ~CLDS2_Database(void);
253 
254  /// Create the database. If the LDS2 database already exists all data will
255  /// be cleaned up. Access mode is automatically set to read/write.
256  /// NOTE: The function may fail if the db has been accessed from other
257  /// threads and some of the threads are still alive.
258  void Create(void);
259 
260  /// Open LDS2 database. If the database does not exist, throws exception.
261  void Open(EAccessMode mode = eWrite);
262 
263  /// Get database file name.
264  const string& GetDbFile(void) const { return m_DbFile; }
265 
266  /// Get SQLite flags, see CSQLITE_Connection::TOperationFlags.
267  int GetSQLiteFlags(void) const { return m_DbFlags; }
268  /// Set SQLite flags. This function resets the db connection.
269  void SetSQLiteFlags(int flags);
270 
271  /// Get current access mode.
272  EAccessMode GetAccessMode(void) const { return m_Mode; }
273  /// Set new access mode, re-open the database.
274  void SetAccessMode(EAccessMode mode);
275 
277 
278  /// Get all known file names
279  void GetFileNames(TStringSet& files) const;
280  /// Get complete file info
281  SLDS2_File GetFileInfo(const string& file_name) const;
282  /// Add new file record. On success file_info.id is not zero.
283  void AddFile(SLDS2_File& info);
284  /// Update info for the known file. The 'id' of the info will change.
285  void UpdateFile(SLDS2_File& info);
286  /// Delete file and all related entries from the database
287  void DeleteFile(const string& file_name);
288  void DeleteFile(Int8 file_id);
289 
290  /// Add blob, return the new blob id.
291  Int8 AddBlob(Int8 file_id,
292  SLDS2_Blob::EBlobType blob_type,
293  Int8 file_pos);
294 
296 
297  /// Add bioseq, return the new bioseq id.
298  Int8 AddBioseq(Int8 blob_id, const TSeqIdSet& ids);
299 
300  /// Add annotation, return the new annot id.
301  Int8 AddAnnot(SLDS2_Annot& annot);
302 
303  /// Check if the db contains a bioseq with the given id.
304  /// Return -1 on conflict.
305  Int8 GetBioseqId(const CSeq_id_Handle& idh) const;
306 
307  /// List of ids (blob_id, bioseq_id, lds_id etc.)
309  /// List of seq-ids.
310  typedef vector<CSeq_id_Handle> TSeqIds;
311 
312  /// Get all lds-id synonyms for the seq-id (including lds-id
313  /// for the seq-id itself). Return empty set if there is a
314  /// conflict.
315  void GetSynonyms(const CSeq_id_Handle& idh, TLdsIdSet& ids);
316 
317  /// Get all synonyms for the seq-id (including the original seq-id).
318  /// Return empty set on conflict.
319  void GetSynonyms(const CSeq_id_Handle& idh, TSeqIds& ids);
320 
321  /// Find blob containing the requested bioseq. Return empty info if
322  /// the seq-id is unknown or there are multiple bioseqs with the same id.
323  SLDS2_Blob GetBlobInfo(const CSeq_id_Handle& idh);
324 
325  /// Get blob info by blob id
326  SLDS2_Blob GetBlobInfo(Int8 blob_id);
327 
328  /// Get file info.
329  SLDS2_File GetFileInfo(Int8 file_id);
330 
331  /// A collection of blob info records (SLDS2_Blob).
332  typedef vector<SLDS2_Blob> TBlobSet;
333 
334  /// Get all blobs, containing bioseqs with the seq-id.
335  void GetBioseqBlobs(const CSeq_id_Handle& idh, TBlobSet& blobs);
336 
337  /// Annotation type flags
339  fAnnot_Internal = 1, ///< Annots from the blob with the bioseq
340  fAnnot_External = 2, ///< Annots from blobs not containing the bioseq
341 
342  fAnnot_All = fAnnot_Internal | fAnnot_External
343  };
344  typedef int TAnnotChoice;
345 
346  /// Get all blobs, containing annotations for the seq-id.
347  void GetAnnotBlobs(const CSeq_id_Handle& idh,
348  TAnnotChoice choice,
349  TBlobSet& blobs);
350 
351  /// Get number of annotations grouped into a single blob.
352  Int8 GetAnnotCountForBlob(Int8 blob_id);
353 
354  typedef vector< AutoPtr<SLDS2_Annot> > TLDS2Annots;
355 
356  /// Get details about all annotations from a blob.
357  void GetAnnots(Int8 blob_id, TLDS2Annots& infos);
358 
359  /// Store the chunk info in the database.
360  void AddChunk(const SLDS2_File& file_info,
361  const SLDS2_Chunk& chunk_info);
362 
363  /// Load chunk containing the required stream position.
364  /// Return true on success.
365  bool FindChunk(const SLDS2_File& file_info,
366  SLDS2_Chunk& chunk_info,
367  Int8 stream_pos);
368 
369  /// Get seq-id for the given lds-id.
370  CRef<CSeq_id> GetSeq_idForLdsSeqId(int lds_id);
371 
372  /// Prepare to update the DB. Drop most indexes.
373  /// To rebuild the indexes call Analyze.
374  void PrepareUpdate(void);
375 
376  /// Start update transaction.
377  void BeginUpdate(void);
378 
379  /// End update transaction, commit the changes.
380  void EndUpdate(void);
381 
382  /// Cancel the update, rollback the changes.
383  void CancelUpdate(void);
384 
385  /// Start reading transaction, lock the db.
386  void BeginRead(void);
387 
388  /// End reading transaction, release the lock.
389  void EndRead(void);
390 
391  /// Analyze the DB.
392  void Analyze(void);
393 
394  /// Dump the selected table (use an empty string to dump table names
395  /// or * to dump all tables).
396  void Dump(const string& table, CNcbiOstream& out);
397 
398 private:
401 
402  // Execute multiple sql queries.
403  void x_ExecuteSqls(const char* sqls[], size_t len);
404  // Initialize 'get bioseqs' sql statement for the id handle.
405  CSQLITE_Statement& x_InitGetBioseqsSql(const CSeq_id_Handle& idh) const;
406 
407  enum EIdType {
408  eIdOriginal, // Seq-id is present in the original data.
409  eIdMatch // Seq-id was created as a match for an original one.
410  };
411 
412  // Return lds-id for the seq-id. Adds new lds-id if necessary.
413  Int8 x_GetLdsSeqId(const CSeq_id_Handle& id, EIdType id_type);
414 
415  // Load seq-id from the blob.
416  CRef<CSeq_id> x_BlobToSeq_id(CSQLITE_Statement& st,
417  int size_idx,
418  int data_idx) const;
419  // Prepared statements
420  enum EStatement {
421  eSt_GetFileNames = 0,
451  eSt_StatementsCount
452  };
453  typedef vector< AutoPtr<CSQLITE_Statement> > TStatements;
454 
455  // Structure to hold per-thread connection and all statements.
457  unique_ptr<CSQLITE_Connection> Connection;
459 
460  SLDS2_DbConnection(void);
461  };
463 
464  // Get SLDS2_DbConnection for the current thread (create one
465  // if necessary).
466  SLDS2_DbConnection& x_GetDbConnection(void) const;
467  // Access database connection for the current thread.
468  CSQLITE_Connection& x_GetConn(void) const;
469  // Reset connection and clear statements cache.
470  void x_ResetDbConnection(void);
471  // Get the requested statement, prepare it if necessary.
472  CSQLITE_Statement& x_GetStatement(EStatement st) const;
473 
474  string m_DbFile;
476  // Connections and prepared statements are per-thread.
480 };
481 
482 
485 
486 #endif // LDS2_DB_HPP__
CFastMutex –.
Definition: ncbimtx.hpp:667
EFormat
The formats are checked in the same order as declared here.
@ eUnknown
unknown format
EAccessMode GetAccessMode(void) const
Get current access mode.
Definition: lds2_db.hpp:272
CTls< SLDS2_DbConnection > TDbConnectionsTls
Definition: lds2_db.hpp:462
set< string > TStringSet
Definition: lds2_db.hpp:276
vector< CSeq_id_Handle > TSeqIds
List of seq-ids.
Definition: lds2_db.hpp:310
vector< SLDS2_Blob > TBlobSet
A set of ids (file_id, blob_id etc.).
Definition: lds2_db.hpp:332
CLDS2_Database & operator=(const CLDS2_Database &)
set< Int8 > TLdsIdSet
List of ids (blob_id, bioseq_id, lds_id etc.)
Definition: lds2_db.hpp:308
EAnnotChoice
Annotation type flags.
Definition: lds2_db.hpp:338
CLDS2_Database(const CLDS2_Database &)
@ eSt_GetLdsSeqIdForIntId
Definition: lds2_db.hpp:424
@ eSt_GetAnnotBlobsAllByTxtId
Definition: lds2_db.hpp:435
@ eSt_GetLdsSeqIdForTxtId
Definition: lds2_db.hpp:425
@ eSt_GetAnnotBlobsByIntId
Definition: lds2_db.hpp:432
@ eSt_GetSeq_idForLdsSeqId
Definition: lds2_db.hpp:449
@ eSt_GetAnnotInfosForBlob
Definition: lds2_db.hpp:437
@ eSt_GetBioseqIdForIntId
Definition: lds2_db.hpp:426
@ eSt_GetBioseqIdForTxtId
Definition: lds2_db.hpp:427
@ eSt_GetAnnotBlobsByTxtId
Definition: lds2_db.hpp:434
@ eSt_GetAnnotCountForBlob
Definition: lds2_db.hpp:436
@ eSt_GetAnnotBlobsAllByIntId
Definition: lds2_db.hpp:433
@ eSt_GetFileInfoByName
Definition: lds2_db.hpp:422
@ eSt_GetBioseqForIntId
Definition: lds2_db.hpp:430
@ eSt_DeleteFileByName
Definition: lds2_db.hpp:445
@ eSt_GetBioseqForTxtId
Definition: lds2_db.hpp:431
@ eSt_GetSeq_idSynonyms
Definition: lds2_db.hpp:450
EAccessMode m_Mode
Definition: lds2_db.hpp:479
CFastMutex m_DbInitMutex
Definition: lds2_db.hpp:477
int GetSQLiteFlags(void) const
Get SQLite flags, see CSQLITE_Connection::TOperationFlags.
Definition: lds2_db.hpp:267
vector< AutoPtr< CSQLITE_Statement > > TStatements
Definition: lds2_db.hpp:453
set< CSeq_id_Handle > TSeqIdSet
Definition: lds2_db.hpp:295
EAccessMode
Database access mode flags.
Definition: lds2_db.hpp:244
@ eWrite
Read/write access.
Definition: lds2_db.hpp:246
@ eRead
Read-only access.
Definition: lds2_db.hpp:245
const string & GetDbFile(void) const
Get database file name.
Definition: lds2_db.hpp:264
vector< AutoPtr< SLDS2_Annot > > TLDS2Annots
Definition: lds2_db.hpp:354
CRef< TDbConnectionsTls > m_DbConn
Definition: lds2_db.hpp:478
void PrepareUpdate(void)
Prepare to update the DB.
void CancelUpdate(void)
Cancel the update, rollback the changes.
string m_DbFile
Definition: lds2_db.hpp:474
CObject –.
Definition: ncbiobj.hpp:180
Connection to SQLite database.
SQL statement executing on SQLite database.
void Reset(void)
Reset the statement to release all locks and to be ready to execute again.
CTls –.
Definition: ncbithr.hpp:166
void(*)(CSeq_entry_Handle seh, IWorkbench *wb, const CSerialObject &obj) handler
static uch flags
const char * file_name[]
std::ofstream out("events_result.xml")
main entry point for tests
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
static const char * str(char *buf, int n)
Definition: stats.c:84
#define NULL
Definition: ncbistd.hpp:225
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
#define kEmptyStr
Definition: ncbistr.hpp:123
#define NCBI_LDS2_EXPORT
Definition: ncbi_export.h:584
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n table
int len
static MDB_envinfo info
Definition: mdb_load.c:37
mdb_mode_t mode
Definition: lmdb++.h:38
const struct ncbi::grid::netcache::search::fields::SIZE size
Multi-threading – mutexes; rw-locks; semaphore.
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
static Format format
Definition: njn_ioutil.cpp:53
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
void Dump(CSplitCacheApp *app, const C &obj, ESerialDataFormat format, const string &key, const string &suffix=kEmptyStr)
unique_ptr< CSQLITE_Connection > Connection
Definition: lds2_db.hpp:457
Info about seq-id used in an annotation.
Definition: lds2_db.hpp:198
SLDS2_AnnotIdInfo(void)
Definition: lds2_db.hpp:204
CRange< TSeqPos > TRange
Definition: lds2_db.hpp:199
Annotation info.
Definition: lds2_db.hpp:213
EType type
Definition: lds2_db.hpp:225
string name
Definition: lds2_db.hpp:228
map< CSeq_id_Handle, SLDS2_AnnotIdInfo > TIdMap
Definition: lds2_db.hpp:214
EType
Annotation type.
Definition: lds2_db.hpp:217
Int8 blob_id
Definition: lds2_db.hpp:226
TIdMap ref_ids
Definition: lds2_db.hpp:229
bool is_named
Definition: lds2_db.hpp:227
SLDS2_Annot(void)
Definition: lds2_db.hpp:231
Top level object info.
Definition: lds2_db.hpp:166
Int8 file_pos
Definition: lds2_db.hpp:185
SLDS2_Blob(void)
Definition: lds2_db.hpp:187
Int8 file_id
Definition: lds2_db.hpp:184
EBlobType type
Definition: lds2_db.hpp:183
EBlobType
Top-level object types.
Definition: lds2_db.hpp:168
@ eBioseq_set_element
Used for indexing individual seq-entries from a top-level bioseq-set.
Definition: lds2_db.hpp:175
@ eSeq_align_set
Definition: lds2_db.hpp:177
Chunk info.
Definition: lds2_db.hpp:107
SLDS2_Chunk(const SLDS2_Chunk &)
void InitData(size_t sz)
Definition: lds2_db.hpp:146
~SLDS2_Chunk(void)
Definition: lds2_db.hpp:132
SLDS2_Chunk(void)
Definition: lds2_db.hpp:118
void DeleteData(void)
Definition: lds2_db.hpp:137
Int8 stream_pos
Chunk position in the processed (e.g. unzipped) stream.
Definition: lds2_db.hpp:111
SLDS2_Chunk & operator=(const SLDS2_Chunk &)
size_t data_size
Extra data size.
Definition: lds2_db.hpp:116
void * data
Extra data required to load the chunk.
Definition: lds2_db.hpp:114
SLDS2_Chunk(Int8 raw, Int8 str)
Definition: lds2_db.hpp:125
Int8 raw_pos
Chunk position in the raw file.
Definition: lds2_db.hpp:109
LDS2 database.
Definition: lds2_db.hpp:57
TFormat format
Definition: lds2_db.hpp:62
bool exists(void) const
Definition: lds2_db.hpp:99
bool operator!=(const SLDS2_File &f) const
Definition: lds2_db.hpp:95
Int8 size
Definition: lds2_db.hpp:64
SLDS2_File(const string &file_name)
Definition: lds2_db.hpp:70
Int8 time
Definition: lds2_db.hpp:65
CFormatGuess::EFormat TFormat
Definition: lds2_db.hpp:58
string handler
Definition: lds2_db.hpp:63
bool operator==(const SLDS2_File &f) const
Definition: lds2_db.hpp:85
SLDS2_File(void)
Definition: lds2_db.hpp:68
Uint4 crc
Definition: lds2_db.hpp:66
string name
Definition: lds2_db.hpp:61
Int8 id
Definition: lds2_db.hpp:60
void Reset(void)
Definition: lds2_db.hpp:75
Definition: type.c:6
Modified on Sat Apr 13 11:47:10 2024 by modify_doxy.py rev. 669887