NCBI C++ ToolKit
seqdb_lmdb.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seqdb_lmdb.cpp 100751 2023-09-07 12:41:08Z boratyng $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /// @file seqdb_lmdb.cpp
31 /// Implements interface to interact with LMDB files
32 
33 #include <ncbi_pch.hpp>
36 #include <corelib/ncbifile.hpp>
38 #include <cmath>
39 
41 
42 #define SEQDB_LMDB_TIMING
43 #ifdef SEQDB_LMDB_TIMING
44 template<class T>
45 static string s_FormatNum(T value)
46 {
47  CNcbiOstrstream oss;
48  oss.imbue(std::locale(""));
49  oss << std::fixed << value;
50  return CNcbiOstrstreamToString(oss);
51 }
52 
53 #define SPEED(time, nentries) s_FormatNum((size_t)((nentries)/(time)))
54 #endif /* SEQDB_LMDB_TIMING */
55 
// Body of InitDbi(lmdb::env &env, ELMDBFileType file_type); the cross-reference
// index of this listing places its definition at listing line 56.
// NOTE(review): listing lines 56, 61, 64, 65 and 68 are missing from this
// extract, so the statements that actually open the database handles are not
// visible here -- confirm against the repository copy.
57 {
// All work happens inside one read-only transaction on the given environment.
58  auto txn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
59  if(file_type == eLMDB) {
60  try {
62  }
// Any exception raised by the (missing) statement in the try block is swallowed.
63  catch (...){ /* It's ok not to have acc in a db */}
66  }
67  else if (file_type == eTaxId2Offsets) {
69  }
70  else {
// Every other file type is rejected with CSeqDBException(eArgErr).
71  NCBI_THROW( CSeqDBException, eArgErr, "Invalid lmdb file type");
72  }
73  txn.commit();
74  txn.reset();
75  return;
76 }
77 
// Construct an environment wrapper for one LMDB file.
//
// fname      path of the LMDB file
// file_type  stored in m_FileType
// read_only  selects the read-only branch below; stored in m_ReadOnly
// map_size   map size for the writable branch; 0 leaves the LMDB default
//
// The reference count m_Count starts at 1. LMDB errors raised while opening
// are rethrown as CSeqDBException(eFileErr).
// NOTE(review): listing lines 92 and 97 are missing from this extract; they
// presumably open the environment read-only and initialise the dbi handles --
// confirm against the repository copy.
78 CBlastLMDBManager::CBlastEnv::CBlastEnv(const string & fname, ELMDBFileType file_type, bool read_only, Uint8 map_size) :
79  m_Filename(fname), m_FileType(file_type),m_Env(lmdb::env::create()), m_Count(1), m_ReadOnly(read_only)
80 {
81  const MDB_dbi num_db(3);
82  m_Env.set_max_dbs(num_db);
// Every handle slot starts as UINT_MAX, which the rest of the class treats as "not opened".
83  m_dbis.resize(eDbiMax, UINT_MAX);
84  if(m_ReadOnly) {
85  CFile tf(fname);
// Map size for reading: file length rounded up to the next multiple of 10000
// (always at least one extra unit). It is computed before the existence check
// but only used after it.
86  Uint8 readMapSize = (tf.GetLength()/10000 + 1) *10000;
87  if (!tf.Exists()) {
88  NCBI_THROW( CSeqDBException, eFileErr, "File " + fname + " not found. If you renamed any BLAST database files, please use original file names, and makeblastdb to rename the database. If you deleted any BLAST database files, you need to recreate the database.");
89  }
90  m_Env.set_mapsize(readMapSize);
91  try {
93  }
94  catch (lmdb::error& e) {
95  NCBI_THROW(CSeqDBException, eFileErr, "LMDB runtime error: " + (string)e.what());
96  }
98  }
99  else {
100  LOG_POST(Info <<"Initial Map Size: " << map_size);
101  /// map_size 0 means use lmdb default
102  if(map_size != 0) {
103  m_Env.set_mapsize(map_size);
104  }
105  try {
// Writable open: the path names the data file itself (MDB_NOSUBDIR), mode 0664.
106  m_Env.open(m_Filename.c_str(), MDB_NOSUBDIR , 0664);
107  }
108  catch (lmdb::error& e) {
109  NCBI_THROW(CSeqDBException, eFileErr, "LMDB runtime error: " + (string)e.what());
110  }
111  }
112 }
113 
// Cleanup body operating on m_dbis and m_Env.
// NOTE(review): the header (listing line 114) is missing from this extract;
// presumably this is the CBlastEnv destructor -- confirm.
115 {
// Close every database handle that was opened (slots still at UINT_MAX were never opened).
116  for (unsigned int i=0; i < m_dbis.size(); i++){
117  if (m_dbis[i] != UINT_MAX) {
118  mdb_dbi_close(m_Env,m_dbis[i]);
119  }
120  }
// The environment is closed only after its handles.
121  m_Env.close();
122 }
123 
// Body of GetDbi(EDbiType dbi_type); the cross-reference index of this listing
// places its definition at listing line 124 (the header line is missing here).
//
// Returns the stored handle for dbi_type. If that slot is still UINT_MAX the
// call throws CSeqDBException(eArgErr) with a message naming the missing table.
125 {
126  if(m_dbis[dbi_type] == UINT_MAX) {
127  string err = "DB contains no ";
128  switch (dbi_type) {
// Both volume tables share one message.
129  case eDbiVolinof:
130  case eDbiVolname:
131  err += "vol info.";
132  break;
133  case eDbiAcc2oid:
134  err += "accession info.";
135  break;
136  case eDbiTaxid2offset:
137  err += "tax id info";
138  break;
139  default:
// Unknown enumerator: throws immediately with its own message instead of err.
140  NCBI_THROW( CSeqDBException, eArgErr, "Invalid dbi type");
141  }
142  NCBI_THROW( CSeqDBException, eArgErr, err);
143 
144  }
145  return m_dbis[dbi_type];
146 }
147 
// Body of SetMapSize(Uint8 map_size); the cross-reference index of this listing
// places its definition at listing line 148 (the header line is missing here).
// The request is silently ignored when the environment is read-only.
149 {
150  if(!m_ReadOnly) {
151  m_Env.set_mapsize(map_size);
152  }
153 }
154 
// Body of the static CBlastLMDBManager::GetInstance(); the cross-reference index
// of this listing places its definition at listing line 155 (the header line is
// missing here). Returns the manager held in a function-local CSafeStatic; per
// the index entry for CSafeStatic<>::Get, the object is created on first use.
156  static CSafeStatic<CBlastLMDBManager> lmdb_manager;
157  return lmdb_manager.Get();
158 }
159 
160 lmdb::env & CBlastLMDBManager::GetReadEnvVol(const string & fname, MDB_dbi & db_volname, MDB_dbi & db_volinfo)
161 {
162  CBlastEnv* p = GetBlastEnv(fname, eLMDB);
163  db_volinfo = p->GetDbi(CBlastEnv::eDbiVolinof);
164  db_volname = p->GetDbi(CBlastEnv::eDbiVolname);
165  return p->GetEnv();
166 }
167 lmdb::env & CBlastLMDBManager::GetReadEnvAcc(const string & fname, MDB_dbi & db_acc, bool* opened)
168 {
169  CBlastEnv* p = GetBlastEnv(fname, eLMDB, opened);
170  db_acc = p->GetDbi(CBlastEnv::eDbiAcc2oid);
171  return p->GetEnv();
172 }
173 lmdb::env & CBlastLMDBManager::GetReadEnvTax(const string & fname, MDB_dbi & db_tax, bool* opened)
174 {
175  CBlastEnv* p = GetBlastEnv(fname, eTaxId2Offsets, opened);
176  db_tax = p->GetDbi(CBlastEnv::eDbiTaxid2offset);
177  return p->GetEnv();
178 }
179 
180 
// Tail of the signature and body of
// GetBlastEnv(const string &fname, ELMDBFileType file_type, bool *opened);
// the cross-reference index of this listing places its definition at listing
// line 181 (listing lines 181-182 are missing here).
//
// Looks up, under m_Mutex, the CBlastEnv registered for fname; if none exists a
// new one is created with the default constructor arguments and appended to
// m_EnvList. When 'opened' is non-null and *opened is false, one additional
// reference is taken and *opened is set to true.
183  bool* opened)
184 {
185  CFastMutexGuard guard(m_Mutex);
186  NON_CONST_ITERATE(list <CBlastEnv* >, itr, m_EnvList) {
// Entries are matched by file name only; file_type is not compared.
187  if((*itr)->GetFilename() == fname) {
// One reference for this call ...
188  (*itr)->AddReference();
189  if ( opened && !*opened ) {
// ... plus one more the first time the caller's flag is still false.
190  (*itr)->AddReference();
191  *opened = true;
192  }
193  return (*itr);
194  }
195  }
// Not registered yet: the constructor shown in this file starts m_Count at 1,
// which stands for the reference taken by this call.
196  CBlastEnv * p (new CBlastEnv(fname, file_type));
197  m_EnvList.push_back(p);
198  if ( opened && !*opened ) {
199  p->AddReference();
200  *opened = true;
201  }
202  return p;
203 }
204 
205 lmdb::env & CBlastLMDBManager::GetWriteEnv(const string & fname, Uint8 map_size)
206 {
207  CFastMutexGuard guard(m_Mutex);
208  NON_CONST_ITERATE(list <CBlastEnv* >, itr, m_EnvList) {
209  if((*itr)->GetFilename() == fname) {
210  (*itr)->AddReference();
211  return (*itr)->GetEnv();
212  }
213  }
214  CBlastEnv * p (new CBlastEnv(fname, eLMDBFileTypeEnd, false, map_size));
215  m_EnvList.push_back(p);
216  return p->GetEnv();
217 }
218 
219 
220 void CBlastLMDBManager::CloseEnv(const string & fname)
221 {
222  CFastMutexGuard guard(m_Mutex);
223  NON_CONST_ITERATE(list <CBlastEnv* >, itr, m_EnvList) {
224  if((*itr)->GetFilename() == fname) {
225  if((*itr)->RemoveReference() == 0) {
226  delete *itr;
227  itr = m_EnvList.erase(itr);
228  break;
229  }
230  }
231  }
232 }
233 
// Deletes every CBlastEnv still held in m_EnvList, regardless of its reference
// count, and then empties the list.
// NOTE(review): the header (listing line 234) is missing from this extract;
// presumably this is the CBlastLMDBManager destructor -- confirm. m_Mutex is
// not taken in the visible lines.
235 {
236  NON_CONST_ITERATE(list <CBlastEnv* >, itr, m_EnvList) {
237  delete *itr;
238  }
239  m_EnvList.clear();
240 }
241 
242 CSeqDBLMDB::CSeqDBLMDB(const string & fname)
243  : m_LMDBFile(fname),
244  m_Oid2SeqIdsFile(GetFileNameFromExistingLMDBFile(fname, ELMDBFileType::eOid2SeqIds)),
245  m_Oid2TaxIdsFile(GetFileNameFromExistingLMDBFile(fname, ELMDBFileType::eOid2TaxIds)),
246  m_TaxId2OidsFile(GetFileNameFromExistingLMDBFile(fname, ELMDBFileType::eTaxId2Oids)),
247  m_TaxId2OffsetsFile(GetFileNameFromExistingLMDBFile(fname, ELMDBFileType::eTaxId2Offsets)),
248  m_LMDBFileOpened(false),
249  m_NumOids(0)
250 {
251 }
252 
// Body of the virtual ~CSeqDBLMDB(); the cross-reference index of this listing
// places its definition at listing line 253 (the header line is missing here).
// NOTE(review): listing line 256 is also missing; presumably it releases the
// environment reference taken when m_LMDBFileOpened was set -- confirm.
254 {
255  if ( m_LMDBFileOpened ) {
257  m_LMDBFileOpened = false;
258  }
259 }
260 
// Look up the OID(s) stored for one accession string.
//
// accession  key searched in the accession table
// oids       cleared first, then filled with the OIDs found (may stay empty)
// allow_dup  when true, all duplicate values of the key are returned;
//            otherwise only the first one
//
// lmdb::error is translated into CSeqDBException(eArgErr).
// NOTE(review): listing lines 268, 292 and 295 are missing from this extract.
// 'env' and the value of 'dbi_handle' and 'dbname' must come from those lines
// -- confirm against the repository copy.
261 void
262 CSeqDBLMDB::GetOid(const string & accession, vector<blastdb::TOid> & oids, const bool allow_dup) const
263 {
264  try {
265  oids.clear();
266  {
267  MDB_dbi dbi_handle;
269  lmdb::dbi dbi(dbi_handle);
270  auto txn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
271  auto cursor = lmdb::cursor::open(txn, dbi);
272 
273  string acc = accession;
274  lmdb::val data2find(acc);
275 
// MDB_SET positions the cursor at the exact key; a false result means "not found".
276  if (cursor.get(data2find, MDB_SET)) {
277  lmdb::val k, val;
278  cursor.get(k, val, MDB_GET_CURRENT);
279  const char* d = val.data();
// The value's first four bytes are assembled least-significant byte first.
280  oids.push_back(((d[3] << 24)&0xFF000000) | ((d[2] << 16) & 0xFF0000) | ((d[1] << 8) & 0xFF00) | (d[0]&0xFF));
281 
282  if(allow_dup) {
// Walk the remaining data items stored under the same key.
283  while (cursor.get(k,val, MDB_NEXT_DUP)) {
284  d = val.data();
285  oids.push_back(((d[3] << 24)&0xFF000000) | ((d[2] << 16) & 0xFF0000) | ((d[1] << 8) & 0xFF00) | (d[0]&0xFF));
286  }
287  }
288  }
289  cursor.close();
290  txn.reset();
291  }
293  } catch (lmdb::error & e) {
294  string dbname;
296  if(e.code() == MDB_NOTFOUND) {
297  NCBI_THROW( CSeqDBException, eArgErr, "Seqid list specified but no accession table is found in " + dbname);
298  }
299  else {
300  NCBI_THROW( CSeqDBException, eArgErr, "Accessions to Oids lookup error in " + dbname);
301  }
302  }
303 }
304 
305 
// Read the name and OID count of every volume recorded in the LMDB file.
//
// vol_names     cleared, then resized to the number of volinfo entries;
//               element i receives the name stored under key i
// vol_num_oids  cleared, then resized the same way; element i receives the
//               OID count stored under key i
//
// Throws CSeqDBException(eArgErr) when the two tables have different entry
// counts, or when a volume name has no matching volinfo record. The sum of all
// OID counts is added to m_NumOids.
// NOTE(review): m_NumOids is accumulated with += and never reset in this
// method, so a second call on the same object would add the totals again --
// confirm callers invoke it only once.
// NOTE(review): listing line 352 is missing from this extract.
306 void CSeqDBLMDB::GetVolumesInfo(vector<string> & vol_names, vector<blastdb::TOid> & vol_num_oids)
307 {
308  MDB_dbi db_volname_handle;
309  MDB_dbi db_volinfo_handle;
310  lmdb::env & env = CBlastLMDBManager::GetInstance().GetReadEnvVol(m_LMDBFile, db_volname_handle, db_volinfo_handle);
311  vol_names.clear();
312  vol_num_oids.clear();
313  {
314  auto txn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
315  lmdb::dbi db_volname(db_volname_handle);
316  lmdb::dbi db_volinfo(db_volinfo_handle);
317  MDB_stat volinfo_stat, volname_stat;
318  lmdb::dbi_stat(txn, db_volinfo, &volinfo_stat);
319  lmdb::dbi_stat(txn, db_volname, &volname_stat);
// The two tables are expected to hold one record per volume each.
320  if(volinfo_stat.ms_entries != volname_stat.ms_entries) {
321  NCBI_THROW( CSeqDBException, eArgErr, "Volinfo error ");
322  }
323 
324  vol_names.resize(volinfo_stat.ms_entries);
325  vol_num_oids.resize(volinfo_stat.ms_entries);
326 
327  auto cursor_volname = lmdb::cursor::open(txn, db_volname);
328  auto cursor_volinfo = lmdb::cursor::open(txn, db_volinfo);
329  for (unsigned int i=0; i < volinfo_stat.ms_entries; i++) {
// The loop index itself (sizeof(Int4) bytes of it) is the lookup key in both tables.
330  lmdb::val data2find(&i, sizeof(Int4));
// A missing name record leaves vol_names[i] empty and vol_num_oids[i]
// value-initialized; no error is raised for that case.
331  if (cursor_volname.get(data2find, MDB_SET)) {
332  {
333  lmdb::val k, val;
334  cursor_volname.get(k, val, MDB_GET_CURRENT);
335  vol_names[i].assign(val.data(), val.size());
336  }
337  if (cursor_volinfo.get(data2find, MDB_SET)) {
338  lmdb::val k, val;
339  cursor_volinfo.get(k, val, MDB_GET_CURRENT);
340  const char* d = val.data();
// The value's first four bytes are assembled least-significant byte first.
341  vol_num_oids[i] = (((d[3] << 24)&0xFF000000) | ((d[2] << 16) & 0xFF0000) | ((d[1] << 8) & 0xFF00) | (d[0]&0xFF));
342  }
343  else {
344  NCBI_THROW( CSeqDBException, eArgErr, "No volinfo for " + vol_names[i]);
345  }
346  }
347  }
348  cursor_volname.close();
349  cursor_volinfo.close();
350  txn.reset();
351  }
353 
354  for(unsigned int j=0; j < vol_num_oids.size(); j++){
355  m_NumOids += vol_num_oids[j];
356  }
357 }
358 
// Look up one OID per accession.
//
// accessions  keys to search
// oids        cleared, then resized to accessions.size(); element i is the
//             first OID stored for accessions[i], or kSeqDBEntryNotFound when
//             the key is absent
//
// lmdb::error is translated into CSeqDBException(eArgErr).
// NOTE(review): listing lines 367, 389 and 392 are missing from this extract.
// 'env' and the value of 'dbi_handle' and 'dbname' must come from those lines
// -- confirm against the repository copy.
359 void
360 CSeqDBLMDB::GetOids(const vector<string>& accessions, vector<blastdb::TOid>& oids) const
361 {
362  try {
363  oids.clear();
// Pre-fill with the "not found" marker so that misses need no extra handling.
364  oids.resize(accessions.size(), kSeqDBEntryNotFound);
365 
366  MDB_dbi dbi_handle;
368  {
369  lmdb::dbi dbi(dbi_handle);
370  auto txn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
371 
372  auto cursor = lmdb::cursor::open(txn, dbi);
373 
374  unsigned int i=0;
375  for (i=0; i < accessions.size(); i++) {
376  string acc = accessions[i];
377  lmdb::val data2find(acc);
378  if (cursor.get(data2find, MDB_SET)) {
379  lmdb::val k, val;
380  cursor.get(k, val, MDB_GET_CURRENT);
381  const char* d = val.data();
// Only the item at the cursor position is read; duplicates under the same key are not visited.
382  oids[i] = (((d[3] << 24)&0xFF000000) | ((d[2] << 16) & 0xFF0000) | ((d[1] << 8) & 0xFF00) | (d[0]&0xFF));
383  }
384  }
385 
386  cursor.close();
387  txn.reset();
388  }
390  } catch (lmdb::error & e) {
391  string dbname;
393  if(e.code() == MDB_NOTFOUND) {
394  NCBI_THROW( CSeqDBException, eArgErr, "Seqid list specified but no accession table is found in " + dbname);
395  }
396  else {
397  NCBI_THROW( CSeqDBException, eArgErr, "Accessions to Oids lookup error in " + dbname);
398  }
399  }
400 }
401 
// Pairs an OID with one seq-id string.
// NOTE(review): the opening line (listing line 402) and the 'oid' member
// declaration (listing line 405, listed in the cross-reference index as
// blastdb::TOid oid) are missing from this extract.
403 {
404  SOidSeqIdPair(blastdb::TOid o, const string & i) : oid(o), id(i) {}
406  string id;
// Ordering used for sorting: by oid first, then by id string for equal oids.
407  static bool cmp_oid(const SOidSeqIdPair & v, const SOidSeqIdPair & k) {
408  if(v.oid == k.oid) {
409  return (v.id < k.id);
410  }
411  return (v.oid < k.oid );
412  }
413 };
414 
// Reader over a memory-mapped oid-to-seqid lookup file.
// NOTE(review): the class header (listing line 415) and the m_IndexStart
// declaration (listing line 431, listed in the cross-reference index as
// Uint8 * m_IndexStart) are missing from this extract.
416 {
417 public:
// Both pointers start at the beginning of the mapping; a null mapping is
// reported as CSeqDBException(eArgErr).
418  CLookupSeqIds(CMemoryFile & file): m_IndexStart((Uint8*) file.GetPtr()), m_DataStart((char *) file.GetPtr()) {
419  if(m_IndexStart == NULL){
420  NCBI_THROW( CSeqDBException, eArgErr, "Failed to open oid-to-seqid lookup file");
421  }
422 
// The first 8-byte word is the number of OIDs; the index follows it.
423  Uint8 num_of_oids = *m_IndexStart;
424  m_IndexStart ++;
// The data area begins after the count word plus one 8-byte index entry per OID.
425  m_DataStart += (8 * (num_of_oids + 1));
426  }
427 
428  inline void GetSeqIdListForOid(blastdb::TOid oid, vector<string> & idlist);
429 private:
430 
432  char * m_DataStart;
433 };
434 
435 void CLookupSeqIds::GetSeqIdListForOid(blastdb::TOid oid, vector<string> & idlist)
436 {
437  Uint8 * index_ptr = m_IndexStart + oid;
438  Char * end = m_DataStart + (*index_ptr);
439  index_ptr--;
440  Char * begin = (oid == 0) ? m_DataStart:m_DataStart + (*index_ptr);
441  while (begin < end) {
442  unsigned char id_len = *begin;
443  begin ++;
444  if(id_len == 0xFF) {
445  Uint4 long_id_len = *((Uint4 *) begin);
446  begin +=4;
447  string id;
448  id.assign(begin, long_id_len);
449  begin += long_id_len;
450  idlist.push_back(id);
451  }
452  else {
453  string id;
454  id.assign(begin, id_len);
455  begin += id_len;
456  idlist.push_back(id);
457  }
458  }
459 }
460 
// Walk the ids stored in the file for one OID and the ids supplied by the
// caller for the same OID in parallel.
//
// file_idlist   ids read from the lookup file; cleared before returning
// input_idlist  ids supplied by the caller; cleared before returning
//
// Returns true only when the walk reaches the end of file_idlist, i.e. every
// file id was matched by the input list.
// NOTE(review): listing lines 473, 476 and 491 are missing from this extract;
// they presumably construct 'seq_id', 'file_seqid' and 'file_seq_id' -- confirm
// against the repository copy.
461 bool s_CompareIdList(vector<string> & file_idlist, vector<string> &input_idlist)
462 {
463  bool rv = false;
464  vector<string>::iterator f_itr = file_idlist.begin();
465  vector<string>::iterator i_itr = input_idlist.begin();
466  while(f_itr != file_idlist.end() && i_itr != input_idlist.end()) {
// Exact string match: advance both lists.
467  if(*i_itr == *f_itr) {
468  i_itr++;
469  f_itr++;
470  continue;
471  }
472  else {
474  // Input id is PDB with just mol id
475  if(seq_id.IsPdb() && !seq_id.GetPdb().IsSetChain_id()) {
477  if (file_seqid.IsPdb() && file_seqid.GetPdb().GetMol().Get() == *i_itr) {
478  f_itr++;
479  string tmp_pdb = *i_itr;
// NOTE(review): find_first_of(tmp_pdb) == 0 is true whenever the string's first
// character is ANY character contained in tmp_pdb, which is looser than a
// prefix test -- confirm this is intended (applies to both loops below).
480  while ((f_itr != file_idlist.end()) && ((*f_itr).find_first_of(tmp_pdb) == 0)){
481  f_itr++;
482  }
483  // Skip pdb id in input list but with chain id
484  while ((i_itr != input_idlist.end()) && ((*i_itr).find_first_of(tmp_pdb) == 0)){
485  i_itr++;
486  }
487  continue;
488  }
489  }
490  else {
// Input id equals the file id without its version.
492  if( file_seq_id.GetSeqIdString(false) == *i_itr) {
493  i_itr++;
494  // Skip identical id in input list but with version
495  if((i_itr != input_idlist.end()) && (file_seq_id.GetSeqIdString(true) == *i_itr)){
496  i_itr++;
497  }
498  f_itr++;
499  continue;
500  }
501  }
// No rule matched the current pair: stop with rv still false unless the file
// list happens to be exhausted.
502  break;
503  }
504  }
505  if(f_itr == file_idlist.end()){
506  rv=true;
507  }
508 
509  file_idlist.clear();
510  input_idlist.clear();
511  return rv;
512 }
513 
514 void
515 CSeqDBLMDB::NegativeSeqIdsToOids(const vector<string>& ids, vector<blastdb::TOid>& rv) const
516 {
517  rv.clear();
518  vector<blastdb::TOid> oids;
519  GetOids(ids, oids);
520  vector<SOidSeqIdPair> pairs;
521  for (unsigned int i=0; i < ids.size(); i++) {
522  if(oids[i] == kSeqDBEntryNotFound) {
523  continue;
524  }
525  else {
526  SOidSeqIdPair p(oids[i], ids[i]);
527  pairs.push_back(p);
528  }
529  }
530 
531  if(pairs.size() == 0) {
532  return;
533  }
534 
535  sort (pairs.begin(), pairs.end(), SOidSeqIdPair::cmp_oid);
536 
537  CMemoryFile oid_file(m_Oid2SeqIdsFile);
538  CLookupSeqIds lookup(oid_file);
539  blastdb::TOid current_oid = 0;
540  unsigned int i=0;
541  while (i < pairs.size()) {
542  vector<string> file_idlist;
543  vector<string> input_idlist;
544  current_oid = pairs[i].oid;
545  lookup.GetSeqIdListForOid(current_oid, file_idlist);
546  while ((i < pairs.size()) && (current_oid == pairs[i].oid)) {
547  input_idlist.push_back(pairs[i].id);
548  i++;
549  }
550  if(s_CompareIdList(file_idlist, input_idlist)) {
551  rv.push_back(current_oid);
552  }
553 
554  }
555 
556 }
557 
// Collect the tax id of every record visited in the taxonomy table.
//
// tax_ids  cleared first; receives one entry per cursor step
//
// lmdb::error is translated into CSeqDBException(eArgErr).
// NOTE(review): listing lines 564, 580 and 588 are missing from this extract.
// 'env' and the value of 'dbi_handle' and 'dbname' must come from those lines
// -- confirm against the repository copy.
558 void CSeqDBLMDB::GetDBTaxIds(vector<TTaxId> & tax_ids) const
559 {
560 
561  tax_ids.clear();
562  try {
563  MDB_dbi dbi_handle;
565  {
566  auto txn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
567  auto dbi(dbi_handle);
568  auto cursor = lmdb::cursor::open(txn, dbi);
569  lmdb::val key;
// MDB_NEXT steps through the table in order until the end is reached.
570  while (cursor.get(key, MDB_NEXT)) {
// The key bytes are read directly as a host-order Int4.
571  TTaxId taxid = TAX_ID_FROM(Int4, *((Int4 *)key.data()));
572  tax_ids.push_back(taxid);
573  }
574  cursor.close();
575  txn.reset();
576  }
577  }
578  catch (lmdb::error & e) {
579  string dbname;
581  if(e.code() == MDB_NOTFOUND) {
582  NCBI_THROW( CSeqDBException, eArgErr, "No taxonomy info found in " + dbname);
583  }
584  else {
585  NCBI_THROW( CSeqDBException, eArgErr, "Taxonomy Id to Oids lookup error in " + dbname);
586  }
587  }
589 }
590 
// Collect the OIDs associated with a set of tax ids.
//
// tax_ids        tax ids to look up
// oids           cleared first; receives each matching OID once, in ascending
//                order
// tax_ids_found  cleared first; receives the tax ids that were present in the
//                table
//
// Step 1 reads, for every tax id, the 8-byte offsets stored under it.
// Step 2 follows each offset into the taxid-to-oids file, where a 4-byte count
// is followed by that many 4-byte OIDs, and merges the OIDs through a bitmap.
// lmdb::error is translated into CSeqDBException(eArgErr).
// NOTE(review): listing lines 598, 628 and 664 are missing from this extract.
// 'env' and the value of 'dbi_handle' and 'dbname' must come from those lines
// -- confirm against the repository copy.
591 void CSeqDBLMDB::GetOidsForTaxIds(const set<TTaxId> & tax_ids, vector<blastdb::TOid>& oids, vector<TTaxId> & tax_ids_found) const
592 {
593  try {
594  oids.clear();
595  tax_ids_found.clear();
596  vector<Uint8> offsets;
597  MDB_dbi dbi_handle;
599  {
600  auto txn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
601  lmdb::dbi dbi(dbi_handle);
602  auto cursor = lmdb::cursor::open(txn, dbi);
603  ITERATE(set<TTaxId>, itr, tax_ids) {
604  Int4 tax_id = TAX_ID_TO(Int4, *itr);
605  lmdb::val data2find(tax_id);
606 
607  if (cursor.get(data2find, MDB_SET)) {
608  lmdb::val k, val;
609  cursor.get(k, val, MDB_GET_CURRENT);
610  const char* d = val.data();
// Eight value bytes are assembled least-significant byte first into one offset.
611  offsets.push_back((((Uint8) d[7] << 56) &0xFF00000000000000) | (((Uint8) d[6] << 48) & 0xFF000000000000) |
612  (((Uint8) d[5] << 40) &0xFF0000000000) | (((Uint8) d[4] << 32) & 0xFF00000000) |
613  (((Uint8) d[3] << 24) &0xFF000000) | (((Uint8) d[2] << 16) & 0xFF0000) |
614  (((Uint8) d[1] << 8) &0xFF00) | ((Uint8) d[0]&0xFF));
// A tax id may own several offsets, stored as duplicates of the same key.
615  while (cursor.get(k,val, MDB_NEXT_DUP)) {
616  d = val.data();
617  offsets.push_back((((Uint8) d[7] << 56) &0xFF00000000000000) | (((Uint8) d[6] << 48) & 0xFF000000000000) |
618  (((Uint8) d[5] << 40) &0xFF0000000000) | (((Uint8) d[4] << 32) & 0xFF00000000) |
619  (((Uint8) d[3] << 24) &0xFF000000) | (((Uint8) d[2] << 16) & 0xFF0000) |
620  (((Uint8) d[1] << 8) &0xFF00) | ((Uint8) d[0]&0xFF));
621  }
622  tax_ids_found.push_back(*itr);
623  }
624  }
625  cursor.close();
626  txn.reset();
627  }
// One flag per OID, used to drop duplicates across tax ids.
// NOTE(review): the bitmap has m_NumOids elements and is indexed below with
// values read from the file without a bounds check; in the visible code
// m_NumOids is only increased by GetVolumesInfo -- confirm it is populated
// before this method is called.
629  vector<bool> oids_set(m_NumOids, false);
630 
631  CMemoryFile oid_file(m_TaxId2OidsFile);
632  const char * start_ptr = (char *) oid_file.GetPtr();
633  for (unsigned int i=0; i < offsets.size(); i++) {
634  Uint4 * list_ptr = (Uint4 * ) (start_ptr + offsets[i]);
635  Uint4 num_of_oids = *list_ptr;
636  Uint4 count = 0 ;
637  list_ptr ++;
638  while(count < num_of_oids) {
639  if(!oids_set[*list_ptr]) {
640  oids.push_back(*list_ptr);
641  oids_set[*list_ptr] = true;
642  }
643  count++;
644  list_ptr++;
645  }
646  }
647 
// Pick the cheaper way to get a sorted result: sort the collected OIDs when
// n*log(n) is below the total OID count, otherwise rebuild the list by
// scanning the bitmap in order.
648  int oids_sz = oids.size();
649  if((oids_sz*log(oids_sz)) < m_NumOids) {
650  sort(oids.begin(), oids.end());
651  }
652  else {
653  oids.resize(0);
654  oids.reserve(oids_sz);
655  for (int i=0; i < oids_set.size(); i++) {
656  if(oids_set[i]) {
657  oids.push_back(i);
658  }
659  }
660  }
661 
662  } catch (lmdb::error & e) {
663  string dbname;
665  if(e.code() == MDB_NOTFOUND) {
666  NCBI_THROW( CSeqDBException, eArgErr, "No taxonomy info found in " + dbname);
667  }
668  else {
669  NCBI_THROW( CSeqDBException, eArgErr, "Taxonomy Id to Oids lookup error in " + dbname);
670  }
671  }
672 }
673 
674 
// Reader over a memory-mapped oid-to-taxids lookup file.
// NOTE(review): the class header (listing line 675), the constructor header
// (listing line 678, listed in the cross-reference index as
// CLookupTaxIds(CMemoryFile &file)) and both member declarations (listing
// lines 691-692, listed as Uint8 * m_IndexStart and Int4 * m_DataStart) are
// missing from this extract.
676 {
677 public:
// A null mapping is reported as CSeqDBException(eArgErr).
679  if(m_IndexStart == NULL){
680  NCBI_THROW( CSeqDBException, eArgErr, "Failed to open oid-to-taxids lookup file");
681  }
682 
// The first 8-byte word is the number of OIDs; the index follows it.
683  Uint8 num_of_oids = *m_IndexStart;
684  m_IndexStart ++;
// The advance is counted in m_DataStart's element type: two elements per 8-byte
// word when it is Int4*, as the cross-reference index states.
685  m_DataStart += (2* (num_of_oids + 1));
686  }
687 
688  inline void GetTaxIdListForOid(blastdb::TOid oid, vector<TTaxId> & taxid_list);
689 private:
690 
693 };
694 
695 void CLookupTaxIds::GetTaxIdListForOid(blastdb::TOid oid, vector<TTaxId> & taxid_list)
696 {
697  taxid_list.clear();
698  Uint8 * index_ptr = m_IndexStart + oid;
699  Int4 * end = m_DataStart + (*index_ptr);
700  index_ptr--;
701  Int4 * begin = (oid == 0) ? m_DataStart:m_DataStart + (*index_ptr);
702  while (begin < end) {
703  taxid_list.push_back(TAX_ID_FROM(Int4, *begin));
704  begin++;
705  }
706 }
707 
708 void
709 CSeqDBLMDB::NegativeTaxIdsToOids(const set<TTaxId>& tax_ids, vector<blastdb::TOid>& rv, vector<TTaxId> & tax_ids_found) const
710 {
711  rv.clear();
712  vector<blastdb::TOid> oids;
713  GetOidsForTaxIds(tax_ids, oids, tax_ids_found);
714  CMemoryFile oid_file(m_Oid2TaxIdsFile);
715  set<TTaxId> tax_id_list(tax_ids_found.begin(), tax_ids_found.end());
716  CLookupTaxIds lookup(oid_file);
717  for(unsigned int i=0; i < oids.size(); i++) {
718  vector<TTaxId> file_list;
719  lookup.GetTaxIdListForOid(oids[i], file_list);
720  if(file_list.size() > tax_ids.size()) {
721  continue;
722  }
723  else {
724  unsigned int j = 0;
725  for(; j < file_list.size(); j++) {
726  if(tax_id_list.find(file_list[j]) == tax_id_list.end()) {
727  break;
728  }
729  }
730  if(j == file_list.size()) {
731  rv.push_back(oids[i]);
732  }
733  }
734  }
735 }
736 
737 void CSeqDBLMDB::GetTaxIdsForOids(const vector<blastdb::TOid> & oids, set<TTaxId> & tax_ids) const
738 {
739  CMemoryFile oid_file(m_Oid2TaxIdsFile);
740  CLookupTaxIds lookup(oid_file);
741  for(unsigned int i=0; i < oids.size(); i++) {
742  vector<TTaxId> taxid_list;
743  lookup.GetTaxIdListForOid(oids[i], taxid_list);
744  tax_ids.insert(taxid_list.begin(), taxid_list.end());
745  }
746 }
747 
748 
749 string BuildLMDBFileName(const string& basename, bool is_protein, bool use_index, unsigned int index)
750 {
751  if (basename.empty()) {
752  throw invalid_argument("Basename is empty");
753  }
754 
755  string vol_str=kEmptyStr;
756  if(use_index) {
757  vol_str = (index > 9) ?".": ".0";
758  vol_str += NStr::UIntToString(index);
759  }
760  return basename + vol_str + (is_protein ? ".pdb" : ".ndb");
761 }
762 
763 string GetFileNameFromExistingLMDBFile(const string& lmdb_filename, ELMDBFileType file_type)
764 {
765 
766  string filename (lmdb_filename, 0, lmdb_filename.size() - 2);
767  switch (file_type) {
768  case eLMDB :
769  filename += "db";
770  break;
771  case eOid2SeqIds :
772  filename +="os";
773  break;
774  case eOid2TaxIds :
775  filename +="ot";
776  break;
777  case eTaxId2Offsets :
778  filename += "tf";
779  break;
780  case eTaxId2Oids :
781  filename += "to";
782  break;
783  default :
784  NCBI_THROW( CSeqDBException, eArgErr, "Invalid LMDB file type");
785  break;
786  }
787  return filename;
788 }
789 
790 void DeleteLMDBFiles(bool db_is_protein, const string & filename)
791 {
792  vector<string> extn;
793  SeqDB_GetLMDBFileExtensions(db_is_protein, extn);
794  ITERATE(vector<string>, itr, extn) {
795  CFile f(filename + "." + (*itr));
796  if (f.Exists()) {
797  f.Remove();
798  }
799  }
800 }
801 
802 
804 
MDB_dbi GetDbi(EDbiType dbi_type)
Definition: seqdb_lmdb.cpp:124
void SetMapSize(Uint8 map_size)
Definition: seqdb_lmdb.cpp:148
void InitDbi(lmdb::env &env, ELMDBFileType file_type)
Definition: seqdb_lmdb.cpp:56
vector< MDB_dbi > m_dbis
Definition: seqdb_lmdb.hpp:177
CBlastEnv(const string &fname, ELMDBFileType file_type, bool read_only=true, Uint8 map_size=0)
Definition: seqdb_lmdb.cpp:78
Class for managing LMDB env; each env should only be opened once.
Definition: seqdb_lmdb.hpp:139
static CBlastLMDBManager & GetInstance()
Definition: seqdb_lmdb.cpp:155
lmdb::env & GetWriteEnv(const string &fname, Uint8 map_size)
Definition: seqdb_lmdb.cpp:205
lmdb::env & GetReadEnvAcc(const string &fname, MDB_dbi &db_acc, bool *opened=0)
Definition: seqdb_lmdb.cpp:167
CFastMutex m_Mutex
Definition: seqdb_lmdb.hpp:185
list< CBlastEnv * > m_EnvList
Definition: seqdb_lmdb.hpp:184
lmdb::env & GetReadEnvTax(const string &fname, MDB_dbi &db_tax, bool *opened=0)
Definition: seqdb_lmdb.cpp:173
void CloseEnv(const string &fname)
Definition: seqdb_lmdb.cpp:220
CBlastEnv * GetBlastEnv(const string &fname, ELMDBFileType file_type, bool *opened=0)
Definition: seqdb_lmdb.cpp:181
lmdb::env & GetReadEnvVol(const string &fname, MDB_dbi &db_volname, MDB_dbi &db_volinfo)
Definition: seqdb_lmdb.cpp:160
CFile –.
Definition: ncbifile.hpp:1605
void GetSeqIdListForOid(blastdb::TOid oid, vector< string > &idlist)
Definition: seqdb_lmdb.cpp:435
CLookupSeqIds(CMemoryFile &file)
Definition: seqdb_lmdb.cpp:418
Uint8 * m_IndexStart
Definition: seqdb_lmdb.cpp:431
char * m_DataStart
Definition: seqdb_lmdb.cpp:432
Int4 * m_DataStart
Definition: seqdb_lmdb.cpp:692
void GetTaxIdListForOid(blastdb::TOid oid, vector< TTaxId > &taxid_list)
Definition: seqdb_lmdb.cpp:695
Uint8 * m_IndexStart
Definition: seqdb_lmdb.cpp:691
CLookupTaxIds(CMemoryFile &file)
Definition: seqdb_lmdb.cpp:678
CMemoryFile –.
Definition: ncbifile.hpp:2861
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
CSafeStatic<>::
T & Get(void)
Create the variable if not created yet, return the reference.
CSeqDBException.
Definition: seqdbcommon.hpp:73
string m_Oid2TaxIdsFile
Definition: seqdb_lmdb.hpp:105
string m_Oid2SeqIdsFile
Definition: seqdb_lmdb.hpp:104
void GetOids(const vector< string > &accessions, vector< blastdb::TOid > &oids) const
Get OIDs for a vector of string accessions.
Definition: seqdb_lmdb.cpp:360
void GetOid(const string &accession, vector< blastdb::TOid > &oids, const bool allow_dup=false) const
Get OIDs for single string accession.
Definition: seqdb_lmdb.cpp:262
string m_TaxId2OffsetsFile
Definition: seqdb_lmdb.hpp:107
bool m_LMDBFileOpened
Definition: seqdb_lmdb.hpp:108
void GetTaxIdsForOids(const vector< blastdb::TOid > &oids, set< TTaxId > &tax_ids) const
Get Tax Ids for oid list.
Definition: seqdb_lmdb.cpp:737
CSeqDBLMDB(const string &fname)
Definition: seqdb_lmdb.cpp:242
string m_TaxId2OidsFile
Definition: seqdb_lmdb.hpp:106
string m_LMDBFile
Definition: seqdb_lmdb.hpp:103
blastdb::TOid m_NumOids
Definition: seqdb_lmdb.hpp:109
void GetOidsForTaxIds(const set< TTaxId > &tax_ids, vector< blastdb::TOid > &oids, vector< TTaxId > &tax_ids_found) const
Get Oids for Tax Ids list; identical Oids are merged.
Definition: seqdb_lmdb.cpp:591
void NegativeSeqIdsToOids(const vector< string > &ids, vector< blastdb::TOid > &rv) const
Get Oids excluded from a vector of input accessions. An oid only gets excluded if all its seqids are fo...
Definition: seqdb_lmdb.cpp:515
void NegativeTaxIdsToOids(const set< TTaxId > &ids, vector< blastdb::TOid > &rv, vector< TTaxId > &tax_ids_found) const
Get Oids to exclude for Tax ids @param ids Input tax ids to exclude / Output tax ids found.
Definition: seqdb_lmdb.cpp:709
void GetVolumesInfo(vector< string > &vol_names, vector< blastdb::TOid > &vol_num_oids)
Return info for all volumes.
Definition: seqdb_lmdb.cpp:306
virtual ~CSeqDBLMDB()
Definition: seqdb_lmdb.cpp:253
void GetDBTaxIds(vector< TTaxId > &tax_ids) const
Get All Unique Tax Ids for db @param tax_ids Return all unique tax ids found in db.
Definition: seqdb_lmdb.cpp:558
CSeqDB_Path.
CSeqDB_Substring FindBaseName() const
Returns the portion of this path containing the base name.
void GetString(string &s) const
Return the data by assigning it to a string.
static cursor open(MDB_txn *const txn, const MDB_dbi dbi)
Creates an LMDB cursor.
Definition: lmdb++.h:1740
Resource class for `MDB_dbi` handles.
Definition: lmdb++.h:1395
static dbi open(MDB_txn *const txn, const char *const name=nullptr, const unsigned int flags=default_flags)
Opens a database handle.
Definition: lmdb++.h:1412
MDB_dbi handle() const noexcept
Returns the underlying `MDB_dbi` handle.
Definition: lmdb++.h:1464
Resource class for `MDB_env*` handles.
Definition: lmdb++.h:1094
env & open(const char *const path, const unsigned int flags=default_flags, const mode mode=default_mode)
Opens this environment.
Definition: lmdb++.h:1203
env & set_max_dbs(const MDB_dbi count)
Definition: lmdb++.h:1243
env & set_mapsize(const std::size_t size)
Definition: lmdb++.h:1225
Base class for LMDB exception conditions.
Definition: lmdb++.h:63
virtual const char * what() const noexcept
Returns the underlying LMDB error code.
Definition: lmdb++.h:98
int code() const noexcept
Returns the underlying LMDB error code.
Definition: lmdb++.h:84
static txn begin(MDB_env *const env, MDB_txn *const parent=nullptr, const unsigned int flags=default_flags)
Creates a new LMDB transaction.
Definition: lmdb++.h:1277
Wrapper class for `MDB_val` structures.
Definition: lmdb++.h:948
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
size_type size() const
Definition: set.hpp:132
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
static ush * file_type
#define T(s)
Definition: common.h:230
#define false
Definition: bool.h:36
static int lookup(const char *name, const struct lookup_int *table)
Definition: attributes.c:50
static HENV env
Definition: transaction2.c:38
#define basename(path)
Definition: replacements.h:116
static FILE * f
Definition: readconf.c:23
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define TAX_ID_TO(T, tax_id)
Definition: ncbimisc.hpp:1110
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
#define NULL
Definition: ncbistd.hpp:225
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
Int8 GetLength(void) const
Get size of file.
Definition: ncbifile.cpp:3204
void * GetPtr(void) const
Get pointer to beginning of data.
Definition: ncbifile.hpp:4282
virtual bool Exists(void) const
Check existence of file.
Definition: ncbifile.hpp:4039
const TPrim & Get(void) const
Definition: serialbase.hpp:347
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2145
@ fParse_RawText
Try to ID raw non-numeric accessions.
Definition: Seq_id.hpp:81
@ fParse_PartialOK
Warn rather than throwing an exception when a FASTA-style ID set contains unparsable portions,...
Definition: Seq_id.hpp:80
@ fParse_AnyLocal
Treat otherwise unidentified strings as local accessions as long as they don't resemble FASTA-style I...
Definition: Seq_id.hpp:90
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define kEmptyStr
Definition: ncbistr.hpp:123
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
Definition: ncbistr.hpp:5103
const TPdb & GetPdb(void) const
Get the variant data.
Definition: Seq_id_.cpp:435
bool IsSetChain_id(void) const
chain identifier; length-independent generalization of 'chain' Check if a value has been assigned to ...
bool IsPdb(void) const
Check if variant Pdb is selected.
Definition: Seq_id_.hpp:922
const TMol & GetMol(void) const
Get the Mol member data.
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
#define MDB_NOTFOUND
key/data pair not found (EOF)
Definition: lmdb.h:407
void mdb_dbi_close(MDB_env *env, MDB_dbi dbi)
Close a database handle.
Definition: mdb.c:9867
@ MDB_SET
Position at specified key.
Definition: lmdb.h:390
@ MDB_NEXT_DUP
Position at next data item of current key.
Definition: lmdb.h:380
@ MDB_NEXT
Position at next data item.
Definition: lmdb.h:379
@ MDB_GET_CURRENT
Return key/data at current cursor position.
Definition: lmdb.h:372
#define MDB_INTEGERKEY
numeric keys in native byte order: either unsigned int or size_t.
Definition: lmdb.h:317
#define MDB_DUPFIXED
with MDB_DUPSORT, sorted dup items have fixed size
Definition: lmdb.h:319
#define MDB_DUPSORT
use sorted duplicates
Definition: lmdb.h:314
#define MDB_NOLOCK
don't do any locking, caller must manage their own locks
Definition: lmdb.h:301
#define MDB_NOSUBDIR
no environment directory
Definition: lmdb.h:287
#define MDB_RDONLY
read only
Definition: lmdb.h:291
size_t ms_entries
Number of data items.
Definition: lmdb.h:463
unsigned int MDB_dbi
A handle for an individual database in the DB environment.
Definition: lmdb.h:241
FILE * file
int i
constexpr auto sort(_Init &&init)
<lmdb++.h> - C++11 wrapper for LMDB.
Definition: lmdb++.h:37
static void dbi_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *stat)
Definition: lmdb++.h:659
const struct ncbi::grid::netcache::search::fields::KEY key
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
static const char * locale
Definition: pcre2grep.c:212
static PCRE2_SIZE * offsets
Definition: pcre2grep.c:266
#define count
void DeleteLMDBFiles(bool db_is_protein, const string &filename)
Definition: seqdb_lmdb.cpp:790
string BuildLMDBFileName(const string &basename, bool is_protein, bool use_index, unsigned int index)
Build the canonical LMDB file name for BLAST databases.
Definition: seqdb_lmdb.cpp:749
bool s_CompareIdList(vector< string > &file_idlist, vector< string > &input_idlist)
Definition: seqdb_lmdb.cpp:461
static string s_FormatNum(T value)
Definition: seqdb_lmdb.cpp:45
string GetFileNameFromExistingLMDBFile(const string &lmdb_filename, ELMDBFileType file_type)
Definition: seqdb_lmdb.cpp:763
Defines interface to interact with LMDB files.
const string taxid2offset_str
Definition: seqdb_lmdb.hpp:199
ELMDBFileType
Definition: seqdb_lmdb.hpp:121
@ eLMDBFileTypeEnd
Definition: seqdb_lmdb.hpp:127
@ eLMDB
Definition: seqdb_lmdb.hpp:122
@ eTaxId2Offsets
Definition: seqdb_lmdb.hpp:125
@ eOid2TaxIds
Definition: seqdb_lmdb.hpp:124
@ eOid2SeqIds
Definition: seqdb_lmdb.hpp:123
@ eTaxId2Oids
Definition: seqdb_lmdb.hpp:126
const string volinfo_str
Definition: seqdb_lmdb.hpp:196
const string volname_str
Definition: seqdb_lmdb.hpp:197
const string acc2oid_str
Definition: seqdb_lmdb.hpp:198
void SeqDB_GetLMDBFileExtensions(bool db_is_protein, vector< string > &extn)
Retrieves file extensions for BLAST LMDB files.
const blastdb::TOid kSeqDBEntryNotFound
Int4 TOid
Ordinal ID in BLAST databases.
Definition: seqdbcommon.hpp:58
This file defines several SeqDB utility functions related to byte order and file system portability.
Statistics for a database in the environment.
Definition: lmdb.h:456
SOidSeqIdPair(blastdb::TOid o, const string &i)
Definition: seqdb_lmdb.cpp:404
static bool cmp_oid(const SOidSeqIdPair &v, const SOidSeqIdPair &k)
Definition: seqdb_lmdb.cpp:407
blastdb::TOid oid
Definition: seqdb_lmdb.cpp:405
Modified on Fri Sep 20 14:57:50 2024 by modify_doxy.py rev. 669887