1 /* $Id: seqdb_lmdb.cpp 100751 2023-09-07 12:41:08Z boratyng $
2  * ===========================================================================
3  *
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
30 /// @file seqdb_lmdb.cpp
31 /// Implements interface to interact with LMDB files
33 #include <ncbi_pch.hpp>
36 #include <corelib/ncbifile.hpp>
38 #include <cmath>
44 template<class T>
45 static string s_FormatNum(T value)
46 {
47  CNcbiOstrstream oss;
48  oss.imbue(std::locale(""));
49  oss << std::fixed << value;
50  return CNcbiOstrstreamToString(oss);
51 }
53 #define SPEED(time, nentries) s_FormatNum((size_t)((nentries)/(time)))
54 #endif /* SEQDB_LMDB_TIMING */
// NOTE(review): the signature line of this function is not visible in this
// view; only the statements below can be described.
// Begins a read-only LMDB transaction on `env`, branches on `file_type`
// (eLMDB, eTaxId2Offsets; any other value raises CSeqDBException/eArgErr),
// then commits and resets the transaction.
57 {
58  auto txn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
59  if(file_type == eLMDB) {
60  try {
// NOTE(review): the guarded statement(s) are missing from this view; any
// exception they raise is deliberately swallowed by the catch below.
62  }
63  catch (...){ /* It's ok not to have acc in a db */}
66  }
67  else if (file_type == eTaxId2Offsets) {
// NOTE(review): the body of this branch is missing from this view.
69  }
70  else {
71  NCBI_THROW( CSeqDBException, eArgErr, "Invalid lmdb file type");
72  }
73  txn.commit();
74  txn.reset();
75  return;
76 }
/// Create an LMDB environment wrapper for one file.
///
/// @param fname      Path of the LMDB file; stored in m_Filename.
/// @param file_type  Kind of LMDB file; stored in m_FileType.
/// @param read_only  Selects the read-only or the writable set-up path below.
/// @param map_size   Map size for the writable path; 0 keeps the LMDB default.
/// @throws CSeqDBException (eFileErr) when the file is missing in read-only
///         mode, or when an lmdb::error is caught.
///
/// The reference count m_Count starts at 1.
78 CBlastLMDBManager::CBlastEnv::CBlastEnv(const string & fname, ELMDBFileType file_type, bool read_only, Uint8 map_size) :
79  m_Filename(fname), m_FileType(file_type),m_Env(lmdb::env::create()), m_Count(1), m_ReadOnly(read_only)
80 {
81  const MDB_dbi num_db(3);
82  m_Env.set_max_dbs(num_db);
// Every DBI slot starts as UINT_MAX, which the rest of the class treats as
// "not opened".
83  m_dbis.resize(eDbiMax, UINT_MAX);
84  if(m_ReadOnly) {
85  CFile tf(fname);
// Map size is the file length advanced to the next multiple of 10000
// (always strictly larger than the length). Computed before the existence
// check below.
86  Uint8 readMapSize = (tf.GetLength()/10000 + 1) *10000;
87  if (!tf.Exists()) {
88  NCBI_THROW( CSeqDBException, eFileErr, "File " + fname + " not found. If you renamed any BLAST database files, please use original file names, and makeblastdb to rename the database. If you deleted any BLAST database files, you need to recreate the database.");
89  }
90  m_Env.set_mapsize(readMapSize);
91  try {
// NOTE(review): the statement(s) guarded by this try are missing from this view.
93  }
94  catch (lmdb::error& e) {
95  NCBI_THROW(CSeqDBException, eFileErr, "LMDB runtime error: " + (string)e.what());
96  }
98  }
99  else {
100  LOG_POST(Info <<"Initial Map Size: " << map_size);
101  /// map_size 0 means use lmdb default
102  if(map_size != 0) {
103  m_Env.set_mapsize(map_size);
104  }
105  try {
// NOTE(review): the start of the next statement is truncated in this view;
// only its trailing arguments (MDB_NOSUBDIR, mode 0664) are visible.
106, MDB_NOSUBDIR , 0664);
107  }
108  catch (lmdb::error& e) {
109  NCBI_THROW(CSeqDBException, eFileErr, "LMDB runtime error: " + (string)e.what());
110  }
111  }
112 }
// NOTE(review): the signature line is not visible in this view.
// Closes every DBI handle that was opened (slots still holding UINT_MAX are
// skipped) and then closes the LMDB environment itself.
115 {
116  for (unsigned int i=0; i < m_dbis.size(); i++){
117  if (m_dbis[i] != UINT_MAX) {
118  mdb_dbi_close(m_Env,m_dbis[i]);
119  }
120  }
121  m_Env.close();
122 }
// NOTE(review): the signature line is not visible in this view.
// Returns the stored handle m_dbis[dbi_type]. If that slot still holds
// UINT_MAX, throws CSeqDBException (eArgErr) with a message naming the kind
// of data that is absent; an unrecognised dbi_type throws "Invalid dbi type".
125 {
126  if(m_dbis[dbi_type] == UINT_MAX) {
127  string err = "DB contains no ";
128  switch (dbi_type) {
// Both volume DBIs share one message.
129  case eDbiVolinof:
130  case eDbiVolname:
131  err += "vol info.";
132  break;
133  case eDbiAcc2oid:
134  err += "accession info.";
135  break;
136  case eDbiTaxid2offset:
137  err += "tax id info";
138  break;
139  default:
140  NCBI_THROW( CSeqDBException, eArgErr, "Invalid dbi type");
141  }
142  NCBI_THROW( CSeqDBException, eArgErr, err);
144  }
145  return m_dbis[dbi_type];
146 }
// NOTE(review): the signature line is not visible in this view.
// Applies map_size to the LMDB environment, but only when this environment
// was not created read-only; otherwise the call is a no-op.
149 {
150  if(!m_ReadOnly) {
151  m_Env.set_mapsize(map_size);
152  }
153 }
// NOTE(review): the signature line and opening brace are not visible in this view.
// Returns the process-wide CBlastLMDBManager held in a function-local
// CSafeStatic.
156  static CSafeStatic<CBlastLMDBManager> lmdb_manager;
157  return lmdb_manager.Get();
158 }
160 lmdb::env & CBlastLMDBManager::GetReadEnvVol(const string & fname, MDB_dbi & db_volname, MDB_dbi & db_volinfo)
161 {
162  CBlastEnv* p = GetBlastEnv(fname, eLMDB);
163  db_volinfo = p->GetDbi(CBlastEnv::eDbiVolinof);
164  db_volname = p->GetDbi(CBlastEnv::eDbiVolname);
165  return p->GetEnv();
166 }
167 lmdb::env & CBlastLMDBManager::GetReadEnvAcc(const string & fname, MDB_dbi & db_acc, bool* opened)
168 {
169  CBlastEnv* p = GetBlastEnv(fname, eLMDB, opened);
170  db_acc = p->GetDbi(CBlastEnv::eDbiAcc2oid);
171  return p->GetEnv();
172 }
173 lmdb::env & CBlastLMDBManager::GetReadEnvTax(const string & fname, MDB_dbi & db_tax, bool* opened)
174 {
175  CBlastEnv* p = GetBlastEnv(fname, eTaxId2Offsets, opened);
176  db_tax = p->GetDbi(CBlastEnv::eDbiTaxid2offset);
177  return p->GetEnv();
178 }
// NOTE(review): the first lines of this signature are not visible in this
// view; only the trailing `opened` parameter is.
// Under m_Mutex, returns the CBlastEnv already registered for `fname`, or
// creates one and appends it to m_EnvList.
// Reference counting as written:
//  - an existing entry always gets one AddReference();
//  - in either path, when `opened` is non-null and *opened is false, one
//    further AddReference() is made and *opened is set to true.
183  bool* opened)
184 {
185  CFastMutexGuard guard(m_Mutex);
186  NON_CONST_ITERATE(list <CBlastEnv* >, itr, m_EnvList) {
187  if((*itr)->GetFilename() == fname) {
188  (*itr)->AddReference();
189  if ( opened && !*opened ) {
190  (*itr)->AddReference();
191  *opened = true;
192  }
193  return (*itr);
194  }
195  }
// No entry for this file yet: build one (the CBlastEnv constructor starts
// its count at 1) and register it.
196  CBlastEnv * p (new CBlastEnv(fname, file_type));
197  m_EnvList.push_back(p);
198  if ( opened && !*opened ) {
199  p->AddReference();
200  *opened = true;
201  }
202  return p;
203 }
205 lmdb::env & CBlastLMDBManager::GetWriteEnv(const string & fname, Uint8 map_size)
206 {
207  CFastMutexGuard guard(m_Mutex);
208  NON_CONST_ITERATE(list <CBlastEnv* >, itr, m_EnvList) {
209  if((*itr)->GetFilename() == fname) {
210  (*itr)->AddReference();
211  return (*itr)->GetEnv();
212  }
213  }
214  CBlastEnv * p (new CBlastEnv(fname, eLMDBFileTypeEnd, false, map_size));
215  m_EnvList.push_back(p);
216  return p->GetEnv();
217 }
220 void CBlastLMDBManager::CloseEnv(const string & fname)
221 {
222  CFastMutexGuard guard(m_Mutex);
223  NON_CONST_ITERATE(list <CBlastEnv* >, itr, m_EnvList) {
224  if((*itr)->GetFilename() == fname) {
225  if((*itr)->RemoveReference() == 0) {
226  delete *itr;
227  itr = m_EnvList.erase(itr);
228  break;
229  }
230  }
231  }
232 }
// NOTE(review): the signature line is not visible in this view.
// Deletes every CBlastEnv still held in m_EnvList, regardless of its
// reference count, and empties the list. No mutex is taken here.
235 {
236  NON_CONST_ITERATE(list <CBlastEnv* >, itr, m_EnvList) {
237  delete *itr;
238  }
239  m_EnvList.clear();
240 }
/// Record the LMDB file name and derive the names of its four companion
/// files from it via GetFileNameFromExistingLMDBFile().
///
/// @param fname  Name of the main LMDB file.
///
/// Nothing is opened here: m_LMDBFileOpened starts false and m_NumOids
/// starts at 0.
242 CSeqDBLMDB::CSeqDBLMDB(const string & fname)
243  : m_LMDBFile(fname),
244  m_Oid2SeqIdsFile(GetFileNameFromExistingLMDBFile(fname, ELMDBFileType::eOid2SeqIds)),
245  m_Oid2TaxIdsFile(GetFileNameFromExistingLMDBFile(fname, ELMDBFileType::eOid2TaxIds)),
246  m_TaxId2OidsFile(GetFileNameFromExistingLMDBFile(fname, ELMDBFileType::eTaxId2Oids)),
247  m_TaxId2OffsetsFile(GetFileNameFromExistingLMDBFile(fname, ELMDBFileType::eTaxId2Offsets)),
248  m_LMDBFileOpened(false),
249  m_NumOids(0)
250 {
251 }
// NOTE(review): the signature line is not visible in this view.
// When the LMDB file was marked as opened, clears that flag.
254 {
255  if ( m_LMDBFileOpened ) {
// NOTE(review): one statement is missing from this view at this point.
257  m_LMDBFileOpened = false;
258  }
259 }
/// Look up the oid(s) stored under one accession key.
///
/// @param accession  Key to search for.
/// @param oids       [out] Cleared first; receives the oid of the matching
///                   record and, when @p allow_dup is true, of every
///                   duplicate record under the same key.
/// @param allow_dup  Whether duplicate records for the key are also returned.
/// @throws CSeqDBException (eArgErr) when an lmdb::error is caught.
///
/// @p oids stays empty when the key is not found.
261 void
262 CSeqDBLMDB::GetOid(const string & accession, vector<blastdb::TOid> & oids, const bool allow_dup) const
263 {
264  try {
265  oids.clear();
266  {
267  MDB_dbi dbi_handle;
// NOTE(review): the statement that defines `env` (and presumably fills
// dbi_handle) is missing from this view.
269  lmdb::dbi dbi(dbi_handle);
270  auto txn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
271  auto cursor = lmdb::cursor::open(txn, dbi);
273  string acc = accession;
274  lmdb::val data2find(acc);
276  if (cursor.get(data2find, MDB_SET)) {
277  lmdb::val k, val;
278  cursor.get(k, val, MDB_GET_CURRENT);
// NOTE(review): the initializer of `d` is truncated in this view.
279  const char* d =;
// Assemble a 32-bit value from four bytes, d[0] being the least significant.
280  oids.push_back(((d[3] << 24)&0xFF000000) | ((d[2] << 16) & 0xFF0000) | ((d[1] << 8) & 0xFF00) | (d[0]&0xFF));
282  if(allow_dup) {
283  while (cursor.get(k,val, MDB_NEXT_DUP)) {
// NOTE(review): the right-hand side of this assignment is truncated in this view.
284  d =;
285  oids.push_back(((d[3] << 24)&0xFF000000) | ((d[2] << 16) & 0xFF0000) | ((d[1] << 8) & 0xFF00) | (d[0]&0xFF));
286  }
287  }
288  }
289  cursor.close();
290  txn.reset();
291  }
293  } catch (lmdb::error & e) {
// NOTE(review): `dbname` is never assigned in the visible lines; the
// statement that fills it appears to be missing.
294  string dbname;
296  if(e.code() == MDB_NOTFOUND) {
297  NCBI_THROW( CSeqDBException, eArgErr, "Seqid list specified but no accession table is found in " + dbname);
298  }
299  else {
300  NCBI_THROW( CSeqDBException, eArgErr, "Accessions to Oids lookup error in " + dbname);
301  }
302  }
303 }
/// Read the volume names and per-volume oid counts from the LMDB file.
///
/// @param vol_names     [out] Cleared, then sized to the number of volume
///                      entries and filled by index.
/// @param vol_num_oids  [out] Cleared, then sized likewise and filled with
///                      the oid count of each volume.
/// @throws CSeqDBException (eArgErr) when the two DBIs hold a different
///         number of entries, or a volume name has no matching volinfo record.
///
/// Also adds the sum of all volume counts to m_NumOids.
306 void CSeqDBLMDB::GetVolumesInfo(vector<string> & vol_names, vector<blastdb::TOid> & vol_num_oids)
307 {
308  MDB_dbi db_volname_handle;
309  MDB_dbi db_volinfo_handle;
310  lmdb::env & env = CBlastLMDBManager::GetInstance().GetReadEnvVol(m_LMDBFile, db_volname_handle, db_volinfo_handle);
311  vol_names.clear();
312  vol_num_oids.clear();
313  {
314  auto txn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
315  lmdb::dbi db_volname(db_volname_handle);
316  lmdb::dbi db_volinfo(db_volinfo_handle);
317  MDB_stat volinfo_stat, volname_stat;
318  lmdb::dbi_stat(txn, db_volinfo, &volinfo_stat);
319  lmdb::dbi_stat(txn, db_volname, &volname_stat);
320  if(volinfo_stat.ms_entries != volname_stat.ms_entries) {
321  NCBI_THROW( CSeqDBException, eArgErr, "Volinfo error ");
322  }
324  vol_names.resize(volinfo_stat.ms_entries);
325  vol_num_oids.resize(volinfo_stat.ms_entries);
327  auto cursor_volname = lmdb::cursor::open(txn, db_volname);
328  auto cursor_volinfo = lmdb::cursor::open(txn, db_volinfo);
// Records are keyed by the raw bytes of the running index i.
329  for (unsigned int i=0; i < volinfo_stat.ms_entries; i++) {
330  lmdb::val data2find(&i, sizeof(Int4));
331  if (cursor_volname.get(data2find, MDB_SET)) {
332  {
333  lmdb::val k, val;
334  cursor_volname.get(k, val, MDB_GET_CURRENT);
// NOTE(review): the first argument of assign() is truncated in this view.
335  vol_names[i].assign(, val.size());
336  }
337  if (cursor_volinfo.get(data2find, MDB_SET)) {
338  lmdb::val k, val;
339  cursor_volinfo.get(k, val, MDB_GET_CURRENT);
// NOTE(review): the initializer of `d` is truncated in this view.
340  const char* d =;
// Assemble a 32-bit value from four bytes, d[0] being the least significant.
341  vol_num_oids[i] = (((d[3] << 24)&0xFF000000) | ((d[2] << 16) & 0xFF0000) | ((d[1] << 8) & 0xFF00) | (d[0]&0xFF));
342  }
343  else {
344  NCBI_THROW( CSeqDBException, eArgErr, "No volinfo for " + vol_names[i]);
345  }
346  }
347  }
348  cursor_volname.close();
349  cursor_volinfo.close();
350  txn.reset();
351  }
// NOTE(review): m_NumOids is accumulated with += and not reset here, so a
// second call on the same object would add the totals again - confirm
// callers invoke this only once per object.
354  for(unsigned int j=0; j < vol_num_oids.size(); j++){
355  m_NumOids += vol_num_oids[j];
356  }
357 }
/// Look up one oid per accession.
///
/// @param accessions  Keys to search for.
/// @param oids        [out] Resized to accessions.size(); slot i receives
///                    the oid of the record found for accessions[i], or
///                    keeps kSeqDBEntryNotFound when the key is absent.
/// @throws CSeqDBException (eArgErr) when an lmdb::error is caught.
///
/// Only the record the cursor lands on is used; duplicates are not walked.
359 void
360 CSeqDBLMDB::GetOids(const vector<string>& accessions, vector<blastdb::TOid>& oids) const
361 {
362  try {
363  oids.clear();
364  oids.resize(accessions.size(), kSeqDBEntryNotFound);
366  MDB_dbi dbi_handle;
// NOTE(review): the statement that defines `env` (and presumably fills
// dbi_handle) is missing from this view.
368  {
369  lmdb::dbi dbi(dbi_handle);
370  auto txn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
372  auto cursor = lmdb::cursor::open(txn, dbi);
374  unsigned int i=0;
375  for (i=0; i < accessions.size(); i++) {
376  string acc = accessions[i];
377  lmdb::val data2find(acc);
378  if (cursor.get(data2find, MDB_SET)) {
379  lmdb::val k, val;
380  cursor.get(k, val, MDB_GET_CURRENT);
// NOTE(review): the initializer of `d` is truncated in this view.
381  const char* d =;
// Assemble a 32-bit value from four bytes, d[0] being the least significant.
382  oids[i] = (((d[3] << 24)&0xFF000000) | ((d[2] << 16) & 0xFF0000) | ((d[1] << 8) & 0xFF00) | (d[0]&0xFF));
383  }
384  }
386  cursor.close();
387  txn.reset();
388  }
390  } catch (lmdb::error & e) {
// NOTE(review): `dbname` is never assigned in the visible lines; the
// statement that fills it appears to be missing.
391  string dbname;
393  if(e.code() == MDB_NOTFOUND) {
394  NCBI_THROW( CSeqDBException, eArgErr, "Seqid list specified but no accession table is found in " + dbname);
395  }
396  else {
397  NCBI_THROW( CSeqDBException, eArgErr, "Accessions to Oids lookup error in " + dbname);
398  }
399  }
400 }
// NOTE(review): the line introducing this type and the declaration of its
// `oid` member are not visible in this view.
// Pairs an oid with the seq-id string that resolved to it. cmp_oid is an
// ordering predicate: primarily by oid, with a tie-break when oids are equal.
403 {
404  SOidSeqIdPair(blastdb::TOid o, const string & i) : oid(o), id(i) {}
406  string id;
407  static bool cmp_oid(const SOidSeqIdPair & v, const SOidSeqIdPair & k) {
408  if(v.oid == k.oid) {
// NOTE(review): the tie-break comparison is truncated in this view.
409  return ( <;
410  }
411  return (v.oid < k.oid );
412  }
413 };
// NOTE(review): the line introducing this class and the declaration of
// m_IndexStart are not visible in this view.
// Read-only view over a memory-mapped oid-to-seqid lookup file. As read by
// the constructor, the file begins with one Uint8 holding the oid count,
// followed by further Uint8 index words; the data section starts
// 8 * (count + 1) bytes into the file.
416 {
417 public:
418  CLookupSeqIds(CMemoryFile & file): m_IndexStart((Uint8*) file.GetPtr()), m_DataStart((char *) file.GetPtr()) {
419  if(m_IndexStart == NULL){
420  NCBI_THROW( CSeqDBException, eArgErr, "Failed to open oid-to-seqid lookup file");
421  }
// The first word is the oid count; step past it so m_IndexStart addresses
// the per-oid index words.
423  Uint8 num_of_oids = *m_IndexStart;
424  m_IndexStart ++;
425  m_DataStart += (8 * (num_of_oids + 1));
426  }
428  inline void GetSeqIdListForOid(blastdb::TOid oid, vector<string> & idlist);
429 private:
432  char * m_DataStart;
433 };
435 void CLookupSeqIds::GetSeqIdListForOid(blastdb::TOid oid, vector<string> & idlist)
436 {
437  Uint8 * index_ptr = m_IndexStart + oid;
438  Char * end = m_DataStart + (*index_ptr);
439  index_ptr--;
440  Char * begin = (oid == 0) ? m_DataStart:m_DataStart + (*index_ptr);
441  while (begin < end) {
442  unsigned char id_len = *begin;
443  begin ++;
444  if(id_len == 0xFF) {
445  Uint4 long_id_len = *((Uint4 *) begin);
446  begin +=4;
447  string id;
448  id.assign(begin, long_id_len);
449  begin += long_id_len;
450  idlist.push_back(id);
451  }
452  else {
453  string id;
454  id.assign(begin, id_len);
455  begin += id_len;
456  idlist.push_back(id);
457  }
458  }
459 }
/// Decide whether every id stored in the file for one oid is matched by the
/// ids supplied as input for that oid.
///
/// Both lists are walked in parallel. Besides exact string equality, the
/// visible code has two relaxed-match branches (a PDB input id without a
/// chain id, and an input id equal to the file id's string without version).
/// The walk stops at the first file id that cannot be matched.
///
/// @param file_idlist   Ids stored for the oid; cleared before returning.
/// @param input_idlist  Ids supplied by the caller for the same oid; cleared
///                      before returning.
/// @return true when the walk consumed every entry of @p file_idlist.
461 bool s_CompareIdList(vector<string> & file_idlist, vector<string> &input_idlist)
462 {
463  bool rv = false;
464  vector<string>::iterator f_itr = file_idlist.begin();
465  vector<string>::iterator i_itr = input_idlist.begin();
466  while(f_itr != file_idlist.end() && i_itr != input_idlist.end()) {
467  if(*i_itr == *f_itr) {
468  i_itr++;
469  f_itr++;
470  continue;
471  }
472  else {
// NOTE(review): the declaration of `seq_id` is missing from this view.
474  // Input id is PDB with just mol id
475  if(seq_id.IsPdb() && !seq_id.GetPdb().IsSetChain_id()) {
// NOTE(review): the declaration of `file_seqid` is missing from this view.
477  if (file_seqid.IsPdb() && file_seqid.GetPdb().GetMol().Get() == *i_itr) {
478  f_itr++;
479  string tmp_pdb = *i_itr;
// NOTE(review): find_first_of(tmp_pdb) == 0 holds whenever the FIRST
// CHARACTER of the id is any character contained in tmp_pdb; if a
// "starts with tmp_pdb" test is intended here and below, this is looser
// than that - confirm.
480  while ((f_itr != file_idlist.end()) && ((*f_itr).find_first_of(tmp_pdb) == 0)){
481  f_itr++;
482  }
483  // Skip pdb id in input list but with chain id
484  while ((i_itr != input_idlist.end()) && ((*i_itr).find_first_of(tmp_pdb) == 0)){
485  i_itr++;
486  }
487  continue;
488  }
489  }
490  else {
// NOTE(review): the declaration of `file_seq_id` is missing from this view.
492  if( file_seq_id.GetSeqIdString(false) == *i_itr) {
493  i_itr++;
494  // Skip identical id in input list but with version
495  if((i_itr != input_idlist.end()) && (file_seq_id.GetSeqIdString(true) == *i_itr)){
496  i_itr++;
497  }
498  f_itr++;
499  continue;
500  }
501  }
// No rule matched the current pair: stop with file ids left over.
502  break;
503  }
504  }
505  if(f_itr == file_idlist.end()){
506  rv=true;
507  }
509  file_idlist.clear();
510  input_idlist.clear();
511  return rv;
512 }
514 void
515 CSeqDBLMDB::NegativeSeqIdsToOids(const vector<string>& ids, vector<blastdb::TOid>& rv) const
516 {
517  rv.clear();
518  vector<blastdb::TOid> oids;
519  GetOids(ids, oids);
520  vector<SOidSeqIdPair> pairs;
521  for (unsigned int i=0; i < ids.size(); i++) {
522  if(oids[i] == kSeqDBEntryNotFound) {
523  continue;
524  }
525  else {
526  SOidSeqIdPair p(oids[i], ids[i]);
527  pairs.push_back(p);
528  }
529  }
531  if(pairs.size() == 0) {
532  return;
533  }
535  sort (pairs.begin(), pairs.end(), SOidSeqIdPair::cmp_oid);
537  CMemoryFile oid_file(m_Oid2SeqIdsFile);
538  CLookupSeqIds lookup(oid_file);
539  blastdb::TOid current_oid = 0;
540  unsigned int i=0;
541  while (i < pairs.size()) {
542  vector<string> file_idlist;
543  vector<string> input_idlist;
544  current_oid = pairs[i].oid;
545  lookup.GetSeqIdListForOid(current_oid, file_idlist);
546  while ((i < pairs.size()) && (current_oid == pairs[i].oid)) {
547  input_idlist.push_back(pairs[i].id);
548  i++;
549  }
550  if(s_CompareIdList(file_idlist, input_idlist)) {
551  rv.push_back(current_oid);
552  }
554  }
556 }
/// Collect tax ids by walking the keys of a DBI with a cursor.
///
/// @param tax_ids  [out] Cleared first; one entry is appended per record the
///                 cursor visits with MDB_NEXT.
/// @throws CSeqDBException (eArgErr) when an lmdb::error is caught.
558 void CSeqDBLMDB::GetDBTaxIds(vector<TTaxId> & tax_ids) const
559 {
561  tax_ids.clear();
562  try {
563  MDB_dbi dbi_handle;
// NOTE(review): the statement that defines `env` (and presumably fills
// dbi_handle) is missing from this view.
565  {
566  auto txn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
// Here `dbi` is deduced as a plain MDB_dbi copy of the handle.
567  auto dbi(dbi_handle);
568  auto cursor = lmdb::cursor::open(txn, dbi);
569  lmdb::val key;
570  while (cursor.get(key, MDB_NEXT)) {
// NOTE(review): this expression is truncated in this view; it reinterprets
// something as an Int4 and converts it with TAX_ID_FROM.
571  TTaxId taxid = TAX_ID_FROM(Int4, *((Int4 *);
572  tax_ids.push_back(taxid);
573  }
574  cursor.close();
575  txn.reset();
576  }
577  }
578  catch (lmdb::error & e) {
// NOTE(review): `dbname` is never assigned in the visible lines; the
// statement that fills it appears to be missing.
579  string dbname;
581  if(e.code() == MDB_NOTFOUND) {
582  NCBI_THROW( CSeqDBException, eArgErr, "No taxonomy info found in " + dbname);
583  }
584  else {
585  NCBI_THROW( CSeqDBException, eArgErr, "Taxonomy Id to Oids lookup error in " + dbname);
586  }
587  }
589 }
/// Collect the distinct oids associated with a set of tax ids.
///
/// Step 1: for each tax id, read every 64-bit offset stored under that key
/// in the LMDB table (the first record plus its duplicates).
/// Step 2: at each offset inside the m_TaxId2OidsFile memory map, read a
/// Uint4 count followed by that many Uint4 oids, keeping each oid once.
/// Step 3: return the oids in ascending order.
///
/// @param tax_ids        Tax ids to look up.
/// @param oids           [out] Cleared first; receives the distinct oids,
///                       sorted ascending.
/// @param tax_ids_found  [out] Cleared first; receives each tax id for which
///                       a record existed.
/// @throws CSeqDBException (eArgErr) when an lmdb::error is caught.
591 void CSeqDBLMDB::GetOidsForTaxIds(const set<TTaxId> & tax_ids, vector<blastdb::TOid>& oids, vector<TTaxId> & tax_ids_found) const
592 {
593  try {
594  oids.clear();
595  tax_ids_found.clear();
596  vector<Uint8> offsets;
597  MDB_dbi dbi_handle;
// NOTE(review): the statement that defines `env` (and presumably fills
// dbi_handle) is missing from this view.
599  {
600  auto txn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
601  lmdb::dbi dbi(dbi_handle);
602  auto cursor = lmdb::cursor::open(txn, dbi);
603  ITERATE(set<TTaxId>, itr, tax_ids) {
604  Int4 tax_id = TAX_ID_TO(Int4, *itr);
605  lmdb::val data2find(tax_id);
607  if (cursor.get(data2find, MDB_SET)) {
608  lmdb::val k, val;
609  cursor.get(k, val, MDB_GET_CURRENT);
// NOTE(review): the initializer of `d` is truncated in this view.
610  const char* d =;
// Assemble a 64-bit offset from eight bytes, d[0] being the least significant.
611  offsets.push_back((((Uint8) d[7] << 56) &0xFF00000000000000) | (((Uint8) d[6] << 48) & 0xFF000000000000) |
612  (((Uint8) d[5] << 40) &0xFF0000000000) | (((Uint8) d[4] << 32) & 0xFF00000000) |
613  (((Uint8) d[3] << 24) &0xFF000000) | (((Uint8) d[2] << 16) & 0xFF0000) |
614  (((Uint8) d[1] << 8) &0xFF00) | ((Uint8) d[0]&0xFF));
615  while (cursor.get(k,val, MDB_NEXT_DUP)) {
// NOTE(review): the right-hand side of this assignment is truncated in this view.
616  d =;
617  offsets.push_back((((Uint8) d[7] << 56) &0xFF00000000000000) | (((Uint8) d[6] << 48) & 0xFF000000000000) |
618  (((Uint8) d[5] << 40) &0xFF0000000000) | (((Uint8) d[4] << 32) & 0xFF00000000) |
619  (((Uint8) d[3] << 24) &0xFF000000) | (((Uint8) d[2] << 16) & 0xFF0000) |
620  (((Uint8) d[1] << 8) &0xFF00) | ((Uint8) d[0]&0xFF));
621  }
622  tax_ids_found.push_back(*itr);
623  }
624  }
625  cursor.close();
626  txn.reset();
627  }
// One flag per oid, sized from m_NumOids, used to drop repeated oids.
// NOTE(review): oids read from the file index this vector unchecked, so
// m_NumOids must already cover every oid in the file - confirm it is
// populated before this method is called.
629  vector<bool> oids_set(m_NumOids, false);
631  CMemoryFile oid_file(m_TaxId2OidsFile);
632  const char * start_ptr = (char *) oid_file.GetPtr();
633  for (unsigned int i=0; i < offsets.size(); i++) {
634  Uint4 * list_ptr = (Uint4 * ) (start_ptr + offsets[i]);
635  Uint4 num_of_oids = *list_ptr;
636  Uint4 count = 0 ;
637  list_ptr ++;
638  while(count < num_of_oids) {
639  if(!oids_set[*list_ptr]) {
640  oids.push_back(*list_ptr);
641  oids_set[*list_ptr] = true;
642  }
643  count++;
644  list_ptr++;
645  }
646  }
// Choose the cheaper way to get a sorted result: sort the collected oids
// when n*log(n) is below the total oid count, otherwise rebuild the list by
// scanning the flag vector in index order.
// NOTE(review): with no oids collected this evaluates log(0); confirm the
// resulting comparison takes the intended branch.
648  int oids_sz = oids.size();
649  if((oids_sz*log(oids_sz)) < m_NumOids) {
650  sort(oids.begin(), oids.end());
651  }
652  else {
653  oids.resize(0);
654  oids.reserve(oids_sz);
655  for (int i=0; i < oids_set.size(); i++) {
656  if(oids_set[i]) {
657  oids.push_back(i);
658  }
659  }
660  }
662  } catch (lmdb::error & e) {
// NOTE(review): `dbname` is never assigned in the visible lines; the
// statement that fills it appears to be missing.
663  string dbname;
665  if(e.code() == MDB_NOTFOUND) {
666  NCBI_THROW( CSeqDBException, eArgErr, "No taxonomy info found in " + dbname);
667  }
668  else {
669  NCBI_THROW( CSeqDBException, eArgErr, "Taxonomy Id to Oids lookup error in " + dbname);
670  }
671  }
672 }
// NOTE(review): the line introducing this class, the constructor's own
// signature line and the member declarations are not visible in this view.
// Lookup helper over an oid-to-taxids file: the first Uint8 index word is
// read as the oid count, after which m_IndexStart is stepped past it and
// m_DataStart is advanced by 2 * (count + 1) elements of its own type.
676 {
677 public:
// NOTE(review): the constructor signature and initializer list belong here
// but are missing from this view.
679  if(m_IndexStart == NULL){
680  NCBI_THROW( CSeqDBException, eArgErr, "Failed to open oid-to-taxids lookup file");
681  }
683  Uint8 num_of_oids = *m_IndexStart;
684  m_IndexStart ++;
// presumably m_DataStart is an Int4*, making this the same byte distance as
// (count + 1) Uint8 index words - TODO confirm against the declaration.
685  m_DataStart += (2* (num_of_oids + 1));
686  }
688  inline void GetTaxIdListForOid(blastdb::TOid oid, vector<TTaxId> & taxid_list);
689 private:
693 };
695 void CLookupTaxIds::GetTaxIdListForOid(blastdb::TOid oid, vector<TTaxId> & taxid_list)
696 {
697  taxid_list.clear();
698  Uint8 * index_ptr = m_IndexStart + oid;
699  Int4 * end = m_DataStart + (*index_ptr);
700  index_ptr--;
701  Int4 * begin = (oid == 0) ? m_DataStart:m_DataStart + (*index_ptr);
702  while (begin < end) {
703  taxid_list.push_back(TAX_ID_FROM(Int4, *begin));
704  begin++;
705  }
706 }
708 void
709 CSeqDBLMDB::NegativeTaxIdsToOids(const set<TTaxId>& tax_ids, vector<blastdb::TOid>& rv, vector<TTaxId> & tax_ids_found) const
710 {
711  rv.clear();
712  vector<blastdb::TOid> oids;
713  GetOidsForTaxIds(tax_ids, oids, tax_ids_found);
714  CMemoryFile oid_file(m_Oid2TaxIdsFile);
715  set<TTaxId> tax_id_list(tax_ids_found.begin(), tax_ids_found.end());
716  CLookupTaxIds lookup(oid_file);
717  for(unsigned int i=0; i < oids.size(); i++) {
718  vector<TTaxId> file_list;
719  lookup.GetTaxIdListForOid(oids[i], file_list);
720  if(file_list.size() > tax_ids.size()) {
721  continue;
722  }
723  else {
724  unsigned int j = 0;
725  for(; j < file_list.size(); j++) {
726  if(tax_id_list.find(file_list[j]) == tax_id_list.end()) {
727  break;
728  }
729  }
730  if(j == file_list.size()) {
731  rv.push_back(oids[i]);
732  }
733  }
734  }
735 }
737 void CSeqDBLMDB::GetTaxIdsForOids(const vector<blastdb::TOid> & oids, set<TTaxId> & tax_ids) const
738 {
739  CMemoryFile oid_file(m_Oid2TaxIdsFile);
740  CLookupTaxIds lookup(oid_file);
741  for(unsigned int i=0; i < oids.size(); i++) {
742  vector<TTaxId> taxid_list;
743  lookup.GetTaxIdListForOid(oids[i], taxid_list);
744  tax_ids.insert(taxid_list.begin(), taxid_list.end());
745  }
746 }
749 string BuildLMDBFileName(const string& basename, bool is_protein, bool use_index, unsigned int index)
750 {
751  if (basename.empty()) {
752  throw invalid_argument("Basename is empty");
753  }
755  string vol_str=kEmptyStr;
756  if(use_index) {
757  vol_str = (index > 9) ?".": ".0";
758  vol_str += NStr::UIntToString(index);
759  }
760  return basename + vol_str + (is_protein ? ".pdb" : ".ndb");
761 }
763 string GetFileNameFromExistingLMDBFile(const string& lmdb_filename, ELMDBFileType file_type)
764 {
766  string filename (lmdb_filename, 0, lmdb_filename.size() - 2);
767  switch (file_type) {
768  case eLMDB :
769  filename += "db";
770  break;
771  case eOid2SeqIds :
772  filename +="os";
773  break;
774  case eOid2TaxIds :
775  filename +="ot";
776  break;
777  case eTaxId2Offsets :
778  filename += "tf";
779  break;
780  case eTaxId2Oids :
781  filename += "to";
782  break;
783  default :
784  NCBI_THROW( CSeqDBException, eArgErr, "Invalid LMDB file type");
785  break;
786  }
787  return filename;
788 }
790 void DeleteLMDBFiles(bool db_is_protein, const string & filename)
791 {
792  vector<string> extn;
793  SeqDB_GetLMDBFileExtensions(db_is_protein, extn);
794  ITERATE(vector<string>, itr, extn) {
795  CFile f(filename + "." + (*itr));
796  if (f.Exists()) {
797  f.Remove();
798  }
799  }
800 }
