42 #define SEQDB_LMDB_TIMING
43 #ifdef SEQDB_LMDB_TIMING
49 oss << std::fixed <<
value;
53 #define SPEED(time, nentries) s_FormatNum((size_t)((nentries)/(time)))
79 m_Filename(fname), m_FileType(
file_type),m_Env(
lmdb::
env::create()), m_Count(1), m_ReadOnly(read_only)
88 NCBI_THROW(
CSeqDBException, eFileErr,
"File " + fname +
" not found. If you renamed any BLAST database files, please use original file names, and makeblastdb to rename the database. If you deleted any BLAST database files, you need to recreate the database.");
116 for (
unsigned int i=0;
i < m_dbis.size();
i++){
117 if (m_dbis[
i] != UINT_MAX) {
126 if(m_dbis[dbi_type] == UINT_MAX) {
127 string err =
"DB contains no ";
134 err +=
"accession info.";
136 case eDbiTaxid2offset:
137 err +=
"tax id info";
145 return m_dbis[dbi_type];
151 m_Env.set_mapsize(map_size);
157 return lmdb_manager.
Get();
187 if((*itr)->GetFilename() == fname) {
188 (*itr)->AddReference();
189 if ( opened && !*opened ) {
190 (*itr)->AddReference();
198 if ( opened && !*opened ) {
209 if((*itr)->GetFilename() == fname) {
210 (*itr)->AddReference();
211 return (*itr)->GetEnv();
224 if((*itr)->GetFilename() == fname) {
225 if((*itr)->RemoveReference() == 0) {
248 m_LMDBFileOpened(
false),
262 CSeqDBLMDB::GetOid(
const string & accession, vector<blastdb::TOid> & oids,
const bool allow_dup)
const
273 string acc = accession;
276 if (cursor.get(data2find,
MDB_SET)) {
279 const char* d =
val.data();
280 oids.push_back(((d[3] << 24)&0xFF000000) | ((d[2] << 16) & 0xFF0000) | ((d[1] << 8) & 0xFF00) | (d[0]&0xFF));
285 oids.push_back(((d[3] << 24)&0xFF000000) | ((d[2] << 16) & 0xFF0000) | ((d[1] << 8) & 0xFF00) | (d[0]&0xFF));
312 vol_num_oids.clear();
317 MDB_stat volinfo_stat, volname_stat;
331 if (cursor_volname.get(data2find,
MDB_SET)) {
335 vol_names[
i].assign(
val.data(),
val.size());
337 if (cursor_volinfo.get(data2find,
MDB_SET)) {
340 const char* d =
val.data();
341 vol_num_oids[
i] = (((d[3] << 24)&0xFF000000) | ((d[2] << 16) & 0xFF0000) | ((d[1] << 8) & 0xFF00) | (d[0]&0xFF));
348 cursor_volname.close();
349 cursor_volinfo.close();
354 for(
unsigned int j=0; j < vol_num_oids.size(); j++){
375 for (
i=0;
i < accessions.size();
i++) {
376 string acc = accessions[
i];
378 if (cursor.get(data2find,
MDB_SET)) {
381 const char* d =
val.data();
382 oids[
i] = (((d[3] << 24)&0xFF000000) | ((d[2] << 16) & 0xFF0000) | ((d[1] << 8) & 0xFF00) | (d[0]&0xFF));
409 return (v.
id < k.
id);
441 while (begin < end) {
442 unsigned char id_len = *begin;
448 id.assign(begin, long_id_len);
449 begin += long_id_len;
450 idlist.push_back(
id);
454 id.assign(begin, id_len);
456 idlist.push_back(
id);
464 vector<string>::iterator f_itr = file_idlist.begin();
465 vector<string>::iterator i_itr = input_idlist.begin();
466 while(f_itr != file_idlist.end() && i_itr != input_idlist.end()) {
467 if(*i_itr == *f_itr) {
479 string tmp_pdb = *i_itr;
480 while ((f_itr != file_idlist.end()) && ((*f_itr).find_first_of(tmp_pdb) == 0)){
484 while ((i_itr != input_idlist.end()) && ((*i_itr).find_first_of(tmp_pdb) == 0)){
495 if((i_itr != input_idlist.end()) && (file_seq_id.
GetSeqIdString(
true) == *i_itr)){
505 if(f_itr == file_idlist.end()){
510 input_idlist.clear();
518 vector<blastdb::TOid> oids;
520 vector<SOidSeqIdPair> pairs;
521 for (
unsigned int i=0;
i < ids.size();
i++) {
531 if(pairs.size() == 0) {
541 while (
i < pairs.size()) {
542 vector<string> file_idlist;
543 vector<string> input_idlist;
544 current_oid = pairs[
i].oid;
545 lookup.GetSeqIdListForOid(current_oid, file_idlist);
546 while ((
i < pairs.size()) && (current_oid == pairs[
i].oid)) {
547 input_idlist.push_back(pairs[
i].
id);
551 rv.push_back(current_oid);
567 auto dbi(dbi_handle);
572 tax_ids.push_back(taxid);
595 tax_ids_found.clear();
607 if (cursor.get(data2find,
MDB_SET)) {
610 const char* d =
val.data();
611 offsets.push_back((((
Uint8) d[7] << 56) &0xFF00000000000000) | (((
Uint8) d[6] << 48) & 0xFF000000000000) |
612 (((
Uint8) d[5] << 40) &0xFF0000000000) | (((
Uint8) d[4] << 32) & 0xFF00000000) |
613 (((
Uint8) d[3] << 24) &0xFF000000) | (((
Uint8) d[2] << 16) & 0xFF0000) |
614 (((
Uint8) d[1] << 8) &0xFF00) | ((
Uint8) d[0]&0xFF));
617 offsets.push_back((((
Uint8) d[7] << 56) &0xFF00000000000000) | (((
Uint8) d[6] << 48) & 0xFF000000000000) |
618 (((
Uint8) d[5] << 40) &0xFF0000000000) | (((
Uint8) d[4] << 32) & 0xFF00000000) |
619 (((
Uint8) d[3] << 24) &0xFF000000) | (((
Uint8) d[2] << 16) & 0xFF0000) |
620 (((
Uint8) d[1] << 8) &0xFF00) | ((
Uint8) d[0]&0xFF));
622 tax_ids_found.push_back(*itr);
632 const char * start_ptr = (
char *) oid_file.
GetPtr();
633 for (
unsigned int i=0;
i <
offsets.size();
i++) {
635 Uint4 num_of_oids = *list_ptr;
638 while(
count < num_of_oids) {
639 if(!oids_set[*list_ptr]) {
640 oids.push_back(*list_ptr);
641 oids_set[*list_ptr] =
true;
648 int oids_sz = oids.size();
650 sort(oids.begin(), oids.end());
654 oids.reserve(oids_sz);
655 for (
int i=0;
i < oids_set.size();
i++) {
702 while (begin < end) {
712 vector<blastdb::TOid> oids;
715 set<TTaxId> tax_id_list(tax_ids_found.begin(), tax_ids_found.end());
717 for(
unsigned int i=0;
i < oids.size();
i++) {
718 vector<TTaxId> file_list;
719 lookup.GetTaxIdListForOid(oids[
i], file_list);
720 if(file_list.size() > tax_ids.
size()) {
725 for(; j < file_list.size(); j++) {
726 if(tax_id_list.
find(file_list[j]) == tax_id_list.
end()) {
730 if(j == file_list.size()) {
731 rv.push_back(oids[
i]);
741 for(
unsigned int i=0;
i < oids.size();
i++) {
742 vector<TTaxId> taxid_list;
743 lookup.GetTaxIdListForOid(oids[
i], taxid_list);
744 tax_ids.
insert(taxid_list.begin(), taxid_list.end());
752 throw invalid_argument(
"Basename is empty");
757 vol_str = (index > 9) ?
".":
".0";
760 return basename + vol_str + (is_protein ?
".pdb" :
".ndb");
766 string filename (lmdb_filename, 0, lmdb_filename.size() - 2);
794 ITERATE(vector<string>, itr, extn) {
795 CFile f(filename +
"." + (*itr));
unsigned int AddReference()
MDB_dbi GetDbi(EDbiType dbi_type)
void SetMapSize(Uint8 map_size)
void InitDbi(lmdb::env &env, ELMDBFileType file_type)
CBlastEnv(const string &fname, ELMDBFileType file_type, bool read_only=true, Uint8 map_size=0)
Class for managing LMDB env; each env should only be opened once.
static CBlastLMDBManager & GetInstance()
lmdb::env & GetWriteEnv(const string &fname, Uint8 map_size)
lmdb::env & GetReadEnvAcc(const string &fname, MDB_dbi &db_acc, bool *opened=0)
list< CBlastEnv * > m_EnvList
lmdb::env & GetReadEnvTax(const string &fname, MDB_dbi &db_tax, bool *opened=0)
void CloseEnv(const string &fname)
CBlastEnv * GetBlastEnv(const string &fname, ELMDBFileType file_type, bool *opened=0)
lmdb::env & GetReadEnvVol(const string &fname, MDB_dbi &db_volname, MDB_dbi &db_volinfo)
void GetSeqIdListForOid(blastdb::TOid oid, vector< string > &idlist)
CLookupSeqIds(CMemoryFile &file)
void GetTaxIdListForOid(blastdb::TOid oid, vector< TTaxId > &taxid_list)
CLookupTaxIds(CMemoryFile &file)
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
T & Get(void)
Create the variable if not created yet, return the reference.
void GetOids(const vector< string > &accessions, vector< blastdb::TOid > &oids) const
Get OIDs for a vector of string accessions.
void GetOid(const string &accession, vector< blastdb::TOid > &oids, const bool allow_dup=false) const
Get OIDs for single string accession.
string m_TaxId2OffsetsFile
void GetTaxIdsForOids(const vector< blastdb::TOid > &oids, set< TTaxId > &tax_ids) const
Get Tax Ids for oid list.
CSeqDBLMDB(const string &fname)
void GetOidsForTaxIds(const set< TTaxId > &tax_ids, vector< blastdb::TOid > &oids, vector< TTaxId > &tax_ids_found) const
Get Oids for Tax Ids list; identical Oids are merged.
void NegativeSeqIdsToOids(const vector< string > &ids, vector< blastdb::TOid > &rv) const
Get Oids excluded from a vector of input accessions. An oid only gets excluded if all its seqids are fo...
void NegativeTaxIdsToOids(const set< TTaxId > &ids, vector< blastdb::TOid > &rv, vector< TTaxId > &tax_ids_found) const
Get Oids to exclude for Tax ids. @param ids Input tax ids to exclude / Output tax ids found.
void GetVolumesInfo(vector< string > &vol_names, vector< blastdb::TOid > &vol_num_oids)
Return info for all volumes.
void GetDBTaxIds(vector< TTaxId > &tax_ids) const
Get all unique Tax Ids for db. @param tax_ids Return all unique tax ids found in db.
CSeqDB_Substring FindBaseName() const
Returns the portion of this path containing the base name.
void GetString(string &s) const
Return the data by assigning it to a string.
static cursor open(MDB_txn *const txn, const MDB_dbi dbi)
Creates an LMDB cursor.
Resource class for `MDB_dbi` handles.
static dbi open(MDB_txn *const txn, const char *const name=nullptr, const unsigned int flags=default_flags)
Opens a database handle.
MDB_dbi handle() const noexcept
Returns the underlying `MDB_dbi` handle.
Resource class for `MDB_env*` handles.
env & open(const char *const path, const unsigned int flags=default_flags, const mode mode=default_mode)
Opens this environment.
env & set_max_dbs(const MDB_dbi count)
env & set_mapsize(const std::size_t size)
Base class for LMDB exception conditions.
virtual const char * what() const noexcept
Returns the underlying LMDB error code.
int code() const noexcept
Returns the underlying LMDB error code.
static txn begin(MDB_env *const env, MDB_txn *const parent=nullptr, const unsigned int flags=default_flags)
Creates a new LMDB transaction.
Wrapper class for `MDB_val` structures.
iterator_bool insert(const value_type &val)
const_iterator find(const key_type &key) const
const_iterator end() const
static int lookup(const char *name, const struct lookup_int *table)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define TAX_ID_TO(T, tax_id)
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
SStrictId_Tax::TId TTaxId
Taxon id type.
#define TAX_ID_FROM(T, value)
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
void Info(CExceptionArgs_Base &args)
Int8 GetLength(void) const
Get size of file.
void * GetPtr(void) const
Get pointer to beginning of data.
virtual bool Exists(void) const
Check existence of file.
const TPrim & Get(void) const
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
@ fParse_RawText
Try to ID raw non-numeric accessions.
@ fParse_PartialOK
Warn rather than throwing an exception when a FASTA-style ID set contains unparsable portions,...
@ fParse_AnyLocal
Treat otherwise unidentified strings as local accessions as long as they don't resemble FASTA-style I...
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
const TPdb & GetPdb(void) const
Get the variant data.
bool IsSetChain_id(void) const
chain identifier; length-independent generalization of 'chain' Check if a value has been assigned to ...
bool IsPdb(void) const
Check if variant Pdb is selected.
const TMol & GetMol(void) const
Get the Mol member data.
char * dbname(DBPROCESS *dbproc)
Get name of current database.
#define MDB_NOTFOUND
key/data pair not found (EOF)
void mdb_dbi_close(MDB_env *env, MDB_dbi dbi)
Close a database handle.
@ MDB_SET
Position at specified key.
@ MDB_NEXT_DUP
Position at next data item of current key.
@ MDB_NEXT
Position at next data item.
@ MDB_GET_CURRENT
Return key/data at current cursor position.
#define MDB_INTEGERKEY
numeric keys in native byte order: either unsigned int or size_t.
#define MDB_DUPFIXED
with MDB_DUPSORT, sorted dup items have fixed size
#define MDB_DUPSORT
use sorted duplicates
#define MDB_NOLOCK
don't do any locking, caller must manage their own locks
#define MDB_NOSUBDIR
no environment directory
#define MDB_RDONLY
read only
size_t ms_entries
Number of data items.
unsigned int MDB_dbi
A handle for an individual database in the DB environment.
constexpr auto sort(_Init &&init)
<lmdb++.h> - C++11 wrapper for LMDB.
static void dbi_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *stat)
const struct ncbi::grid::netcache::search::fields::KEY key
const GenericPointer< typename T::ValueType > T2 value
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
static const char * locale
static PCRE2_SIZE * offsets
void DeleteLMDBFiles(bool db_is_protein, const string &filename)
string BuildLMDBFileName(const string &basename, bool is_protein, bool use_index, unsigned int index)
Build the canonical LMDB file name for BLAST databases.
bool s_CompareIdList(vector< string > &file_idlist, vector< string > &input_idlist)
static string s_FormatNum(T value)
string GetFileNameFromExistingLMDBFile(const string &lmdb_filename, ELMDBFileType file_type)
Defines interface to interact with LMDB files.
const string taxid2offset_str
void SeqDB_GetLMDBFileExtensions(bool db_is_protein, vector< string > &extn)
Retrieves file extensions for BLAST LMDB files.
const blastdb::TOid kSeqDBEntryNotFound
Int4 TOid
Ordinal ID in BLAST databases.
This file defines several SeqDB utility functions related to byte order and file system portability.
Statistics for a database in the environment.
SOidSeqIdPair(blastdb::TOid o, const string &i)
static bool cmp_oid(const SOidSeqIdPair &v, const SOidSeqIdPair &k)