53 : m_AtlasHolder (
NULL, use_atlas_lock),
54 m_Atlas (m_AtlasHolder.
Get()),
55 m_DBNames (db_name_list),
56 m_Aliases (m_Atlas, db_name_list, prot_nucl),
58 m_Aliases.GetVolumeNames(),
63 m_RestrictBegin (oid_begin),
64 m_RestrictEnd (oid_end),
70 m_ExactTotalLength(0),
71 m_TotalLengthStats(0),
75 m_SeqType (prot_nucl),
76 m_OidListSetup (
false),
77 m_UserGiList (gi_list),
78 m_NegativeList (neg_list),
80 m_NeedTotalsScan (
false),
81 m_UseGiMask (m_Aliases.HasGiMask()),
82 m_MaskDataColumn (kUnknownTitle),
88 vector <string> mask_list;
159 : m_AtlasHolder (
NULL, use_atlas_lock),
160 m_Atlas (m_AtlasHolder.
Get()),
161 m_Aliases (m_Atlas,
"",
'-'),
168 m_ExactTotalLength(0),
171 m_OidListSetup (
true),
172 m_NeedTotalsScan (
false),
174 m_MaskDataColumn (kUnknownTitle),
192 if ((oid_begin == 0) && (oid_end == 0)) {
250 const vector< CRef<CSeqDB_FilterTree> >& nodes = ft->
GetNodes();
251 if (nodes.size() == 1) {
304 vector<int> & oid_list,
337 begin_chunk = * state_obj;
343 end_chunk = begin_chunk +
static_cast<int>(
buffer->results.size());
345 end_chunk = begin_chunk + oid_size;
351 *state_obj = end_chunk;
362 int next_oid = begin_chunk;
365 while(next_oid < end_chunk) {
368 next_oid < end_chunk) {
369 oid_list.push_back(next_oid++);
371 next_oid = end_chunk;
377 oid_list.resize(oid_size);
378 while (iter < oid_size) {
383 oid_list[iter++] = next_oid++;
389 if (iter < oid_size) {
390 oid_list.resize(iter);
392 *state_obj = next_oid;
418 return vol->GetSeqLengthProt(vol_oid);
422 return vol->GetSeqLengthExact(vol_oid);
437 return vol->GetSeqLengthProt(vol_oid);
441 return vol->GetSeqLengthApprox(vol_oid);
463 if ((! defline_set.
Empty()) && defline_set->
CanGet()) {
465 if (! (*defline)->CanGetSeqid()) {
469 if (! (*defline)->IsSetTaxid()) {
474 if (! (**seqid).IsGi()) {
478 gi_to_taxid[(**seqid).GetGi()] = (*defline)->GetTaxid();
485 vector<TTaxId> & taxids,
498 if ((! defline_set.
Empty()) && defline_set->
CanGet()) {
500 if ((*defline)->IsSetTaxid()) {
501 taxids.push_back((*defline)->GetTaxid());
517 if ((! defline_set.
Empty()) && defline_set->
CanGet()) {
536 gi_to_taxid_set.clear();
542 if ((! defline_set.
Empty()) && defline_set->
CanGet()) {
544 if (! (*defline)->CanGetSeqid()) {
549 if (! (**seqid).IsGi()) {
554 gi_to_taxid_set[(**seqid).GetGi()].
insert(
564 vector<TTaxId>& taxids,
578 if ((! defline_set.
Empty()) && defline_set->
CanGet()) {
584 if ((*defline)->CanGetSeqid()) {
588 (*defline)->GetSeqid()
590 if ((**seqid).IsGi()) {
592 (*defline)->GetLeafTaxIds();
622 return vol->GetBioseq(vol_oid,
662 if (
buffer->checked_out > 0) {
671 const char ** seq)
const
675 if (index < buffer->results.size()) {
677 *seq =
buffer->results[index].address;
678 return buffer->results[index].length;
683 *seq =
buffer->results[0].address;
684 return buffer->results[0].length;
702 res.
length = vol->GetSequence(vol_oid++, &seq);
703 if (res.
length < 0)
return;
708 buffer->results.push_back(res);
709 res.
length = vol->GetSequence(vol_oid++, &seq);
730 return vol->GetSequence(vol_oid,
buffer);
748 return vol->GetSeqData(vol_oid, begin, end, locked);
765 return vol->GetAmbigSeq(vol_oid,
786 return vol->GetAmbigPartialSeq(vol_oid,
889 _ASSERT((rv & 0x7FFFFFFF) == rv);
908 TGi gi = vol->GetSeqGI(vol_oid, locked);
911 list< CRef<CSeq_id> > ids =
912 vol->GetSeqIDs(vol_oid);
915 return (**id).GetGi();
932 _ASSERT((rv & 0x7FFFFFFF) == rv);
953 _ASSERT((num_oids & 0x7FFFFFFF) == num_oids);
955 return (
int) num_oids;
998 return vol->GetSeqType();
1017 string fmt =
"b d, Y H:m P";
1025 }
else if (d != date) {
1027 CTime t1(date, fmt);
1069 return vol->GetFilteredHeader(vol_oid, locked);
1104 for(
int i = 0;
i < (
int) s.size();
i++) {
1105 if (s[
i] ==
char(0)) {
1140 return vol->GetPig(vol_oid, pig, locked);
1228 return vol->GetGi(vol_oid, gi, locked);
1248 for(
unsigned int i=0;
i <
tmp.size();
i++) {
1251 oids.push_back(
tmp[
i]);
1256 vector<int> vol_oids;
1264 if (vol_oids.empty()) {
1270 ITERATE(vector<int>, iter, vol_oids) {
1271 int oid1 = ((*iter) + vol_start);
1276 if (find(oids.begin(), oids.end(), oid1) != oids.end()) {
1283 oids.push_back(oid1);
1297 vector<blastdb::TOid> oids;
1301 for(
unsigned int i=0;
i < oids.size();
i++) {
1304 rv.push_back(oids[
i]);
1310 "Taxonomy list is not supported in v4 BLAST db");
1326 vector<blastdb::TOid> oids;
1328 oids.push_back(oid);
1338 "Taxonomy list is not supported in v4 BLAST db");
1350 "Taxonomy list is not supported in v4 BLAST db");
1359 oids.resize(accs.size());
1363 for(
unsigned int i=0;
i < oids.size();
i++) {
1374 for(
unsigned int i=0;
i < accs.size();
i++) {
1375 vector<blastdb::TOid>
tmp;
1402 bool is_BL_ORD_ID =
false;
1407 if (dbt.
GetDb() ==
"BL_ORD_ID") {
1408 is_BL_ORD_ID =
true;
1422 for(
unsigned int i=0;
i <
tmp.size();
i++) {
1425 oids.push_back(
tmp[
i]);
1432 vector<int> vol_oids;
1446 if (vol_oids.empty()) {
1452 ITERATE(vector<int>, iter, vol_oids) {
1453 int oid1 = ((*iter) + vol_start);
1459 oids.push_back(oid1);
1480 "OID not in valid range.");
1486 "Residue offset not in valid range.");
1499 if ((first_seq < vol_cnt) && (residue < vol_len)) {
1500 return vol_start + volp->
GetOidAtOffset(first_seq, residue, locked);
1505 vol_start += vol_cnt;
1507 if (first_seq > vol_cnt) {
1508 first_seq -= vol_cnt;
1513 if (residue > vol_len) {
1522 "Could not find valid split point oid.");
1528 vector<string> & paths,
1529 vector<string> * alias_paths,
1533 bool use_atlas_lock =
true;
1573 Uint8 base_count(0);
1588 if (totlen || maxlen || minlen) {
1599 max_count =
max(
len, max_count);
1600 min_count =
min(
len, min_count);
1606 *numseq = oid_count;
1610 *totlen = base_count;
1614 *maxlen = max_count;
1618 *minlen = min_count;
1626 oss <<
"Taxid " << taxid <<
" not found";
1634 Uint8 * total_length,
1675 int * ambig_length)
const
1682 vol->GetRawSeqAndAmbig(vol_oid,
1726 *high_out = high_in;
1729 *count_out = count_in;
1731 if (low_out && (*low_out > low_in)) {
1734 if (high_out && (*high_out < high_in)) {
1735 *high_out = high_in;
1738 *count_out += count_in;
1784 int vlow(0), vhigh(0), vcount(0);
1845 vol->SetOffsetRanges(vol_oid,
1890 vector<int> vol_oids;
1896 if (vol_oids.empty()) {
1902 ITERATE(vector<int>, iter, vol_oids) {
1903 int oid1 = (*iter) + vol_start;
1909 oids.push_back(oid1);
1942 if (! ngis.empty()) {
1945 }
else if (! ntis.empty()) {
1948 }
else if (!stis.empty()) {
1961 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
1962 (!defined(NCBI_COMPILER_MIPSPRO)) )
1975 titles.assign(
all.begin(),
all.end());
1994 vector<int> vol_ids;
2002 vol_ids.push_back(
id);
2087 "This column ID was not found.");
2113 int vol_idx = -1, vol_oid = -1;
2118 if (vol_col_id >= 0) {
2140 template<
class K,
class C>
2143 return c.find(k) != c.end();
2158 string v = iter->second;
2159 vector<string> items;
2162 if (items.size() == 4) {
2167 return & iter->second;
2177 : m_NextId(100), m_Empty(
true), m_CacheRealAlgo(-1)
2186 algorithms.push_back(iter->first);
2192 string real_desc = desc;
2193 vector<string> items;
2195 if (items.size() == 4) {
2196 real_desc = items[2];
2204 if ((! found_id) || (
m_DescToId[real_desc] !=
id)) {
2256 "Cannot find volume in algorithm map.");
2263 "Cannot find volume algorithm in algorithm map.");
2266 return trans[algo_id];
2273 "Cannot find string algorithm id in algorithm map.");
2279 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
2280 (!defined(NCBI_COMPILER_MIPSPRO)) )
2296 algorithms.resize(0);
2319 vector<int> algorithms;
2321 if (algorithms.empty()) {
2327 <<
"Available filtering algorithms applied to database sequences:"
2330 retval << setw(13) << left <<
"Algorithm ID"
2331 << setw(40) << left <<
"Algorithm name"
2332 << setw(40) << left <<
"Algorithm options" << endl;
2333 ITERATE(vector<int>, algo_id, algorithms) {
2334 string algo, algo_opts, algo_name;
2336 if (algo_opts.empty()) {
2337 algo_opts.assign(
"default options used");
2340 retval << setw(13) << left << (*algo_id)
2341 << setw(40) << left << algo_name
2342 << setw(40) << left << algo_opts << endl;
2344 retval << setw(13) << left << (*algo_id)
2345 << setw(40) << left <<
algo
2346 << setw(40) << left << algo_opts << endl;
2360 string & program_name,
2364 if (enum_type_vals ==
NULL) {
2365 enum_type_vals = GetTypeInfo_enum_EBlast_filter_program();
2369 vector<string> items;
2372 if (items.size() == 2) {
2375 program.assign(items[0]);
2376 program_name.assign(enum_type_vals->
FindName(pid,
false));
2378 }
else if (items.size() == 4) {
2384 "Error in stored mask algorithm description data.");
2390 string & program_name,
2411 if (found ==
false) {
2413 oss <<
"Filtering algorithm ID " << algorithm_id
2414 <<
" is not supported." << endl;
2450 if (vol_col_id < 0) {
2464 oss <<
"Error: volume (" << volp->
GetVolName()
2465 <<
") mask data has duplicates value (" << *dup <<
")";
2472 const string & desc1 = iter->second;
2492 const void * src = (
const void *) blob.
ReadRaw(
n*8);
2497 template<
class TRead>
2504 for(
int rng = 0; rng < num_ranges; rng++) {
2507 if (
algo == vol_algo) {
2511 int skip_amt = num_pairs * 2 * TRead::numeric_size;
2537 int vol_oid = 0, vol_idx = -1;
2551 if (blob.
Size() != 0) {
2555 int vol_algo_id = -1;
2563 s_ReadRanges<SReadInt4>(vol_algo_id, ranges, blob);
2575 if (num_threads < 1) {
2577 }
else if (num_threads == 1) {
2578 num_threads = force_mt ? 1 : 0;
2583 for (
int thread =
m_NumThreads; thread < num_threads; ++thread) {
2594 for (
int thread = num_threads; thread <
m_NumThreads; ++thread) {
2632 for (
int vol = 0; vol < nvols; ++vol) {
2640 for (
int vol = 0; vol < nvols; ++vol) {
2697 if ((! defline_set.
Empty()) && defline_set->
CanGet()) {
2699 if (! (*defline)->CanGetSeqid()) {
2704 if((*df_seqid)->Match(seq_id)) {
2706 if(!df_taxids.
empty()) {
2722 for (
unsigned int i=0;
i < oids.size();
i++) {
2726 if (!taxid_set.
empty()) {
2727 taxids.insert(taxids.begin(), taxid_set.
begin(), taxid_set.
end());
Declaration of ADT to retrieve sequences for the BLAST engine.
#define BLAST_SEQSRC_MINLENGTH
Default minimal sequence length.
`Blob' Class for SeqDB (and WriteDB).
int GetReadOffset() const
Get the current read pointer offset.
Int4 ReadInt4()
Read a 4 byte integer at the pointer (and move the pointer).
int Size() const
Get size of blob contents.
void Clear()
Clear all owned data and reference an empty string.
void SeekRead(int offset)
Move the read pointer to a specific location.
const char * ReadRaw(int size)
Read raw data (moving the read pointer).
void SetFrame(const string &frame)
void Log(const string &name, const char *value, CDebugDumpFormatter::EValueType type=CDebugDumpFormatter::eValue, const string &comment=kEmptyStr)
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
CObjectIStreamAsnBinary –.
void GetAliasFileValues(TAliasFileValues &afv, const CSeqDBVolSet &volset)
Get Name/Value Data From Alias Files.
Int8 GetNumSeqsStats(const CSeqDBVolSet &volset) const
Get the number of sequences available.
Uint8 GetTotalLength(const CSeqDBVolSet &volset) const
Get the total length of the set of databases.
Uint8 GetTotalLengthStats(const CSeqDBVolSet &volset) const
Get the total length of the set of databases.
bool NeedTotalsScan(const CSeqDBVolSet &volset) const
Check whether a db scan is needed to compute correct totals.
Int4 GetMinLength(const CSeqDBVolSet &volset) const
Get the length of the shortest sequence.
Int8 GetNumOIDs(const CSeqDBVolSet &volset) const
Get the size of the OID range.
void GetMaskList(vector< string > &mask_list)
Get Gi-based Mask Names From Alias Files.
void FindVolumePaths(vector< string > &vols, vector< string > *alias, bool recursive) const
Find the base names of volumes.
bool HasFilters()
Check if any volume filtering exists.
string GetTitle(const CSeqDBVolSet &volset) const
Get the title.
Int8 GetNumSeqs(const CSeqDBVolSet &volset) const
Get the number of sequences available.
CRef< CSeqDB_FilterTree > GetFilterTree()
Get filtering tree for all volumes.
Guard object for the SeqDBAtlas singleton.
CSeqDBAtlas & Get()
Get the CSeqDBAtlas object.
static void RetRegion(const char *datap)
Free allocated memory.
Uint8 GetSliceSize()
Get the current slice size.
void Lock(CSeqDBLockHold &locked)
Lock the atlas.
void Unlock(CSeqDBLockHold &locked)
Unlock the atlas.
int GetNumGis() const
Get the number of GIs in the array.
void GetGiList(vector< TGi > &gis) const
Get the gi list.
void GetTiList(vector< TTi > &tis) const
Get the ti list.
int GetNumTis() const
Get the number of TIs in the array.
void GetMaskData(int algo_id, TGi gi, CSeqDB::TSequenceRanges &ranges, CSeqDBLockHold &locked)
Get the mask data for GI.
int GetAlgorithmId(const string &algo_name) const
Get the mask algorithm id for a string id.
const string & GetDesc(int algo_id, CSeqDBLockHold &locked)
Get the mask description for algo id.
void GetAvailableMaskAlgorithms(vector< int > &algo) const
Get the available mask algorithm ids.
SeqDB ID list for performing boolean set operations.
bool Blank() const
Check if an ID list is blank.
void GetTaxIDs(int oid, map< TGi, TTaxId > &gi_to_taxid, bool persist)
Get gi to taxid map for an OID.
int m_NumSeqsStats
Number of sequences in the overall database.
void GetDBTaxIds(set< TTaxId > &tax_ids)
Get all unique tax ids from db.
CSeqDBAliasFile m_Aliases
Alias node hierarchy management object.
char GetSeqType() const
Get the sequence type.
int x_GetSeqBuffer(SSeqResBuffer *buffer, int oid, const char **seq) const
Get sequence from buffer.
CRef< CSeqDBGiList > m_UserGiList
The User GI list for the entire CSeqDB object.
void x_InitIdSet()
Initialize Id Set.
int GetOidAtOffset(int first_seq, Uint8 residue) const
Find the OID corresponding to the offset given in residues, into the database as a whole.
void GetRawSeqAndAmbig(int oid, const char **buffer, int *seq_length, int *ambig_length) const
Raw Sequence and Ambiguity Data.
int GetMinLength() const
Returns the length of the smallest sequence in the database.
int m_NumThreads
number of thread clients
void x_RetSeqBuffer(SSeqResBuffer *buffer) const
Return sequence to buffer.
CSeqDBIdSet m_IdSet
The positive or negative ID list for the entire CSeqDB object.
int x_GetCacheID(CSeqDBLockHold &locked) const
Get local cache ID for current thread.
int x_GetMinLength() const
Returns the shortest sequence lengths of all volumes.
void GetLeafTaxIDs(int oid, map< TGi, set< TTaxId > > &gi_to_taxid_set, bool persist)
Get gi to taxid map for an OID.
string m_Date
Cached most recent date string for GetDate().
void AccessionsToOids(const vector< string > &accs, vector< blastdb::TOid > &oids)
CRef< CSeqDBOIDList > m_OIDList
The list of included OIDs (construction is deferred).
EBlastDbVersion GetBlastDbVersion() const
Return blast db version.
CSeqDBImpl(const string &db_name_list, char prot_nucl, int oid_begin, int oid_end, CSeqDBGiList *gi_list, CSeqDBNegativeList *neg_list, CSeqDBIdSet idset, bool use_atlas_lock)
Standard Constructor.
void x_BuildMaskAlgorithmList(CSeqDBLockHold &locked)
Get a list of algorithm IDs for which mask data exists.
int m_MaskDataColumn
Column ID for mask data column.
int GetMaskAlgorithmId(const string &algo_name)
Get the numeric ID for a algorithm name.
CFastMutex m_OIDLock
Mutex which synchronizes access to the OID list.
int GetColumnId(const string &title)
Get an ID number for a given column title.
bool GiToOidwFilterCheck(TGi gi, int &oid)
GiToOid is meant to simply return the oid for a gi if one exists. This method finds the oid and checks if...
void GetColumnBlob(int col_id, int oid, bool keep, CBlastDbBlob &blob)
Fetch the data blob for the given column and oid.
void GetStringBounds(string *low_id, string *high_id, int *count)
Get String Bounds.
void SetIterationRange(int oid_begin, int oid_end)
Set Iteration Range.
TGi x_GetSeqGI(int oid, CSeqDBLockHold &locked)
Look up for the GI of a sequence.
int GetNumOIDs() const
Returns the size of the (possibly sparse) OID range.
Uint8 m_TotalLength
Total length of database (in bases).
void GetMaskAlgorithmDetails(int algorithm_id, string &program, string &program_name, string &algo_opts)
Get information about one type of masking available here.
bool m_UseGiMask
Which type of masks are we using?
CSeqDBAtlas & m_Atlas
Reference to memory management layer.
CRef< CSeqDBNegativeList > m_NegativeList
The Negative ID list for the entire CSeqDB object.
void SetNumberOfThreads(int num_threads, bool force_mt=false)
Set the number of thread clients.
TGi GetSeqGI(int oid)
Look up for the GI of a sequence.
int GetMaxLength() const
Returns the length of the largest sequence in the database.
int m_RestrictEnd
Ending OID as provided to the constructor.
void TaxIdsToOids(set< TTaxId > &tax_ids, vector< blastdb::TOid > &rv)
Get Oid list for input tax ids.
void RetAmbigSeq(const char **buffer) const
Returns any resources associated with the sequence.
void FlushOffsetRangeCache()
Flush all offset ranges cached.
void GetAllTaxIDs(int oid, set< TTaxId > &taxids)
Get all tax ids (leaf and non-leaf) for an oid.
Uint8 GetVolumeLength() const
Returns the sum of the lengths of all volumes.
void GetLMDBFileNames(vector< string > &lmdb_list) const
const string & GetDBNameList() const
Get list of database names.
void x_FillSeqBuffer(SSeqResBuffer *buffer, int oid) const
Fill up the buffer.
CObjectIStreamAsnBinary * reusable_inpstr
unsigned GetSequenceHash(int oid)
Get the sequence hash for a given OID.
char m_SeqType
Type of sequences used by this instance.
void x_GetTaxIdsForSeqId(const CSeq_id &seq_id, int oid, CBlast_def_line::TTaxIds &taxid_set)
CRef< CBioseq > GetBioseq(int oid, TGi target_gi, const CSeq_id *target_seq_id, bool seqdata)
Get a CBioseq for a sequence.
static void FindVolumePaths(const string &dbname, char prot_nucl, vector< string > &paths, vector< string > *alias_paths, bool recursive, bool expand_links)
Find volume paths.
void ListColumns(vector< string > &titles)
List columns titles found in this database.
int GetAmbigPartialSeq(int oid, char **buffer, int nucl_code, ESeqDBAllocType alloc_type, CSeqDB::TSequenceRanges *partial_ranges, CSeqDB::TSequenceRanges *masks) const
bool m_OidListSetup
True if OID list setup is done (or was not required).
Uint8 m_TotalLengthStats
Total length of database (in bases).
bool OidToPig(int oid, int &pig) const
Translate an OID to a PIG.
void GetAliasFileValues(TAliasFileValues &afv)
Get Name/Value Data From Alias Files.
Uint8 x_GetTotalLength() const
Returns the sum of the lengths of all available sequences.
Uint8 GetExactTotalLength()
Returns the exact sum of the lengths of all available sequences.
void GetTaxIdsForSeqId(const CSeq_id &seq_id, vector< TTaxId > &taxids)
CRef< CSeqDBGiMask > m_GiMask
Gi-based mask.
CRef< CSeq_data > GetSeqData(int oid, TSeqPos begin, TSeqPos end) const
Fetch data as a CSeq_data object.
map< string, int > m_ColumnTitleMap
Map string column titles to global column IDs.
const map< string, string > & GetColumnMetaData(int column_id)
Get all metadata for the specified column.
CRef< CBlast_def_line_set > x_GetHdr(int oid, CSeqDBLockHold &locked)
Get the sequence header data.
Uint8 x_GetVolumeLength() const
Returns the sum of the lengths of all volumes.
int GetSequence(int oid, const char **buffer) const
Get the sequence data for a sequence.
void x_GetOidList(CSeqDBLockHold &locked)
Build the OID list.
int m_NextChunkOID
"Bookmark" for multithreaded chunk-type OID iteration.
static void GetTaxInfo(TTaxId taxid, SSeqDBTaxInfo &info)
Get taxonomy information.
int x_GetNumOIDs() const
Returns the size of the (possibly sparse) OID range.
void RetSequence(const char **buffer) const
Returns any resources associated with the sequence.
string GetAvailableMaskAlgorithmDescriptions()
Returns a formatted string with the list of available masking algorithms in this database for display...
bool CheckOrFindOID(int &next_oid)
Find an included OID, incrementing next_oid if necessary.
void GetPigBounds(int *low_id, int *high_id, int *count)
Get PIG Bounds.
void SeqidToOids(const CSeq_id &seqid, vector< int > &oids, bool multi)
Translate a CSeq-id to a list of OIDs.
int GetNumSeqsStats() const
Returns the number of sequences available.
@ kUnknownTitle
This column is not heard of yet.
@ kColumnNotFound
This column does not exist (we checked).
int x_GetMaxLength() const
Returns the longest sequence lengths of all volumes.
void SetVolsOidMaskType(int oid_mask_type)
int GetAmbigSeq(int oid, char **buffer, int nucl_code, SSeqDBSlice *region, ESeqDBAllocType strategy, CSeqDB::TSequenceRanges *masks=NULL) const
Get a pointer to a range of sequence data with ambiguities.
int m_MaxLength
Longest database sequence.
int m_RestrictBegin
Starting OID as provided to the constructor.
void GetGiBounds(TGi *low_id, TGi *high_id, int *count)
Get GI Bounds.
void GetAvailableMaskAlgorithms(vector< int > &algorithms)
Get a list of algorithm IDs for which mask data exists.
int GetSeqLengthApprox(int oid) const
Get the approximate sequence length.
Uint8 m_VolumeLength
Total length of all database volumes combined (in bases).
void GetTaxIdsForOids(const vector< blastdb::TOid > &oids, set< TTaxId > &tax_ids)
void DebugDump(CDebugDumpContext ddc, unsigned int depth) const
Dump debug information for this object.
Uint8 x_GetTotalLengthStats() const
Returns the sum of the lengths of all available sequences.
void GetTotals(ESummaryType sumtype, int *oid_count, Uint8 *total_length, bool use_approx)
Returns the sum of the sequence lengths.
CSeqDB_IdRemapper m_AlgorithmIds
Algorithm ID mapping.
bool x_CheckOrFindOID(int &next_oid, CSeqDBLockHold &locked)
Get the next included oid.
int m_MinLength
Shortest database sequence.
void GetMaskData(int oid, int algo_id, CSeqDB::TSequenceRanges &ranges)
Get masked ranges of a sequence.
int x_GetColumnId(const string &title, CSeqDBLockHold &locked)
Get the Column ID for the column with the specified title.
bool TiToOid(Int8 ti, int &oid)
Translate a TI to an OID.
Uint8 GetTotalLengthStats() const
Returns the sum of the lengths of all available sequences.
int x_GetSeqLength(int oid) const
Get the sequence length.
void SetVolsMemBit(int mbit)
Set the membership bit of all volumes.
Uint8 m_ExactTotalLength
Total length of database (in bases).
bool GiToOid(TGi gi, int &oid) const
Translate a GI to an OID.
void HashToOids(unsigned hash, vector< int > &oids)
Get the OIDs for a given sequence hash.
void SetOffsetRanges(int oid, const TRangeList &offset_ranges, bool append_ranges, bool cache_data)
Apply a range of offsets to a database sequence.
list< CRef< CSeq_id > > GetSeqIDs(int oid)
Gets a list of sequence identifiers.
void AccessionToOids(const string &acc, vector< int > &oids)
Find OIDs matching the specified string.
int x_GetNumSeqs() const
Returns the number of sequences available.
string m_DBNames
The list of database names provided to the constructor.
void FlushSeqMemory()
Flush unnecessarily held memory.
CRef< CBlast_def_line_set > GetHdr(int oid)
Get the sequence header data.
vector< SSeqResBuffer * > m_CachedSeqs
Cached sequences.
int GetNumSeqs() const
Returns the number of sequences available.
vector< CRef< CSeqDB_ColumnEntry > > m_ColumnInfo
Map assigned global column IDs to column information.
int x_GetNumSeqsStats() const
Returns the number of sequences available.
Uint8 GetTotalLength() const
Returns the sum of the lengths of all available sequences.
std::map< int, int > m_CacheID
mapping thread ID to storage ID
int x_GetMaskDataColumn(CSeqDBLockHold &locked)
Open the mask data column (if necessary) and return its id.
bool PigToOid(int pig, int &oid) const
Translate a PIG to an OID.
CSeqDB::EOidListType GetNextOIDChunk(int &begin_chunk, int &end_chunk, int oid_size, vector< int > &oid_list, int *oid_state)
Return a chunk of OIDs, and update the OID bookmark.
string GetDate() const
Returns the construction date of the database.
CSeqDBVolSet m_VolSet
Set of volumes used by this database instance.
string GetTitle() const
Returns the database title.
void ResetInternalChunkBookmark()
Restart chunk iteration at the beginning of the database.
string x_FixString(const string &s) const
Adjust string length to offset of first embedded NUL byte.
int m_NumOIDs
Size of databases OID range.
bool m_NeedTotalsScan
True if this configuration cannot deduce totals without a scan.
int m_NumSeqs
Number of sequences in the overall database.
void x_ScanTotals(bool approx, int *seq_count, Uint8 *base_count, int *max_count, int *min_count, CSeqDBLockHold &locked)
Compute totals via iteration.
int GetSeqLength(int oid) const
Get the sequence length.
CSeqDBIdSet GetIdSet()
Get IdSet list attached to this database.
bool OidToGi(int oid, TGi &gi)
Translate an OID to a GI.
void AccessionToOids(const string &acc, vector< TOid > &oids) const
void GetTaxIdsForOids(const vector< blastdb::TOid > &oids, set< TTaxId > &tax_ids) const
void GetLMDBFileNames(vector< string > &lmdb_list) const
void AccessionsToOids(const vector< string > &accs, vector< TOid > &oids) const
bool IsBlastDBVersion5() const
void GetDBTaxIds(set< TTaxId > &tax_ids) const
void TaxIdsToOids(set< TTaxId > &tax_ids, vector< blastdb::TOid > &rv) const
const vector< string > & GetSiList()
const vector< TTi > & GetTiList()
Build ID set for this negative list.
const vector< TGi > & GetGiList()
Build ID set for this negative list.
void UnLease()
Deallocate the memory ranges owned by this object.
bool CheckOrFindOID(TOID &next_oid) const
Find an included oid from the specified point.
static bool GetTaxNames(TTaxId tax_id, SSeqDBTaxInfo &info)
Get the taxonomy names for a given tax id.
void OptimizeGiLists()
Optimize the GI list configuration.
void UnLease()
Return storage held by the volumes.
const CSeqDBVol * GetVol(int i) const
Find a volume by index.
Uint8 GetVolumeSetLength() const
Find total volume length for all volumes.
CSeqDBVol * GetVolNonConst(int i)
Find a volume by index.
int GetNumVols() const
Get the number of volumes.
CSeqDBVol * FindVol(int oid, int &vol_oid) const
Find a volume by OID.
int GetNumOIDs() const
Get the size of the OID range.
int GetVolOIDStart(int i) const
Get the first OID in a volume.
void SeqidToOids(CSeq_id &seqid, vector< int > &oids, CSeqDBLockHold &locked) const
Find OIDs for the specified Seq-id.
void AccessionToOids(const string &acc, vector< int > &oids, CSeqDBLockHold &locked) const
Find OIDs for the specified accession or formatted Seq-id.
void GetColumnBlob(int col_id, int oid, CBlastDbBlob &blob, bool keep, CSeqDBLockHold &locked)
Fetch the data blob for the given column and oid.
const string & GetVolName() const
Get the volume name.
int GetSeqLengthExact(int oid) const
Exact sequence length for nucleotide databases.
void OpenSeqFile(CSeqDBLockHold &locked) const
Open sequence file.
int GetColumnId(const string &title, CSeqDBLockHold &locked)
Get an ID number for a given column title.
int GetNumOIDs() const
Get the number of OIDs for this volume.
void GetPigBounds(int &low_id, int &high_id, int &count, CSeqDBLockHold &locked) const
Get PIG Bounds.
void FlushOffsetRangeCache()
Flush all offset ranges cached.
void ListColumns(set< string > &titles, CSeqDBLockHold &locked)
List the titles of all columns for this volume.
int GetSeqLengthApprox(int oid) const
Approximate sequence length for nucleotide databases.
int GetSeqLengthProt(int oid) const
Sequence length for protein databases.
void SetOidMaskType(int oid_masks) const
bool GiToOid(TGi gi, int &oid, CSeqDBLockHold &locked) const
Find the OID given a GI.
void GetGiBounds(TGi &low_id, TGi &high_id, int &count, CSeqDBLockHold &locked) const
Get GI Bounds.
int GetOidAtOffset(int first_seq, Uint8 residue, CSeqDBLockHold &locked) const
Find the OID at a given index into the database.
const map< string, string > & GetColumnMetaData(int col_id, CSeqDBLockHold &locked)
Get all metadata for the specified column.
bool PigToOid(int pig, int &oid) const
Find the OID given a PIG.
bool TiToOid(Int8 ti, int &oid, CSeqDBLockHold &locked) const
Find the OID given a TI.
void SetMemBit(int mbit) const
Set the MEMB_BIT filtering for this volume.
Uint8 GetVolumeLength() const
Get the total length of this volume (in bases).
string GetDate() const
Get the formatting date of the volume.
void GetStringBounds(string &low_id, string &high_id, int &count) const
Get String Bounds.
void HashToOids(unsigned hash, vector< int > &oids, CSeqDBLockHold &locked) const
Get the OIDs for a given sequence hash.
Something else yet again etc.
int GetEnd() const
Get OID after last included OID.
@ eOidRange
OID Range [start, end).
int GetBegin() const
Get first included OID.
Database-wide column information.
const map< string, string > & GetMap()
Get the metadata map.
void SetHaveMap()
Indicate that the metadata map is now complete.
int GetVolumeIndex(int volnum)
Get a volume-specific column ID.
bool HaveMap()
Determine if we have the metadata map yet.
void SetMapValue(const string &k, const string &v)
Add a meta-data key/value association.
bool HasFilter() const
Check whether this tree represents any volume filtering.
const vector< CRef< CSeqDB_FilterTree > > & GetNodes() const
Get child nodes attached to this node.
vector< CRef< CSeqDB_AliasMask > > TFilters
Type used to store lists of filters found here.
int m_CacheRealAlgo
Cached list of real algorithms for BuildVolAlgos.
int GetVolAlgo(int vol_idx, int algo_id)
Build a list of volume algorithm IDs.
bool GetDesc(int algorithm_id, string &desc)
Is this object populated?
void AddMapping(int vol_id, int id, const string &desc)
Register a volume's algorithm definition.
map< int, string > m_IdToDesc
Map of real IDs to descriptions.
void GetIdList(vector< int > &algorithms)
Get a list of user (real) IDs available here.
map< string, int > m_DescToId
Map of descriptions to real IDs.
CSeqDB_IdRemapper()
Constructor.
int GetAlgoId(const string &id)
Translate a string algorithm ID to a numeric algorithm ID.
int m_CacheVolAlgo
Cached list of volume algorithms for BuildVolAlgos.
int RealToVol(int vol_idx, int algo_id)
Translate a real algorithm ID to a volume algorithm ID.
void SetNotEmpty()
Is this object populated?
map< int, map< int, int > > m_RealIdToVolumeId
Map of volume# to map of real id to volume-based id.
int m_CacheVolIndex
Cached volume index for BuildVolAlgos.
bool Empty()
Is this object populated?
int m_NextId
Next unassigned synthetic ID.
EOidListType
Indicates how block of OIDs was returned.
ESummaryType
Types of summary information available.
@ eUnfilteredAll
Sum of all sequences, ignoring GI and OID lists and alias files.
@ eFilteredRange
Sum of included sequences with OIDs within the iteration range.
@ eFilteredAll
Values from alias files, or summation over all included sequences.
static const string kOidNotFound
String containing the error message in exceptions thrown when a given OID cannot be found.
iterator_bool insert(const value_type &val)
const_iterator begin() const
const_iterator end() const
static unsigned char depth[2 *(256+1+29)+1]
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
SStrictId_Tax::TId TTaxId
Taxon id type.
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
void Info(CExceptionArgs_Base &args)
const string & FindName(TEnumValueType value, bool allowBadValue) const
Find name of the enum by its numeric value.
void Read(CObjectIStream &in, TObjectPtr object, const CTypeRef &type)
const string AsFastaString(void) const
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
virtual void DebugDump(CDebugDumpContext ddc, unsigned int depth) const
Define method for dumping debug information.
void Reset(void)
Reset reference object.
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and having a non-null value.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
int64_t Int8
8-byte (64-bit) signed integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static TNumeric StringToNumeric(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to a numeric value.
static string & Replace(const string &src, const string &search, const string &replace, string &dst, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
@ fConvErr_NoThrow
Do not throw an exception on error.
EBlast_filter_program
This defines the possible sequence filtering algorithms to be used in a BLAST database.
bool CanGet(void) const
Check if it is safe to call Get method.
const Tdata & Get(void) const
Get the member data.
bool CanGetDb(void) const
Check if it is safe to call GetDb method.
const TDb & GetDb(void) const
Get the Db member data.
bool IsPrf(void) const
Check if variant Prf is selected.
E_Choice Which(void) const
Which variant is currently selected.
const TGeneral & GetGeneral(void) const
Get the variant data.
bool IsPir(void) const
Check if variant Pir is selected.
@ e_General
for other databases
char * dbname(DBPROCESS *dbproc)
Get name of current database.
unsigned int
A callback function used to compare two keys in a database.
const TYPE & Get(const CNamedParameterList *param)
map< string, string > TStringMap
#define INT4_MAX
Largest number represented by signed int.
bool approx(T x_, T y_, T eps_)
std::istream & in(std::istream &in_, double &x_)
static pcre_uint8 * buffer
#define INIT_CLASS_MARK()
Marker initializer for constructor.
#define CHECK_MARKER()
Assertion to verify the marker.
#define BREAK_MARKER()
Make the marker of this class invalid.
ESeqDBAllocType
Certain methods have an "Alloc" version.
const blastdb::TOid kSeqDBEntryNotFound
Int4 TOid
Ordinal ID in BLAST databases.
const int kSeqDBNuclNcbiNA8
Used to request ambiguities in Ncbi/NA8 format.
bool IsStringId(const CSeq_id &id)
Determine if id is a string id.
unsigned SeqDB_SequenceHash(const char *sequence, int length)
Returns a hash value computed from the given sequence data.
EBlastDbVersion
BLAST database version.
const U & SeqDB_MapFind(const std::map< T, U > &m, const T &k, const U &dflt)
Find a map value or return a default.
static const string * s_CheckUniqueValues(const map< string, string > &m)
void s_ReadRanges(int vol_algo, CSeqDB::TSequenceRanges &ranges, CBlastDbBlob &blob)
static bool s_IsNumericId(const string &id)
static void s_GetDetails(const string &desc, string &program, string &program_name, string &algo_opts)
bool s_Contains(const C &c, const K &k)
void s_AccumulateMinMaxCount(TId low_in, TId high_in, int count_in, TId *low_out, TId *high_out, int *count_out, bool set_all)
Accumulate optional min, max, and count.
static const string s_RestoreColon(const string &in)
The top level of the private implementation layer for SeqDB.
Structure to buffer multiple TSeqRes.
Structure to keep sequence retrieval results.
List of sequence offset ranges.
void append(const void *src, size_type num_elements)
Append extra elements at the end.
static void Read(CBlastDbBlob &blob, int n, CSeqDB::TSequenceRanges &ranges)
static int Read(CBlastDbBlob &blob)
OID-Range type to simplify interfaces.