NCBI C++ ToolKit
|
Search Toolkit Book for CSeqDB
#include <objtools/blast/seqdb_reader/seqdb.hpp>
Classes | |
struct | TOffsetPair |
Structure to represent a range. More... | |
struct | TSequenceRanges |
List of sequence offset ranges. More... | |
Public Types | |
enum | EOidListType { eOidList , eOidRange } |
Indicates how block of OIDs was returned. More... | |
enum | ESeqType { eProtein , eNucleotide , eUnknown } |
Sequence types (eUnknown tries protein, then nucleotide). More... | |
enum | ESummaryType { eUnfilteredAll , eFilteredAll , eFilteredRange } |
Types of summary information available. More... | |
enum | EMmapFileTypes { eMmap_IndexFile , eMmap_SequenceFile } |
File type for which mmap strategy may be set. More... | |
enum | EMmapStrategies { eMmap_Normal , eMmap_Sequential , eMmap_WillNeed } |
Permitted mmap strategies. More... | |
typedef TSeqDBAliasFileValues | TAliasFileValues |
Import type to allow shorter name. More... | |
typedef int | TOID |
Sequence type accepted and returned for OID indices. More... | |
typedef int | TPIG |
Sequence type accepted and returned for PIG indices. More... | |
typedef TGi | TGI |
Sequence type accepted and returned for GI indices. More... | |
typedef set< pair< int, int > > | TRangeList |
List of sequence offset ranges. More... | |
Public Types inherited from CObject | |
enum | EAllocFillMode { eAllocFillNone = 1 , eAllocFillZero , eAllocFillPattern } |
Control filling of newly allocated memory. More... | |
typedef CObjectCounterLocker | TLockerType |
Default locker type for CRef. More... | |
typedef atomic< Uint8 > | TCounter |
Counter type is CAtomicCounter. More... | |
typedef Uint8 | TCount |
Alias for value type of counter. More... | |
Public Member Functions | |
CSeqDB (const string &dbname, ESeqType seqtype, CSeqDBGiList *gilist=0, bool use_atlas_lock=true) | |
Short Constructor. More... | |
CSeqDB (const string &dbname, ESeqType seqtype, CSeqDBNegativeList *nlist) | |
Short Constructor with Negative ID list. More... | |
CSeqDB (const string &dbname, ESeqType seqtype, CSeqDBGiList *gilist, CSeqDBNegativeList *nlist) | |
Short Constructor with Positive and Negative ID list. More... | |
CSeqDB (const string &dbname, ESeqType seqtype, int oid_begin, int oid_end, CSeqDBGiList *gilist, CSeqDBNegativeList *nlist) | |
Short Constructor with Positive and Negative ID list with oid range. More... | |
CSeqDB (const string &dbname, ESeqType seqtype, CSeqDBIdSet ids) | |
Short Constructor with Computed ID list. More... | |
CSeqDB (const vector< string > &dbs, ESeqType seqtype, CSeqDBGiList *gilist=0) | |
Short Constructor. More... | |
CSeqDB (const string &dbname, ESeqType seqtype, int oid_begin, int oid_end, bool use_mmap, CSeqDBGiList *gi_list=0) | |
Constructor with MMap Flag and OID Range. More... | |
CSeqDB (const vector< string > &dbname, ESeqType seqtype, int oid_begin, int oid_end, bool use_mmap, CSeqDBGiList *gi_list=0) | |
Constructor with MMap Flag and OID Range. More... | |
~CSeqDB () | |
Destructor. More... | |
int | GetSeqLength (int oid) const |
Returns the sequence length in base pairs or residues. More... | |
TGi | GetSeqGI (int oid) const |
Returns the first Gi (if any) of the sequence. More... | |
int | GetSeqLengthApprox (int oid) const |
Returns an unbiased, approximate sequence length. More... | |
CRef< CBlast_def_line_set > | GetHdr (int oid) const |
Get the ASN.1 header for the sequence. More... | |
void | GetLeafTaxIDs (int oid, map< TGi, set< TTaxId > > &gi_to_taxid_set, bool persist=false) const |
Get taxid for an OID. More... | |
void | GetLeafTaxIDs (int oid, vector< TTaxId > &taxids, bool persist=false) const |
Get taxids for an OID. More... | |
void | GetTaxIDs (int oid, map< TGi, TTaxId > &gi_to_taxid, bool persist=false) const |
Get taxid for an OID. More... | |
void | GetTaxIDs (int oid, vector< TTaxId > &taxids, bool persist=false) const |
Get taxids for an OID. More... | |
void | GetAllTaxIDs (int oid, set< TTaxId > &taxids) const |
Get all tax ids for an oid. More... | |
CRef< CBioseq > | GetBioseq (int oid, TGi target_gi=ZERO_GI, const CSeq_id *target_seq_id=NULL) const |
Get a CBioseq for a sequence. More... | |
CRef< CBioseq > | GetBioseqNoData (int oid, TGi target_gi=ZERO_GI, const CSeq_id *target_seq_id=NULL) const |
Get a CBioseq for a sequence without sequence data. More... | |
int | GetSequence (int oid, const char **buffer) const |
Get a pointer to raw sequence data. More... | |
int | GetAmbigSeq (int oid, const char **buffer, int nucl_code) const |
Get a pointer to sequence data with ambiguities. More... | |
int | GetAmbigSeq (int oid, const char **buffer, int nucl_code, int begin_offset, int end_offset) const |
Get a pointer to a range of sequence data with ambiguities. More... | |
int | GetAmbigSeqAlloc (int oid, char **buffer, int nucl_code, ESeqDBAllocType strategy, TSequenceRanges *masks=NULL) const |
Get a pointer to sequence data with ambiguities. More... | |
int | GetAmbigPartialSeq (int oid, char **buffer, int nucl_code, ESeqDBAllocType strategy, TSequenceRanges *partial_ranges, TSequenceRanges *masks=NULL) const |
void | RetSequence (const char **buffer) const |
Returns any resources associated with the sequence. More... | |
void | RetAmbigSeq (const char **buffer) const |
Returns any resources associated with the sequence. More... | |
list< CRef< CSeq_id > > | GetSeqIDs (int oid) const |
Gets a list of sequence identifiers. More... | |
void | GetGis (int oid, vector< TGi > &gis, bool append=false) const |
Gets a list of GIs for an OID. More... | |
ESeqType | GetSequenceType () const |
Returns the type of database opened - protein or nucleotide. More... | |
string | GetTitle () const |
Returns the database title. More... | |
string | GetDate () const |
Returns the construction date of the database. More... | |
int | GetNumSeqs () const |
Returns the number of sequences available. More... | |
int | GetNumSeqsStats () const |
Returns the number of sequences available. More... | |
int | GetNumOIDs () const |
Returns the size of the (possibly sparse) OID range. More... | |
Uint8 | GetTotalLength () const |
Returns the sum of the lengths of all available sequences. More... | |
Uint8 | GetExactTotalLength () |
Returns the exact sum of the lengths of all available sequences. More... | |
Uint8 | GetTotalLengthStats () const |
Returns the sum of the lengths of all available sequences. More... | |
Uint8 | GetVolumeLength () const |
Returns the sum of the lengths of all volumes. More... | |
void | GetTotals (ESummaryType sumtype, int *oid_count, Uint8 *total_length, bool use_approx=true) const |
Returns the sum of the sequence lengths. More... | |
int | GetMaxLength () const |
Returns the length of the largest sequence in the database. More... | |
int | GetMinLength () const |
Returns the length of the shortest sequence in the database. More... | |
CSeqDBIter | Begin () const |
Returns a sequence iterator. More... | |
bool | CheckOrFindOID (int &next_oid) const |
Find an included OID, incrementing next_oid if necessary. More... | |
EOidListType | GetNextOIDChunk (int &begin_chunk, int &end_chunk, int oid_size, vector< int > &oid_list, int *oid_state=NULL) |
Return a chunk of OIDs, and update the OID bookmark. More... | |
void | ResetInternalChunkBookmark () |
Resets this object's internal chunk bookmark, which is used when the oid_state argument to GetNextOIDChunk is NULL. More... | |
const string & | GetDBNameList () const |
Get list of database names. More... | |
const CSeqDBGiList * | GetGiList () const |
Get GI list attached to this database. More... | |
CSeqDBIdSet | GetIdSet () const |
Get IdSet list attached to this database. More... | |
bool | PigToOid (int pig, int &oid) const |
Translate a PIG to an OID. More... | |
bool | OidToPig (int oid, int &pig) const |
Translate an OID to a PIG. More... | |
bool | TiToOid (Int8 ti, int &oid) const |
Translate a TI to an OID. More... | |
bool | OidToGi (int oid, TGi &gi) const |
Translate an OID to a GI. More... | |
bool | GiToOid (TGi gi, int &oid) const |
Translate a GI to an OID. More... | |
bool | GiToOidwFilterCheck (TGi gi, int &oid) const |
Translate a GI To an OID with filter check. More... | |
bool | GiToPig (TGi gi, int &pig) const |
Translate a GI to a PIG. More... | |
bool | PigToGi (int pig, TGi &gi) const |
Translate a PIG to a GI. More... | |
void | AccessionToOids (const string &acc, vector< int > &oids) const |
Translate an Accession to a list of OIDs. More... | |
void | AccessionsToOids (const vector< string > &accs, vector< blastdb::TOid > &oids) const |
void | SeqidToOids (const CSeq_id &seqid, vector< int > &oids) const |
Translate a Seq-id to a list of OIDs. More... | |
bool | SeqidToOid (const CSeq_id &seqid, int &oid) const |
Translate a Seq-id to any matching OID. More... | |
int | GetOidAtOffset (int first_seq, Uint8 residue) const |
Find the sequence closest to the given offset into the database. More... | |
CRef< CBioseq > | GiToBioseq (TGi gi) const |
Get a CBioseq for a given GI. More... | |
CRef< CBioseq > | PigToBioseq (int pig) const |
Get a CBioseq for a given PIG. More... | |
CRef< CBioseq > | SeqidToBioseq (const CSeq_id &seqid) const |
Get a CBioseq for a given Seq-id. More... | |
void | FindVolumePaths (vector< string > &paths, bool recursive=true) const |
Find volume paths. More... | |
void | SetIterationRange (int oid_begin, int oid_end) |
Set Iteration Range. More... | |
void | GetAliasFileValues (TAliasFileValues &afv) |
Get Name/Value Data From Alias Files. More... | |
CRef< CSeq_data > | GetSeqData (int oid, TSeqPos begin, TSeqPos end) const |
Fetch data as a CSeq_data object. More... | |
void | GetSequenceAsString (int oid, CSeqUtil::ECoding coding, string &output, TSeqRange range=TSeqRange()) const |
Get a sequence in a given encoding. More... | |
void | GetSequenceAsString (int oid, string &output, TSeqRange range=TSeqRange()) const |
Get a sequence in a readable text encoding. More... | |
void | ListColumns (vector< string > &titles) |
List columns titles found in this database. More... | |
int | GetColumnId (const string &title) |
Get an ID number for a given column title. More... | |
const map< string, string > & | GetColumnMetaData (int column_id) |
Get all metadata for the specified column. More... | |
const string & | GetColumnValue (int column_id, const string &key) |
Look up the value for a specific column metadata key. More... | |
const map< string, string > & | GetColumnMetaData (int column_id, const string &volname) |
Get all metadata for the specified column. More... | |
void | GetColumnBlob (int col_id, int oid, CBlastDbBlob &blob) |
Fetch the data blob for the given column and oid. More... | |
void | GetAvailableMaskAlgorithms (vector< int > &algorithms) |
Get a list of algorithm IDs for which mask data exists. More... | |
int | GetMaskAlgorithmId (const string &algo_name) const |
Get the numeric algorithm ID for a string. More... | |
string | GetAvailableMaskAlgorithmDescriptions () |
Returns a formatted string with the list of available masking algorithms in this database for display purposes (i.e. More... | |
vector< int > | ValidateMaskAlgorithms (const vector< int > &algorithm_ids) |
Validates the algorithm IDs passed to this function, returning a vector of those algorithm IDs not present in this object. More... | |
void | GetMaskAlgorithmDetails (int algorithm_id, objects::EBlast_filter_program &program, string &program_name, string &algo_opts) |
Get information about one type of masking available here. More... | |
void | GetMaskAlgorithmDetails (int algorithm_id, string &program, string &program_name, string &algo_opts) |
void | GetMaskData (int oid, const vector< int > &algo_ids, TSequenceRanges &ranges) |
Get masked ranges of a sequence. More... | |
void | GetMaskData (int oid, int algo_id, TSequenceRanges &ranges) |
Get masked ranges of a sequence. More... | |
void | SetOffsetRanges (int oid, const TRangeList &offset_ranges, bool append_ranges, bool cache_data) |
Apply a range of offsets to a database sequence. More... | |
void | RemoveOffsetRanges (int oid) |
Remove any offset ranges for the given OID. More... | |
void | FlushOffsetRangeCache () |
Flush all offset ranges cached. More... | |
void | SetNumberOfThreads (int num_threads, bool force_mt=false) |
Setting the number of threads. More... | |
Int8 | GetDiskUsage () const |
Retrieve the disk usage in bytes for this BLAST database. More... | |
void | SetVolsMemBit (int mbit) |
Set the membership of all volumes. More... | |
void | DebugDump (CDebugDumpContext ddc, unsigned int depth) const |
Dump debug information for this object. More... | |
EBlastDbVersion | GetBlastDbVersion () const |
Return blast db version. More... | |
void | TaxIdsToOids (set< TTaxId > &tax_ids, vector< blastdb::TOid > &rv) const |
Get Oid list for input tax ids. More... | |
void | GetDBTaxIds (set< TTaxId > &tax_ids) const |
Get all unique tax ids from db. More... | |
void | GetTaxIdsForOids (const vector< blastdb::TOid > &oids, set< TTaxId > &tax_ids) const |
CRef< CBlast_db_metadata > | GetDBMetaData (string user_path=kEmptyStr) |
void | GetTaxIdsForAccession (const string &accs, vector< TTaxId > &taxids) |
Get all tax ids for an accessions. More... | |
void | GetTaxIdsForSeqId (const CSeq_id &seq_id, vector< TTaxId > &taxids) |
Get all tax ids for a seq id. More... | |
Public Member Functions inherited from CObject | |
CObject (void) | |
Constructor. More... | |
CObject (const CObject &src) | |
Copy constructor. More... | |
virtual | ~CObject (void) |
Destructor. More... | |
CObject & | operator= (const CObject &src) THROWS_NONE |
Assignment operator. More... | |
bool | CanBeDeleted (void) const THROWS_NONE |
Check if object can be deleted. More... | |
bool | IsAllocatedInPool (void) const THROWS_NONE |
Check if object is allocated in memory pool (not system heap) More... | |
bool | Referenced (void) const THROWS_NONE |
Check if object is referenced. More... | |
bool | ReferencedOnlyOnce (void) const THROWS_NONE |
Check if object is referenced only once. More... | |
void | AddReference (void) const |
Add reference to object. More... | |
void | RemoveReference (void) const |
Remove reference to object. More... | |
void | ReleaseReference (void) const |
Remove reference without deleting object. More... | |
virtual void | DoNotDeleteThisObject (void) |
Mark this object as not allocated in heap – do not delete this object. More... | |
virtual void | DoDeleteThisObject (void) |
Mark this object as allocated in heap – object can be deleted. More... | |
void * | operator new (size_t size) |
Define new operator for memory allocation. More... | |
void * | operator new[] (size_t size) |
Define new[] operator for 'array' memory allocation. More... | |
void | operator delete (void *ptr) |
Define delete operator for memory deallocation. More... | |
void | operator delete[] (void *ptr) |
Define delete[] operator for memory deallocation. More... | |
void * | operator new (size_t size, void *place) |
Define new operator. More... | |
void | operator delete (void *ptr, void *place) |
Define delete operator. More... | |
void * | operator new (size_t size, CObjectMemoryPool *place) |
Define new operator using memory pool. More... | |
void | operator delete (void *ptr, CObjectMemoryPool *place) |
Define delete operator. More... | |
Public Member Functions inherited from CDebugDumpable | |
CDebugDumpable (void) | |
virtual | ~CDebugDumpable (void) |
void | DebugDumpText (ostream &out, const string &bundle, unsigned int depth) const |
void | DebugDumpFormat (CDebugDumpFormatter &ddf, const string &bundle, unsigned int depth) const |
void | DumpToConsole (void) const |
Static Public Member Functions | |
static string | ESeqType2String (ESeqType type) |
Converts a CSeqDB sequence type into a human readable string. More... | |
static string | GenerateSearchPath () |
Returns the default BLAST database search path configured for this local installation of BLAST. More... | |
static CRef< CBlast_def_line_set > | ExtractBlastDefline (const CBioseq &bioseq) |
Extract a Blast-def-line-set object from a Bioseq retrieved by CSeqDB. More... | |
static CRef< CBlast_def_line_set > | ExtractBlastDefline (const CBioseq_Handle &handle) |
Extract a Blast-def-line-set object from a Bioseq_Handle retrieved by CSeqDB. More... | |
static CTime | GetDate (const string &dbname, ESeqType seqtype) |
Returns the construction date of the database. More... | |
static void | FindVolumePaths (const string &dbname, ESeqType seqtype, vector< string > &paths, vector< string > *alias_paths=NULL, bool recursive=true, bool expand_links=true) |
Find volume paths. More... | |
static void | GetTaxInfo (TTaxId taxid, SSeqDBTaxInfo &info) |
Get taxonomy information. More... | |
Static Public Member Functions inherited from CObject | |
static NCBI_XNCBI_EXPORT void | ThrowNullPointerException (void) |
Define method to throw null pointer exception. More... | |
static NCBI_XNCBI_EXPORT void | ThrowNullPointerException (const type_info &type) |
static EAllocFillMode | GetAllocFillMode (void) |
static void | SetAllocFillMode (EAllocFillMode mode) |
static void | SetAllocFillMode (const string &value) |
Set mode from configuration parameter value. More... | |
Static Public Member Functions inherited from CDebugDumpable | |
static void | EnableDebugDump (bool on) |
Static Public Attributes | |
static const string | kOidNotFound |
String containing the error message in exceptions thrown when a given OID cannot be found. More... | |
static const char * | kBlastDbDateFormat = "b d, Y H:m P" |
Format string for the date returned by CSeqDB::GetDate. More... | |
Static Public Attributes inherited from CObject | |
static const TCount | eCounterBitsCanBeDeleted = 1 << 0 |
Define possible object states. More... | |
static const TCount | eCounterBitsInPlainHeap = 1 << 1 |
Heap signature was found. More... | |
static const TCount | eCounterBitsPlaceMask |
Mask for 'in heap' state flags. More... | |
static const int | eCounterStep = 1 << 2 |
Skip over the "in heap" bits. More... | |
static const TCount | eCounterValid = TCount(1) << (sizeof(TCount) * 8 - 2) |
Minimal value for valid objects (reference counter is zero). Must be a single bit value. More... | |
static const TCount | eCounterStateMask |
Valid object, and object in heap. More... | |
Protected Member Functions | |
CSeqDB () | |
No-argument Constructor. More... | |
void | x_GetDBFilesMetaData (Int8 &disk_bytes, Int8 &cached_bytes, vector< string > &db_files, const string &user_path) const |
Protected Member Functions inherited from CObject | |
virtual void | DeleteThis (void) |
Virtual method "deleting" this object. More... | |
Protected Attributes | |
class CSeqDBImpl * | m_Impl |
Implementation details are hidden. (See seqdbimpl.hpp). More... | |
User interface class for blast databases.
This class provides the top-level interface class for BLAST database users. It defines access to the database component by calling methods on objects which represent the various database files, such as the index, header, sequence, and alias files.
typedef TGi CSeqDB::TGI |
typedef int CSeqDB::TOID |
typedef int CSeqDB::TPIG |
typedef set< pair<int, int> > CSeqDB::TRangeList |
enum CSeqDB::EOidListType |
enum CSeqDB::ESeqType |
enum CSeqDB::ESummaryType |
CSeqDB::CSeqDB | ( | const string & | dbname, |
ESeqType | seqtype, | ||
CSeqDBGiList * | gilist = 0 , |
||
bool | use_atlas_lock = true |
||
) |
Short Constructor.
This version of the constructor assumes memory mapping and that the entire possible OID range will be included. Please use quotes ("") around database names that contain space characters.
dbname | A list of database or alias names, separated by spaces |
seqtype | Specify eProtein, eNucleotide, or eUnknown. |
gilist | The database will be filtered by this GI list if non-null. |
use_atlas_lock | Enable/disable thread synchronization. If true, a single Atlas mutex will be used to protect most of the critical parts of the code. If false, the CSeqDBAtlas::Lock and CSeqDBAtlas::Unlock functions will be no-ops. If each thread accesses a different database volume, then setting this parameter to false will reduce contention. Otherwise it should be set to true. |
Definition at line 155 of file seqdb.cpp.
References dbname(), m_Impl, NCBI_THROW, s_GetSeqTypeChar(), and s_SeqDBInit().
CSeqDB::CSeqDB | ( | const string & | dbname, |
ESeqType | seqtype, | ||
CSeqDBNegativeList * | nlist | ||
) |
Short Constructor with Negative ID list.
This version of the constructor assumes the entire OID range will be included, and applies filtering by a negative ID list. Please use quotes ("") around database names that contain space characters.
dbname | A list of database or alias names, separated by spaces |
seqtype | Specify eProtein, eNucleotide, or eUnknown. |
nlist | The database will be filtered to not include these GIs or TIs. |
Definition at line 179 of file seqdb.cpp.
References dbname(), m_Impl, NCBI_THROW, NULL, s_GetSeqTypeChar(), and s_SeqDBInit().
CSeqDB::CSeqDB | ( | const string & | dbname, |
ESeqType | seqtype, | ||
CSeqDBGiList * | gilist, | ||
CSeqDBNegativeList * | nlist | ||
) |
Short Constructor with Positive and Negative ID list.
This version of the constructor assumes the entire OID range will be included, and applies filtering by a negative ID list. Please use quotes ("") around database names that contain space characters.
dbname | A list of database or alias names, separated by spaces |
seqtype | Specify eProtein, eNucleotide, or eUnknown. |
nlist | The database will be filtered to not include these GIs or TIs. |
Definition at line 201 of file seqdb.cpp.
References dbname(), m_Impl, NCBI_THROW, s_GetSeqTypeChar(), and s_SeqDBInit().
CSeqDB::CSeqDB | ( | const string & | dbname, |
ESeqType | seqtype, | ||
int | oid_begin, | ||
int | oid_end, | ||
CSeqDBGiList * | gilist, | ||
CSeqDBNegativeList * | nlist | ||
) |
Short Constructor with Positive and Negative ID list with oid range.
This version of the constructor assumes the entire OID range will be included, and applies filtering by a negative ID list. Please use quotes ("") around database names that contain space characters.
dbname | A list of database or alias names, separated by spaces |
seqtype | Specify eProtein, eNucleotide, or eUnknown. |
nlist | The database will be filtered to not include these GIs or TIs. |
Definition at line 225 of file seqdb.cpp.
References dbname(), m_Impl, NCBI_THROW, s_GetSeqTypeChar(), and s_SeqDBInit().
CSeqDB::CSeqDB | ( | const string & | dbname, |
ESeqType | seqtype, | ||
CSeqDBIdSet | ids | ||
) |
Short Constructor with Computed ID list.
This version of the constructor takes a computed CSeqDBIdSet list which can be positive or negative. This is equivalent to building a positive or negative list from the IdSet object and passing it into one of the previous constructors.
dbname | A list of database or alias names, separated by spaces |
seqtype | Specify eProtein, eNucleotide, or eUnknown. |
ids | The database will be filtered by this set of IDs. |
Definition at line 287 of file seqdb.cpp.
References CSeqDBIdSet::Blank(), dbname(), CSeqDBIdSet::GetNegativeList(), CRef< C, Locker >::GetPointerOrNull(), CSeqDBIdSet::GetPositiveList(), CSeqDBIdSet::IsPositive(), m_Impl, NCBI_THROW, s_GetSeqTypeChar(), and s_SeqDBInit().
CSeqDB::CSeqDB | ( | const vector< string > & | dbs, |
ESeqType | seqtype, | ||
CSeqDBGiList * | gilist = 0 |
||
) |
Short Constructor.
This version of the constructor assumes memory mapping and that the entire possible OID range will be included.
dbs | A list of database or alias names. |
seqtype | Specify eProtein, eNucleotide, or eUnknown. |
gilist | The database will be filtered by this GI list if non-null. |
Definition at line 319 of file seqdb.cpp.
References dbname(), m_Impl, NCBI_THROW, s_GetSeqTypeChar(), s_SeqDBInit(), and SeqDB_CombineAndQuote().
CSeqDB::CSeqDB | ( | const string & | dbname, |
ESeqType | seqtype, | ||
int | oid_begin, | ||
int | oid_end, | ||
bool | use_mmap, | ||
CSeqDBGiList * | gi_list = 0 |
||
) |
Constructor with MMap Flag and OID Range.
If the oid_end value is specified as zero, or as a value larger than the number of OIDs, it will be adjusted to the number of OIDs in the database. Specifying 0,0 for the start and end will cause inclusion of the entire database. This version of the constructor is obsolete because the sequence type is specified as a character (eventually only the ESeqType version will exist). Please use quotes ("") around database names that contain space characters.
dbname | A list of database or alias names, separated by spaces. |
seqtype | Specify eProtein, eNucleotide, or eUnknown. |
oid_begin | Iterator will skip OIDs less than this value. Only OIDs found in the OID lists (if any) will be returned. |
oid_end | Iterator will return up to (but not including) this OID. |
use_mmap | If kSeqDBMMap is specified (the default), memory mapping is attempted. If kSeqDBNoMMap is specified, or memory mapping fails, or this platform does not support it, the less efficient read and write calls are used instead. |
gi_list | The database will be filtered by this GI list if non-null. |
Definition at line 343 of file seqdb.cpp.
References dbname(), m_Impl, NCBI_THROW, s_GetSeqTypeChar(), and s_SeqDBInit().
CSeqDB::CSeqDB | ( | const vector< string > & | dbname, |
ESeqType | seqtype, | ||
int | oid_begin, | ||
int | oid_end, | ||
bool | use_mmap, | ||
CSeqDBGiList * | gi_list = 0 |
||
) |
Constructor with MMap Flag and OID Range.
If the oid_end value is specified as zero, or as a value larger than the number of OIDs, it will be adjusted to the number of OIDs in the database. Specifying 0,0 for the start and end will cause inclusion of the entire database. This version of the constructor is obsolete because the sequence type is specified as a character (eventually only the ESeqType version will exist).
dbname | A list of database or alias names. |
seqtype | Specify eProtein, eNucleotide, or eUnknown. |
oid_begin | Iterator will skip OIDs less than this value. Only OIDs found in the OID lists (if any) will be returned. |
oid_end | Iterator will return up to (but not including) this OID. |
use_mmap | If kSeqDBMMap is specified (the default), memory mapping is attempted. If kSeqDBNoMMap is specified, or memory mapping fails, or this platform does not support it, the less efficient read and write calls are used instead. |
gi_list | The database will be filtered by this GI list if non-null. |
Definition at line 367 of file seqdb.cpp.
References dbname(), m_Impl, NCBI_THROW, s_GetSeqTypeChar(), s_SeqDBInit(), and SeqDB_CombineAndQuote().
CSeqDB::~CSeqDB | ( | ) |
Destructor.
This will return resources acquired by this object, including any gotten by the GetSequence() call, whether or not they have been returned by RetSequence().
Definition at line 715 of file seqdb.cpp.
References m_Impl.
|
protected |
No-argument Constructor.
This version of the constructor is used as an extension by the 'expert' interface in seqdbexpert.hpp.
Definition at line 394 of file seqdb.cpp.
References m_Impl.
void CSeqDB::AccessionsToOids | ( | const vector< string > & | accs, |
vector< blastdb::TOid > & | oids | ||
) | const |
Definition at line 252 of file seqdb.cpp.
References CSeqDBImpl::AccessionsToOids(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), s_TestReadPDBAsn1(), WriteBlastSeqidlistFile(), CBlastDBCmdApp::x_ProcessBatchEntry_NoDup(), and CMakeClusterDBApp::x_ProcessInputData().
Translate an Accession to a list of OIDs.
Definition at line 870 of file seqdb.cpp.
References CSeqDBImpl::AccessionToOids(), NStr::fConvErr_NoThrow, CSeqDBImpl::GiToOidwFilterCheck(), m_Impl, and ZERO_GI.
Referenced by BOOST_AUTO_TEST_CASE(), CSeqDBTestThread::Main(), s_CheckIdLookup(), CBlastDBExtractor::SetSeqId(), CBlastDBCmdApp::x_GetOids(), and CBuildDatabase::x_ResolveFromSource().
CSeqDBIter CSeqDB::Begin | ( | void | ) | const |
Returns a sequence iterator.
This gets an iterator designed to allow traversal of the database from beginning to end.
Definition at line 723 of file seqdb.cpp.
Referenced by BOOST_AUTO_TEST_CASE().
Find an included OID, incrementing next_oid if necessary.
If the specified OID is not included in the set (i.e. the OID mask), the input parameter is incremented until one is found that is. The user will probably want to increment between calls, if iterating over the db.
Definition at line 728 of file seqdb.cpp.
References CSeqDBImpl::CheckOrFindOID(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CRawSeqDBSource::CRawSeqDBSource(), CSeqDBIter::CSeqDBIter(), CStrideTest::DoTest(), CSampleTest::DoTest(), CEndsTest::DoTest(), CBlastDB_SeqFormatter::DumpAll(), CBlastDB_FastaFormatter::DumpAll(), CBlastDB_BioseqFormatter::DumpAll(), CSeqFormatter::DumpAll(), CLBLASTObjectLoader::Execute(), CBlastDbAllBioseqSource::GetNext(), CRawSeqDBSource::GetNext(), CClusterDBSource::GetNext(), CBlastSequenceSource::GetNext(), CSeqDBIter::operator++(), CSeqDBIter::operator=(), CSeqDBDemo_SimpleIteration::Run(), s_DbHasOID(), s_MapAllGis(), s_TestDatabase(), s_TestReadPDBAsn1(), SDbSumInfo::SDbSumInfo(), CSearch< LEGACY, NHITS >::Search(), CElementaryMatching::x_CreateIndex(), CElementaryMatching::x_CreateRemapData(), CBuildDatabase::x_DupLocal(), CElementaryMatching::x_InitFilteringVector(), CElementaryMatching::x_LoadRemapData(), and CMakeBlastDBApp::x_ProcessInputData().
|
virtual |
Dump debug information for this object.
Reimplemented from CObject.
Definition at line 1597 of file seqdb.cpp.
References CObject::DebugDump(), depth, CDebugDumpContext::Log(), m_Impl, and CDebugDumpContext::SetFrame().
Converts a CSeqDB sequence type into a human readable string.
Definition at line 1328 of file seqdb.cpp.
References eNucleotide, eProtein, and eUnknown.
Referenced by CBlastDbMetadata::GetMoleculeType().
|
static |
Extract a Blast-def-line-set object from a Bioseq retrieved by CSeqDB.
bioseq | Bioseq retrieved from CSeqDB [in] |
Definition at line 1247 of file seqdbvol.cpp.
References s_ExtractBlastDefline().
Referenced by BOOST_AUTO_TEST_CASE(), CBlastDbBioseqSource::CBlastDbBioseqSource(), CShowBlastDefline::GetBioseqHandleDeflineAndId(), CAlignFormatUtil::GetTaxidForSeqid(), s_ExtractSeqIds(), s_ModifySeqAlnWithFilteredSeqIDs(), s_SeqAlignToXMLHit(), s_SetIdList(), CBlastTabularInfo::SetFields(), CShowBlastDefline::x_CheckForStructureLink(), CShowBlastDefline::x_FillDeflineAndId(), CBlastDBExtractor::x_InitDefline(), CDisplaySeqalign::x_InitDefLinesHeader(), CTaxFormat::x_InitTaxInfoMap(), and CDisplaySeqalign::x_PrintDefLine().
|
static |
Extract a Blast-def-line-set object from a Bioseq_Handle retrieved by CSeqDB.
handle | Bioseq_Handle retrieved from CSeqDB [in] |
Definition at line 1243 of file seqdbvol.cpp.
References s_ExtractBlastDefline().
|
static |
Find volume paths.
Find the base names of all volumes (and alias nodes). This method builds an alias hierarchy (which should be much faster than constructing an entire CSeqDB object), and returns the resolved volume/alias file base names from that hierarchy.
dbname | The input name of the database |
seqtype | Specify eProtein, eNucleotide, or eUnknown. |
paths | The set of resolved database volume file names |
alias_paths | The set of resolved database alias file names |
recursive | If true, the search will traverse the full alias node tree |
expand_links | If true, the search will expand the soft links |
Definition at line 1040 of file seqdb.cpp.
References dbname(), eNucleotide, eProtein, and CSeqDBImpl::FindVolumePaths().
Referenced by BlastKmerVerifyIndex(), BOOST_AUTO_TEST_CASE(), CBlastKmerBuildIndex::Build(), CBlastKmer::CBlastKmer(), CheckForFreqRatioFile(), CWriteDB_CreateOidMaskDB(), DeleteBlastDb(), CMetaDataTest::DoTest(), CIndexedDb_New::EnumerateDbVolumes(), GetDate(), GetDiskUsage(), CProfileData::Load(), CMkIndexApplication::Run(), CBlastdbConvertApp::Run(), CBlastDBCmdApp::Run(), CSeqDBDemo_Threaded::Run(), WriteBlastSeqidlistFile(), CDbTest::x_GetVolumeList(), CDirTest::x_GetVolumeList(), CBlastRPSInfo::x_Init(), CBlastDBCmdApp::x_PrintBlastDatabaseInformation(), CMakeBlastDBApp::x_ProcessInputData(), BlastdbCopyApplication::x_ShouldCopyPIGs(), and BlastdbCopyApplication::x_ShouldParseSeqIds().
Find volume paths.
Find the base names of all volumes. This method returns the resolved base names of all referenced blast database volumes.
paths | The returned set of resolved database path names |
recursive | If true, the search will traverse the full alias node tree |
Definition at line 1062 of file seqdb.cpp.
References CSeqDBImpl::FindVolumePaths(), and m_Impl.
void CSeqDB::FlushOffsetRangeCache | ( | ) |
Flush all offset ranges cached.
Definition at line 1316 of file seqdb.cpp.
References CSeqDBImpl::FlushOffsetRangeCache(), and m_Impl.
Referenced by s_SeqDbResetChunkIterator().
|
static |
Returns the default BLAST database search path configured for this local installation of BLAST.
Definition at line 1340 of file seqdb.cpp.
References CSeqDBAtlas::GenerateSearchPath().
void CSeqDB::GetAliasFileValues | ( | TAliasFileValues & | afv | ) |
Get Name/Value Data From Alias Files.
SeqDB treats each alias file as a map from a variable name to a value. This method will return a map from the basename of the filename of each alias file, to a vector of maps from variable name to value for each entry in that file. For example, the value of the "DBLIST" entry in the "wgs.nal" file would be values["wgs"][0]["DBLIST"]. The lines returned have been processed somewhat by SeqDB, including normalizing tabs to whitespace, trimming leading and trailing whitespace, and removal of comments and other non-value lines. Care should be taken when using the values returned by this method. SeqDB uses an internal "virtual" alias file entry, which maps from a filename of "-" and contains a single entry mapping "DBLIST" to SeqDB's database name input. This entry is the root of the alias file inclusion tree. Also note that alias files that appear in several places in the alias file inclusion tree may be different – SeqDB's internal editing distributes GI lists over sub-alias files, which is why the value type of the returned data is a vector.
afv | The alias file contents will be returned here. |
Definition at line 1098 of file seqdb.cpp.
References CSeqDBImpl::GetAliasFileValues(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE().
Get all tax ids for an oid.
This includes leaf and non-leaf tax ids associated with the oid
oid | The ordinal id of the sequence. |
taxids | A returned set of taxids. |
Definition at line 467 of file seqdb.cpp.
References CSeqDBImpl::GetAllTaxIDs(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), and CBlastdbConvertApp::Run().
int CSeqDB::GetAmbigPartialSeq | ( | int | oid, |
char ** | buffer, | ||
int | nucl_code, | ||
ESeqDBAllocType | strategy, | ||
TSequenceRanges * | partial_ranges, | ||
TSequenceRanges * | masks = NULL |
||
) | const |
Definition at line 612 of file seqdb.cpp.
References buffer, eMalloc, eNew, CSeqDBImpl::GetAmbigPartialSeq(), m_Impl, and NCBI_THROW.
Referenced by s_SeqDbGetSequence().
Get a pointer to sequence data with ambiguities.
In the protein case, this is identical to GetSequence(). In the nucleotide case, it stores 2 bases per byte instead of 4. The third parameter indicates the encoding for nucleotide data, either kSeqDBNuclNcbiNA8 or kSeqDBNuclBlastNA8, ignored if the sequence is a protein sequence. When done, resources should be returned with RetSequence.
oid | The ordinal id of the sequence. |
buffer | A returned pointer to the data in the sequence. |
nucl_code | The encoding to use for the returned sequence data. |
Definition at line 550 of file seqdb.cpp.
References buffer, CSeqDBImpl::GetAmbigSeq(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CLocalBlastDbAdapter::GetSequence(), GetSequenceAsString(), s_TestPartialAmbigRange(), and CSeqDBDemo_Thread::x_UseOID().
int CSeqDB::GetAmbigSeq | ( | int | oid, |
const char ** | buffer, | ||
int | nucl_code, | ||
int | begin_offset, | ||
int | end_offset | ||
) | const |
Get a pointer to a range of sequence data with ambiguities.
This is like GetAmbigSeq(), but only a range of the sequence data is computed and returned. When done, resources should be returned with RetSequence.
oid | The ordinal id of the sequence. |
buffer | A returned pointer to the data in the sequence. |
nucl_code | The encoding to use for the returned sequence data. |
begin_offset | The zero-based offset at which to start translating. |
end_offset | The zero-based offset at which to end translation. |
Definition at line 570 of file seqdb.cpp.
References buffer, CSeqDBImpl::GetAmbigSeq(), and m_Impl.
int CSeqDB::GetAmbigSeqAlloc | ( | int | oid, |
char ** | buffer, | ||
int | nucl_code, | ||
ESeqDBAllocType | strategy, | ||
TSequenceRanges * | masks = NULL |
||
) | const |
Get a pointer to sequence data with ambiguities.
This is like GetAmbigSeq(), but the allocated object should be deleted by the caller. This is intended for users who are going to modify the sequence data, or are going to mix the data into a container with other data, and who are mixing data from multiple sources and want to free the data in the same way. The fourth parameter should be given one of the values from ESeqDBAllocType; the corresponding method should be used to delete the object. Note that "delete[]" should be used instead of "delete".
oid | Ordinal ID. |
buffer | Address of a char pointer to access the sequence data. |
nucl_code | The NA encoding, kSeqDBNuclNcbiNA8 or kSeqDBNuclBlastNA8. |
strategy | Indicate which allocation strategy to use. |
masks | If not empty, the return sequence will be (hard) masked. Masks are cleared on return. |
Definition at line 591 of file seqdb.cpp.
References buffer, eMalloc, eNew, CSeqDBImpl::GetAmbigSeq(), m_Impl, and NCBI_THROW.
Referenced by BOOST_AUTO_TEST_CASE(), and s_SeqDbGetSequence().
string CSeqDB::GetAvailableMaskAlgorithmDescriptions | ( | ) |
Returns a formatted string with the list of available masking algorithms in this database for display purposes (i.e., help).
Definition at line 1237 of file seqdb.cpp.
References CSeqDBImpl::GetAvailableMaskAlgorithmDescriptions(), and m_Impl.
Referenced by CSequenceIStreamBlastDB::CSequenceIStreamBlastDB(), CSequenceIStreamBlastDB::ShowSupportedFilters(), and CBlastDBCmdApp::x_PrintBlastDatabaseInformation().
void CSeqDB::GetAvailableMaskAlgorithms | ( | vector< int > & | algorithms | ) |
Get a list of algorithm IDs for which mask data exists.
Multiple sources of masking data may be used when building blast databases. This method retrieves a list of the IDs used to identify those types of filtering data to SeqDB. If the blast database volumes used by this instance of SeqDB were built with conflicting algorithm ID definitions, SeqDB will resolve the conflicts by renumbering some of the conflicting descriptions. For this reason, the IDs reported here may not match what was given to WriteDB when the database was created.
algorithms | List of algorithm ids. [out] |
Definition at line 1227 of file seqdb.cpp.
References CSeqDBImpl::GetAvailableMaskAlgorithms(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CClusterDBSource::CClusterDBSource(), CRawSeqDBSource::CRawSeqDBSource(), s_SeqDbSrcNew(), ValidateMaskAlgorithms(), and CSearchDatabase::x_ValidateMaskingAlgorithm().
CRef< CBioseq > CSeqDB::GetBioseq | ( | int | oid, |
TGi | target_gi = ZERO_GI , |
||
const CSeq_id * | target_seq_id = NULL |
||
) | const |
Get a CBioseq for a sequence.
This builds and returns the header and sequence data corresponding to the indicated sequence as a CBioseq. If target_gi is non-zero or target_seq_id is non-null, the header information will be filtered to only include the defline associated with that gi/seq_id.
oid | The ordinal id of the sequence. |
target_gi | If nonzero, the target gi to filter the header information by. |
target_seq_id | The target seq_id to filter the header information by. |
Definition at line 504 of file seqdb.cpp.
References CSeqDBImpl::GetBioseq(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CBlastDbBioseqSource::CBlastDbBioseqSource(), CSeqFormatter::DumpAll(), CBlastDbAllBioseqSource::GetNext(), CMaskBDBReader::GetNextSequence(), CSequenceIStreamBlastDB::next(), s_CheckIdLookup(), s_DupIdsBioseq(), s_GetSequencesIntoScope(), s_TestDatabase(), CBlastDBExtractor::SetSeqId(), CTestAction::TestOID(), CBlastDB_FastaFormatter::Write(), CBlastDB_BioseqFormatter::Write(), and CSearch< LEGACY, NHITS >::WriteBioseqs().
CRef< CBioseq > CSeqDB::GetBioseqNoData | ( | int | oid, |
TGi | target_gi = ZERO_GI , |
||
const CSeq_id * | target_seq_id = NULL |
||
) | const |
Get a CBioseq for a sequence without sequence data.
This builds and returns the data corresponding to the indicated sequence as a CBioseq, but without the sequence data. It is used when processing large sequences, to avoid accessing unused parts of the sequence.
oid | The ordinal id of the sequence. |
target_gi | If nonzero, the target gi to filter the header information by. |
target_seq_id | The target seq_id to filter the header information by. |
Definition at line 514 of file seqdb.cpp.
References CSeqDBImpl::GetBioseq(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CLocalBlastDbAdapter::GetBioseqNoData(), and CBlastDBExtractor::SetSeqId().
EBlastDbVersion CSeqDB::GetBlastDbVersion | ( | ) | const |
Return blast db version.
Definition at line 1604 of file seqdb.cpp.
References CSeqDBImpl::GetBlastDbVersion(), and m_Impl.
Referenced by GetDBMetaData(), GetDiskUsage(), CBlastDbMetadata::GetVersion(), CBlastdbConvertApp::Run(), WriteBlastSeqidlistFile(), BlastdbCopyApplication::x_CopyDB(), x_GetDBFilesMetaData(), CBlastDBCmdApp::x_GetOids(), BlastdbCopyApplication::x_MakeDBwIDList(), CBlastDBCmdApp::x_PrintBlastDatabaseInformation(), and CBlastDBCmdApp::x_ProcessBatchEntry_NoDup().
void CSeqDB::GetColumnBlob | ( | int | col_id, |
int | oid, | ||
CBlastDbBlob & | blob | ||
) |
Fetch the data blob for the given column and oid.
col_id | The column to fetch data from. [in] |
oid | The OID of the blob. [in] |
blob | The data will be returned here. [out] |
Definition at line 1220 of file seqdb.cpp.
References CSeqDBImpl::GetColumnBlob(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CRawSeqDBSource::GetNext(), and CClusterDBSource::GetNext().
Get an ID number for a given column title.
For a given column title, this returns an ID that can be used to access that column in the future. The returned ID number is specific to this instance of SeqDB. If the database does not have a column with this name, -1 will be returned.
title | Column title to search for. [in] |
Definition at line 1196 of file seqdb.cpp.
References CSeqDBImpl::GetColumnId(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CClusterDBSource::CClusterDBSource(), CRawSeqDBSource::CRawSeqDBSource(), CRawSeqDBSource::GetColumnId(), and CClusterDBSource::GetColumnId().
Get all metadata for the specified column.
Columns may contain user-defined metadata as a list of key-value pairs. For the specified column, this returns that column's metadata in the provided map. If multiple volumes are present, and they define contradictory metadata (this is more common when multiple databases are opened at once), this method returns the first value it finds for each metadata key. If this is unsatisfactory, the two-argument version of this method may be used to get more precise values for specific volumes.
column_id | The column id from GetColumnId. [in] |
Definition at line 1202 of file seqdb.cpp.
References CSeqDBImpl::GetColumnMetaData(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CRawSeqDBSource::GetColumnMetaData(), CClusterDBSource::GetColumnMetaData(), and GetColumnValue().
Get all metadata for the specified column.
Columns may contain user-defined metadata as a list of key-value pairs. For the specified database volume and column id, this returns that column's metadata (as defined for that volume) in the provided map. The volume name should match the string returned by FindVolumePaths(vector<string>&).
column_id | The column id from GetColumnId. [in] |
volname | The volume to get metadata for. [in] |
Definition at line 1214 of file seqdb.cpp.
References CSeqDBImpl::GetColumnMetaData(), and m_Impl.
Look up the value for a specific column metadata key.
Columns can contain user-defined metadata as a list of key-value pairs. For the specified column, this returns the value associated with one particular key.
column_id | The column id from GetColumnId. [in] |
Definition at line 1207 of file seqdb.cpp.
References GetColumnMetaData(), ncbi::grid::netcache::search::fields::key, and SeqDB_MapFind().
Referenced by BOOST_AUTO_TEST_CASE().
string CSeqDB::GetDate | ( | void | ) | const |
Returns the construction date of the database.
This is encoded in the database. If multiple databases or multiple volumes were accessed, the latest date will be used.
Definition at line 635 of file seqdb.cpp.
References CSeqDBImpl::GetDate(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CMetaDataTest::DoTest(), CBlastDbMetadata::GetDate(), GetDBMetaData(), LogCmdOptions(), s_FillDbInfoLocally(), CBuildDatabase::SetSourceDb(), WriteBlastSeqidlistFile(), CBlastDBCmdApp::x_AddCmdOptions(), and CBlastDBCmdApp::x_PrintBlastDatabaseInformation().
Returns the construction date of the database.
dbname | The database name. |
seqtype | The type of database (nucleotide or protein) |
Definition at line 641 of file seqdb.cpp.
References dbname(), eProtein, f, FindVolumePaths(), in(), CTime::IsEmpty(), ITERATE, offset, and SeqDB_GetStdOrd().
CRef< CBlast_db_metadata > CSeqDB::GetDBMetaData | ( | string | user_path = kEmptyStr | ) |
Definition at line 1705 of file seqdb.cpp.
References CTime::AsString(), set< Key, Compare >::begin(), eBDB_Version5, eFilteredAll, CTimeFormat::eISO8601_DateTimeSec, eProtein, NStr::fSplit_Tokenize, GetBlastDbVersion(), GetDate(), GetDBNameList(), GetDBTaxIds(), CSeqDBImpl::GetNumOfVols(), CDirEntry::GetPathSeparator(), CTimeFormat::GetPredefined(), GetSequenceType(), GetTitle(), GetTotals(), NStr::Join(), m_Impl, NON_CONST_ITERATE, set< Key, Compare >::size(), NStr::Split(), and x_GetDBFilesMetaData().
Referenced by CMakeBlastDBApp::x_BuildDatabase(), and CMakeClusterDBApp::x_BuildDatabase().
Get list of database names.
This returns the database name list used at construction.
Definition at line 760 of file seqdb.cpp.
References CSeqDBImpl::GetDBNameList(), and m_Impl.
Referenced by CBuildDatabase::AddIds(), BOOST_AUTO_TEST_CASE(), CBlastKmerBuildIndex::Build(), GetDBMetaData(), GetDiskUsage(), CTestAction::Log(), LogCmdOptions(), s_FillDbInfoLocally(), s_SeqDbGetName(), CBlastDbDataLoader::SBlastDbParam::SBlastDbParam(), CBuildDatabase::SetSourceDb(), CBlastDBCmdApp::x_AddCmdOptions(), CCddHeadersTest::x_GetHeader(), CBlastScopeSource::x_InitBlastDatabaseDataLoader(), BlastdbCopyApplication::x_MakeDBwIDList(), CBlastKmer::x_SearchMultipleQueries(), CCddHeadersTest::x_TestFreqRatios(), CCddDeltaHeadersTest::x_TestObservations(), and CCddDeltaHeadersTest::x_TestWeightedCounts().
Get all unique tax ids from db.
tax_ids | return taxonomy ids in db |
Definition at line 262 of file seqdb.cpp.
References CSeqDBImpl::GetDBTaxIds(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), GetDBMetaData(), and CBlastDBCmdApp::x_PrintBlastDatabaseTaxInformation().
Int8 CSeqDB::GetDiskUsage | ( | ) | const |
Retrieve the disk usage in bytes for this BLAST database.
Definition at line 1464 of file seqdb.cpp.
References _ASSERT, dbname(), eProtein, ERR_POST, Error(), file, FindVolumePaths(), NStr::fSplit_Tokenize, GetBlastDbVersion(), GetDBNameList(), GetSequenceType(), ITERATE, LOG_POST, CDirEntry::MakePath(), NCBI_THROW, SeqDB_GetFileExtensions(), NStr::Split(), and Trace().
Referenced by BOOST_AUTO_TEST_CASE(), and CBlastDbMetadata::GetDiskUsage().
Uint8 CSeqDB::GetExactTotalLength | ( | ) |
Returns the exact sum of the lengths of all available sequences.
Calling this function may trigger a complete db scan if the total length of a db cannot be determined without iterating through the sequences, e.g. a db with a GI list.
Definition at line 690 of file seqdb.cpp.
References CSeqDBImpl::GetExactTotalLength(), and m_Impl.
Referenced by CBlastDBCmdApp::x_PrintBlastDatabaseInformation().
const CSeqDBGiList * CSeqDB::GetGiList | ( | ) | const |
Get GI list attached to this database.
This returns the GI list attached to this database, or NULL, if no GI list was used. The effects of changing the contents of this GI list are undefined. This method only deals with the GI list passed to the top level CSeqDB constructor; it does not consider volume GI lists.
Definition at line 1120 of file seqdb.cpp.
References CSeqDBImpl::GetGiList(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CSeqDbSeqInfoSrc::HasGiList(), and s_GetFilteredRedundantGis().
Gets a list of GIs for an OID.
This returns the GIs associated with the sequence specified by the given OID. If append is true, gis will be appended to the end of the provided vector; otherwise the vector will be emptied first.
oid | The oid of the sequence. |
gis | The returned list of gis. |
append | Specify true to append to gis, keeping existing elements. |
Definition at line 1070 of file seqdb.cpp.
References append(), GetSeqIDs(), and ITERATE.
Referenced by BOOST_AUTO_TEST_CASE(), s_GetAllGis(), and s_MapAllGis().
CRef< CBlast_def_line_set > CSeqDB::GetHdr | ( | int | oid | ) | const |
Get the ASN.1 header for the sequence.
Do not modify the object returned here (e.g. by removing some of the deflines), as the object is cached internally and future operations on this OID may be affected.
oid | The ordinal ID of the sequence. |
Definition at line 418 of file seqdb.cpp.
References CSeqDBImpl::GetHdr(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CSeqAlignFilter::FilterByTaxonomy(), CRawSeqDBSource::GetNext(), CClusterDBSource::GetNext(), s_DupIdsBioseq(), s_DupIdsRaw(), CSearch< LEGACY, NHITS >::SetResult(), CBlastDB_SeqFormatter::Write(), CBuildDatabase::x_DupLocal(), CBlastDBExtractor::x_InitDefline(), and CMakeBlastDBApp::x_ProcessInputData().
CSeqDBIdSet CSeqDB::GetIdSet | ( | ) | const |
Get IdSet list attached to this database.
This returns the ID set used to filter this database. If a CSeqDBGiList or CSeqDBNegativeList was used instead, then an ID set object will be constructed and returned (and cached here). This method only deals with filtering applied to the top level CSeqDB constructor; it does not consider GI or TI lists attached from alias files. If no filtering was used, a 'blank' list will be returned (an empty negative list).
Definition at line 1125 of file seqdb.cpp.
References CSeqDBImpl::GetIdSet(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), and s_SeqDbGetSequence().
void CSeqDB::GetLeafTaxIDs | ( | int | oid, |
map< TGi, set< TTaxId > > & | gi_to_taxid_set, | ||
bool | persist = false |
||
) | const |
Get taxid for an OID.
This finds the leaf-node TAXIDS associated with a given OID and computes a mapping from GI to taxid. This mapping is added to the map<TGi,set<TTaxId>> provided by the user. If the "persist" flag is set to true, the new associations will simply be added to the map. If it is false (the default), the map will be cleared first.
oid | The ordinal id of the sequence. |
gi_to_taxid_set | A returned mapping from GI to set of taxids. |
persist | If false, the map will be cleared before adding new entries. |
Definition at line 473 of file seqdb.cpp.
References CSeqDBImpl::GetLeafTaxIDs(), ITERATE, and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE().
Get taxids for an OID.
This finds the leaf-node TAXIDS associated with a given OID and returns them in a vector. If the "persist" flag is set to true, the new taxids will simply be appended to the vector. If it is false (the default), the vector will be cleared first. One advantage of this interface over the map<int,set<int>> version is that the vector interface works with databases with local IDs but lacking GIs.
oid | The ordinal id of the sequence. |
taxids | A returned vector of taxids. |
persist | If false, the vector will be cleared before adding new entries. |
Definition at line 492 of file seqdb.cpp.
References CSeqDBImpl::GetLeafTaxIDs(), and m_Impl.
void CSeqDB::GetMaskAlgorithmDetails | ( | int | algorithm_id, |
objects::EBlast_filter_program & | program, | ||
string & | program_name, | ||
string & | algo_opts | ||
) |
Get information about one type of masking available here.
For a given algorithm_id, this method fetches information describing the basic algorithm used, as well as options passed to that algorithm to generate the data stored here. Each sequence in the database can provide sequence masking data from one or more sources. There can also be multiple types of masking data from the same algorithm (such as DUST), but generated with different sets of input parameters.
algorithm_id | The ID as from GetAvailableMaskAlgorithms [in] |
program | The filtering program used (DUST, SEG, etc.) [out] |
program_name | string representation of program [out] |
algo_opts | Describes options passed to `program'. [out] |
Definition at line 1263 of file seqdb.cpp.
References NStr::fConvErr_NoThrow, CSeqDBImpl::GetMaskAlgorithmDetails(), m_Impl, and NStr::StringToNumeric().
Referenced by BOOST_AUTO_TEST_CASE(), CClusterDBSource::CClusterDBSource(), CRawSeqDBSource::CRawSeqDBSource(), and s_FillDbInfoLocally().
void CSeqDB::GetMaskAlgorithmDetails | ( | int | algorithm_id, |
string & | program, | ||
string & | program_name, | ||
string & | algo_opts | ||
) |
Definition at line 1276 of file seqdb.cpp.
References CSeqDBImpl::GetMaskAlgorithmDetails(), and m_Impl.
Get the numeric algorithm ID for a string.
algo_name | The name of the filtering algorithm |
Definition at line 1232 of file seqdb.cpp.
References CSeqDBImpl::GetMaskAlgorithmId(), and m_Impl.
Referenced by CSequenceIStreamBlastDB::CSequenceIStreamBlastDB(), CBlastDBCmdApp::x_InitSearchRequest(), and CSearchDatabase::x_TranslateFilteringAlgorithm().
|
inline |
Get masked ranges of a sequence.
For the provided OID and list of algorithm IDs, this method gets a list of masked areas of those sequences for the first algorithm ID. The list of masked areas is returned via the ranges parameter.
oid | The ordinal ID of the sequence. [in] |
algo_id | The algorithm ID to get data for. [in] |
ranges | The list of sequence offset ranges. [out] |
Definition at line 1408 of file seqdb.hpp.
Referenced by BOOST_AUTO_TEST_CASE(), CSeqDbSeqInfoSrc::GetMasks(), CRawSeqDBSource::GetNext(), CClusterDBSource::GetNext(), CSequenceIStreamBlastDB::next(), s_GetSeqMask(), s_SeqDbGetSequence(), and CBlastDB_FastaFormatter::Write().
void CSeqDB::GetMaskData | ( | int | oid, |
int | algo_id, | ||
TSequenceRanges & | ranges | ||
) |
Get masked ranges of a sequence.
For the provided OID and algorithm ID, this method gets a list of masked areas of those sequences. The list of masked areas is returned via the ranges parameter.
oid | The ordinal ID of the sequence. [in] |
algo_id | The algorithm ID to get data for. [in] |
ranges | The list of sequence offset ranges. [out] |
Definition at line 1285 of file seqdb.cpp.
References CSeqDBImpl::GetMaskData(), and m_Impl.
int CSeqDB::GetMaxLength | ( | ) | const |
Returns the length of the largest sequence in the database.
This uses summary information stored in the database volumes or alias files. This might be used to choose buffer sizes.
Definition at line 705 of file seqdb.cpp.
References CSeqDBImpl::GetMaxLength(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CSeqDBDemo_Threaded::Run(), s_SeqDbGetMaxLength(), s_SeqDbGetSupportsPartialFetching(), and CBlastDBCmdApp::x_PrintBlastDatabaseInformation().
int CSeqDB::GetMinLength | ( | ) | const |
Returns the length of the shortest sequence in the database.
This uses summary information stored in the database volumes or alias files. This might be used to choose a cutoff score.
Definition at line 710 of file seqdb.cpp.
References CSeqDBImpl::GetMinLength(), and m_Impl.
Referenced by s_SeqDbGetMinLength().
CSeqDB::EOidListType CSeqDB::GetNextOIDChunk | ( | int & | begin_chunk, |
int & | end_chunk, | ||
int | oid_size, | ||
vector< int > & | oid_list, | ||
int * | oid_state = NULL |
||
) |
Return a chunk of OIDs, and update the OID bookmark.
This method allows the caller to iterate over the database by fetching batches of OIDs. It will either return a list of OIDs in a vector, or set a pair of integers to indicate a range of OIDs. The return value will indicate which technique was used. The caller sets the number of OIDs to get by setting the size of the vector. If eOidRange is returned, the first included oid is begin_chunk and end_chunk is the oid after the last included oid. If eOidList is returned, the vector contains the included OIDs, and may be resized to a smaller value if fewer entries are available (for the last chunk). In some cases it may be desirable to have several concurrent, independent iterations over the same database object. If this is required, the caller should specify the address of an int to the optional parameter oid_state. This should be initialized to zero (before the iteration begins) but should otherwise not be modified by the calling code (except that it can be reset to zero to restart the iteration). For the normal case of one iteration per program, this parameter can be omitted.
begin_chunk | First included oid (if eOidRange is returned). |
end_chunk | OID after last included (if eOidRange is returned). |
oid_size | Number of OIDs to retrieve (ignored in MT environment) |
oid_list | An empty list. Will contain oid list if eOidList is returned. |
oid_state | Optional address of a state variable (for concurrent iterations). |
Definition at line 739 of file seqdb.cpp.
References CSeqDBImpl::GetNextOIDChunk(), m_Impl, and ncbi::grid::netcache::search::fields::size.
Referenced by BOOST_AUTO_TEST_CASE(), CSeqDBDemo_Thread::Main(), CSeqDBDemo_ChunkIteration::Run(), and s_SeqDbGetNextChunk().
int CSeqDB::GetNumOIDs | ( | ) | const |
Returns the size of the (possibly sparse) OID range.
Definition at line 680 of file seqdb.cpp.
References CSeqDBImpl::GetNumOIDs(), and m_Impl.
Referenced by CIndexedDb_New::AddIndexInfo(), BOOST_AUTO_TEST_CASE(), CRawSeqDBSource::CRawSeqDBSource(), CMetaDataTest::DoTest(), CSampleTest::DoTest(), CEndsTest::DoTest(), CSearch< LEGACY, NHITS >::InitBlast(), CMkIndexApplication::Run(), CBlastdbConvertApp::Run(), SDbSumInfo::SDbSumInfo(), and CSeqDbSeqInfoSrc::Size().
int CSeqDB::GetNumSeqs | ( | void | ) | const |
Returns the number of sequences available.
Definition at line 670 of file seqdb.cpp.
References CSeqDBImpl::GetNumSeqs(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CBlastKmerBuildIndex::Build(), CLBLASTObjectLoader::CountSeqs(), CWriteDB_CreateOidMaskDB(), CMetaDataTest::DoTest(), CBlastDbMetadata::GetNumberOfSequences(), CBlastSequenceSource::GetNumSeqs(), LogCmdOptions(), CSeqDBDemo_SimpleIteration::Run(), CSeqDBDemo_ChunkIteration::Run(), s_FillDbInfoLocally(), s_GenerateAccsList(), s_InitializeSubject(), s_MTSeqDBTest(), s_MTSeqDBTest2(), s_MTSeqDBTest3(), s_SeqDbGetNumSeqs(), s_SeqDbGetSupportsPartialFetching(), SDbSumInfo::SDbSumInfo(), CBlastDBCmdApp::x_AddCmdOptions(), CBlastKmerBuildIndex::x_BuildIndex(), CElementaryMatching::x_CreateRemapData(), CElementaryMatching::x_InitFilteringVector(), and CBlastDBCmdApp::x_PrintBlastDatabaseInformation().
int CSeqDB::GetNumSeqsStats | ( | ) | const |
Returns the number of sequences available.
This may be overridden by the STATS_NSEQ key.
Definition at line 675 of file seqdb.cpp.
References CSeqDBImpl::GetNumSeqsStats(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), and s_SeqDbGetNumSeqsStats().
Find the sequence closest to the given offset into the database.
The database volumes can be viewed as a single array of residues, partitioned into sequences by OID order. The length of this array is given by GetTotalLength(). Given an offset between 0 and this length, this method returns the OID of the sequence at the given offset into the array. It is normally used to split the database into sections with approximately equal numbers of residues.
first_seq | First oid to consider (will always return this or higher). |
residue | The approximate residue offset to search for. |
Definition at line 923 of file seqdb.cpp.
References CSeqDBImpl::GetOidAtOffset(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE().
Fetch data as a CSeq_data object.
All or part of the sequence is fetched in a CSeq_data object. The portion of the sequence returned is specified by begin and end. An exception will be thrown if begin is greater than or equal to end, or if end is greater than or equal to the length of the sequence. Begin and end should be specified in bases; a range like (0,1) specifies 1 base, not 2. Nucleotide data will always be returned in ncbi4na format.
oid | Specifies the sequence to fetch. |
begin | Specifies the start of the data to get. [in] |
end | Specifies the end of the data to get. [in] |
Definition at line 539 of file seqdb.cpp.
References CSeqDBImpl::GetSeqData(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE().
Returns the first Gi (if any) of the sequence.
This method does NOT check whether the OID in question belongs to the BLAST database after all filtering is applied (e.g.: GI list restriction or membership bit). If you need those checks, please use GetGis()
Definition at line 776 of file seqdb.cpp.
References CSeqDBImpl::GetSeqGI(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE().
Gets a list of sequence identifiers.
This returns the list of CSeq_id identifiers associated with the sequence specified by the given OID.
oid | The oid of the sequence. |
Definition at line 765 of file seqdb.cpp.
References CSeqDBImpl::GetSeqIDs(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CLBLASTObjectLoader::Execute(), GetGis(), CSeqDbSeqInfoSrc::GetId(), CBlastSequenceSource::GetSeqID(), CLocalBlastDbAdapter::GetSeqIDs(), CSequenceIStreamBlastDB::next(), CBlastdbConvertApp::Run(), s_GenerateAccsList(), s_GetFilteredRedundantGis(), s_SeqDbGetSequence(), s_TestReadPDBAsn1(), s_UpdateSeqAlnWithFilteredSeqIDs(), CTestAction::TestOID(), CElementaryMatching::x_CreateRemapData(), CBlastKmerResults::x_InitScoreVec(), CElementaryMatching::x_LoadRemapData(), and CBuildDatabase::x_ResolveFromSource().
Returns the sequence length in base pairs or residues.
Definition at line 400 of file seqdb.cpp.
References CSeqDBImpl::GetSeqLength(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CBlastDBExtractor::ExtractSeqLen(), CSeqDbSeqInfoSrc::GetLength(), CBlastSequenceSource::GetSeqLength(), CLocalBlastDbAdapter::GetSeqLength(), CSeqDBDemo_SimpleIteration::Run(), s_MinhashSequences(), s_MinhashSequences2(), s_SeqDbGetSeqLen(), s_TestPartialAmbig(), SDbSumInfo::SDbSumInfo(), CBlastDBExtractor::SetSeqId(), CBlastDB_FastaFormatter::Write(), CElementaryMatching::x_CreateRemapData(), CTracebackSearchTestFixture::x_GetSelfHitHspStream(), CBlastDB_SeqFormatter::x_GetSeq(), CBlastDB_SeqFormatter::x_Print(), and CSeqDBDemo_ChunkIteration::x_UseOID().
Returns an unbiased, approximate sequence length.
For protein DBs, this method is identical to GetSeqLength(). In the nucleotide case, computing the exact length requires examination of the sequence data. This method avoids doing that, returning an approximation ranging from L-3 to L+3 (where L indicates the exact length), and unbiased on average.
Definition at line 409 of file seqdb.cpp.
References CSeqDBImpl::GetSeqLengthApprox(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE().
Get a pointer to raw sequence data.
Get the raw sequence (strand data). When done, resources should be returned with RetSequence. The data pointed to by *buffer is in read-only memory (where supported).
oid | The ordinal id of the sequence. |
buffer | A returned pointer to the data in the sequence. |
Definition at line 530 of file seqdb.cpp.
References buffer, CSeqDBImpl::GetSequence(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CSeqDBSequence::CSeqDBSequence(), CBlastSequenceSource::GetSeq(), CLocalBlastDbAdapter::GetSequence(), CSeqDBTest2Thread::Main(), s_SeqDbGetSequence(), CTestAction::TestOID(), CElementaryMatching::x_CompartVolume(), CElementaryMatching::x_CreateIndex(), CSeqDBIter::x_GetSeq(), CElementaryMatching::x_InitFilteringVector(), and CSeqDBDemo_Thread::x_UseOID().
void CSeqDB::GetSequenceAsString | ( | int | oid, |
CSeqUtil::ECoding | coding, | ||
string & | output, | ||
TSeqRange | range = TSeqRange() |
||
) | const |
Get a sequence in a given encoding.
This method gets the sequence data for the given OID, converts it to the specified encoding, and returns it in a string. It supports all values of the CSeqUtil::ECoding enumeration (but the type must match the database type). This method returns the same data as GetAmbigSeq() (or GetSequence() for protein), but may be less efficient due to the cost of translation and string allocation.
oid | The OID of the sequence to fetch. |
coding | The encoding to use for the data. |
output | The returned sequence data as a string. |
range | The range of the sequence to retrieve; if empty, the entire sequence will be retrieved [in] |
Definition at line 1141 of file seqdb.cpp.
References buffer, CSeqConvert::Convert(), CSeqUtil::e_Ncbi8na, CSeqUtil::e_Ncbistdaa, eProtein, GetAmbigSeq(), GetSequenceType(), kSeqDBNuclNcbiNA8, output, compile_time_bits::range(), result, and RetAmbigSeq().
Referenced by BOOST_AUTO_TEST_CASE(), CBlastDBExtractor::ExtractHash(), CBlastDBExtractor::ExtractSeqData(), GetSequenceAsString(), CSeqDBTestThread::Main(), s_MinhashSequences(), s_MinhashSequences2(), CBlastDB_SeqFormatter::x_GetSeq(), and CBlastDB_SeqFormatter::x_GetSeqHash().
Get a sequence in a readable text encoding.
This method gets the sequence data for an OID, converts it to a human-readable encoding (either Iupacaa for protein, or Iupacna for nucleotide), and returns it in a string. This is equivalent to calling the three-argument versions of this method with those encodings.
oid | The OID of the sequence to fetch. |
output | The returned sequence data as a string. |
range | The range of the sequence to retrieve; if empty, the entire sequence will be retrieved [in] |
Definition at line 1130 of file seqdb.cpp.
References CSeqUtil::e_Iupacaa, CSeqUtil::e_Iupacna, eProtein, GetSequenceAsString(), GetSequenceType(), output, and compile_time_bits::range().
CSeqDB::ESeqType CSeqDB::GetSequenceType | ( | void | ) | const |
Returns the type of database opened - protein or nucleotide.
This uses the same constants as the constructor.
Definition at line 427 of file seqdb.cpp.
References eNucleotide, eProtein, CSeqDBImpl::GetSeqType(), m_Impl, and NCBI_THROW.
Referenced by CBuildDatabase::AddIds(), BOOST_AUTO_TEST_CASE(), GetDBMetaData(), GetDiskUsage(), GetSequenceAsString(), CLocalBlastDbAdapter::GetSequenceType(), CSeqDBDemo_Thread::Init(), CSeqDBDemo_Threaded::Run(), s_DupIdsRaw(), s_TestDatabase(), CBlastDbDataLoader::SBlastDbParam::SBlastDbParam(), x_GetDBFilesMetaData(), CBlastScopeSource::x_InitBlastDatabaseDataLoader(), and CBlastDBCmdApp::x_InitBlastDB().
Get taxid for an OID.
This finds the TAXIDS associated with a given OID and computes a mapping from GI to a set of taxids. This mapping is added to the map<int,int> provided by the user. If the "persist" flag is set to true, the new associations will simply be added to the map. If it is false (the default), the map will be cleared first.
oid | The ordinal id of the sequence. |
gi_to_taxid | A returned mapping from GI to taxid. |
persist | If false, the map will be cleared before adding new entries. |
Definition at line 441 of file seqdb.cpp.
References map_checker< Container >::clear(), CSeqDBImpl::GetTaxIDs(), ITERATE, and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CLocalBlastDbAdapter::GetTaxId(), PrintSAMHeader(), and CSearch< LEGACY, NHITS >::Search().
Get taxids for an OID.
This finds the TAXIDS associated with a given OID and returns them in a vector. If the "persist" flag is set to true, the new taxids will simply be appended to the vector. If it is false (the default), the vector will be cleared first. One advantage of this interface over the map<int,int> version is that the vector interface works with databases with local IDs but lacking GIs.
oid | The ordinal id of the sequence. |
taxids | A returned list of taxids. |
persist | If false, the vector will be cleared before adding new entries. |
Definition at line 458 of file seqdb.cpp.
References CSeqDBImpl::GetTaxIDs(), and m_Impl.
Get all tax ids for an accession.
accs | input accession |
taxids | taxids for accession |
Definition at line 1752 of file seqdb.cpp.
References CSeq_id::fParse_RawText, CSeq_id::fParse_ValidLocal, CSeqDBImpl::GetTaxIdsForSeqId(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE().
void CSeqDB::GetTaxIdsForOids | ( | const vector< blastdb::TOid > & | oids, |
set< TTaxId > & | tax_ids | ||
) | const |
Definition at line 267 of file seqdb.cpp.
References CSeqDBImpl::GetTaxIdsForOids(), and m_Impl.
Referenced by CClusterDBSource::GetNext().
Get all tax ids for a seq id.
seq_id | input seq id |
taxids | taxids for seq id |
Definition at line 1758 of file seqdb.cpp.
References CSeqDBImpl::GetTaxIdsForSeqId(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE().
|
static |
Get taxonomy information.
This method returns taxonomy information for a single taxid. This information does not vary with sequence type (protein vs. nucleotide) and is the same for all blast databases. If the taxonomy database is not available or the taxid is not found, this method will throw an exception.
taxid | An integer identifying the taxid to fetch. |
info | A structure containing taxonomic description strings. |
Definition at line 1105 of file seqdb.cpp.
References CSeqDBImpl::GetTaxInfo(), and info.
Referenced by BOOST_AUTO_TEST_CASE(), CMetaDataTest::DoTest(), CBlastDBExtractor::ExtractBlastName(), CBlastDBExtractor::ExtractCommonTaxonomicName(), CBlastDBExtractor::ExtractLeafCommonTaxonomicNames(), CBlastDBExtractor::ExtractLeafScientificNames(), CBlastDBExtractor::ExtractScientificName(), CBlastDBExtractor::ExtractSuperKingdom(), s_GetTaxName(), s_SeqAlignToXMLHit(), CShowBlastDefline::x_GetTaxonomyInfoForTaxID(), CTaxFormat::x_InitBlastDBTaxInfo(), CBlastTabularInfo::x_SetTaxInfo(), and CBlastTabularInfo::x_SetTaxInfoAll().
string CSeqDB::GetTitle | ( | void | ) | const |
Returns the database title.
This is usually read from database volumes or alias files. If multiple databases were passed to the constructor, this will be a concatenation of those databases' titles.
Definition at line 630 of file seqdb.cpp.
References CSeqDBImpl::GetTitle(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CLBLASTObjectLoader::CreateLoader(), CMetaDataTest::DoTest(), GetDBMetaData(), CBlastDbMetadata::GetTitle(), CBlastFormat::PrintReport(), s_FillDbInfoLocally(), s_TestDatabase(), CBuildDatabase::SetSourceDb(), CMakeBlastDBApp::x_BuildDatabase(), BlastdbCopyApplication::x_CopyDB(), BlastdbCopyApplication::x_MakeDBwIDList(), and CBlastDBCmdApp::x_PrintBlastDatabaseInformation().
Uint8 CSeqDB::GetTotalLength | ( | void | ) | const |
Returns the sum of the lengths of all available sequences.
This uses summary information stored in the database volumes or alias files. It provides an approximate value, without iterating over individual sequences, for cases when scanning the db is the only way to determine the exact total length.
Definition at line 685 of file seqdb.cpp.
References CSeqDBImpl::GetTotalLength(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CWriteDB_CreateOidMaskDB(), CMetaDataTest::DoTest(), CBlastDbMetadata::GetDbLength(), CBlastSequenceSource::GetTotalLength(), LogCmdOptions(), CBlastnApp::Run(), CBlastpApp::Run(), CBlastxApp::Run(), CTblastnApp::Run(), CBlastKmerSearch::Run(), s_FillDbInfoLocally(), s_InitializeSubject(), s_SeqDbGetSupportsPartialFetching(), s_SeqDbGetTotLen(), SDbSumInfo::SDbSumInfo(), CBlastDBCmdApp::x_AddCmdOptions(), CElementaryMatching::x_CreateIndex(), CElementaryMatching::x_InitFilteringVector(), and CBlastDBCmdApp::x_PrintBlastDatabaseInformation().
Uint8 CSeqDB::GetTotalLengthStats | ( | ) | const |
Returns the sum of the lengths of all available sequences.
This uses summary information stored in the database volumes or alias files. It provides either an exact value or a value changed in the alias files by the STATS_TOTLEN key.
Definition at line 695 of file seqdb.cpp.
References CSeqDBImpl::GetTotalLengthStats(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), and s_SeqDbGetTotLenStats().
void CSeqDB::GetTotals | ( | ESummaryType | sumtype, |
int * | oid_count, | ||
Uint8 * | total_length, | ||
bool | use_approx = true |
||
) | const |
Returns the sum of the sequence lengths.
This uses summary information and iteration to compute the total length and number of sequences for some subset of the database. If eUnfilteredAll is specified, it uses information from the underlying database volumes, without filtering. If eFilteredAll is specified, all of the included sequences are used, for all possible OIDs. If eFilteredRange is specified, the returned values correspond to the sum over only those sequences that survive filtering, and are within the iteration range. If either of oid_count or total_length is passed NULL, that result is not returned. In some cases, the results can be computed in constant time; other cases require iteration proportional to the length of the database or the included OID range (see SetIterationRange()).
sumtype | Specifies the subset of sequences to include. |
oid_count | The returned number of included OIDs. |
total_length | The returned sum of included sequence lengths. |
use_approx | Whether to use approximate lengths for nucleotide. |
Definition at line 1110 of file seqdb.cpp.
References CSeqDBImpl::GetTotals(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), GetDBMetaData(), and s_ComputeNumSequencesAndDbLength().
Uint8 CSeqDB::GetVolumeLength | ( | ) | const |
Returns the sum of the lengths of all volumes.
This uses summary information stored in the database volumes (but not the alias files). It provides an exact value, without iterating over individual sequences. It includes all OIDs regardless of inclusion by the filtering mechanisms of the alias files.
Definition at line 700 of file seqdb.cpp.
References CSeqDBImpl::GetVolumeLength(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CMetaDataTest::DoTest(), SDbSumInfo::SDbSumInfo(), and WriteBlastSeqidlistFile().
Get a CBioseq for a given GI.
This builds and returns the header and sequence data corresponding to the indicated GI as a CBioseq.
gi | The GI of the sequence. |
Definition at line 987 of file seqdb.cpp.
References CSeqDBImpl::GetBioseq(), CSeqDBImpl::GiToOid(), m_Impl, and NULL.
Referenced by BOOST_AUTO_TEST_CASE().
Translate a GI to an OID.
Definition at line 808 of file seqdb.cpp.
References CSeqDBImpl::GiToOid(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), s_Check_GiEquivalenceInDB(), s_GetSequencesIntoScope(), s_TestDatabase(), s_TestPartialAmbig(), CBlastDBExtractor::SetSeqId(), and CBlastDbBioseqSource::x_GetOidFromSeqID().
Translate a GI to an OID with filter check.
Definition at line 817 of file seqdb.cpp.
References CSeqDBImpl::GiToOidwFilterCheck(), and m_Impl.
Referenced by CBlastDBCmdApp::x_GetOids(), and CBlastDBCmdApp::x_ProcessBatchEntry_NoDup().
Translate a GI to a PIG.
Definition at line 854 of file seqdb.cpp.
References CSeqDBImpl::GiToOid(), m_Impl, and CSeqDBImpl::OidToPig().
Referenced by BOOST_AUTO_TEST_CASE().
void CSeqDB::ListColumns | ( | vector< string > & | titles | ) |
List column titles found in this database.
This returns a list of the column titles of all user created (and system generated) columns found in any of this database's volumes. Column titles appearing in more than one volume are only listed here once.
titles | Column titles are returned here. [out] |
Definition at line 1191 of file seqdb.cpp.
References CSeqDBImpl::ListColumns(), and m_Impl.
Referenced by BOOST_AUTO_TEST_CASE(), CClusterDBSource::CClusterDBSource(), and CRawSeqDBSource::CRawSeqDBSource().
Translate an OID to a GI.
Definition at line 826 of file seqdb.cpp.
References m_Impl, and CSeqDBImpl::OidToGi().
Referenced by BOOST_AUTO_TEST_CASE(), and s_TestDatabase().
Translate an OID to a PIG.
Definition at line 790 of file seqdb.cpp.
References m_Impl, and CSeqDBImpl::OidToPig().
Referenced by BOOST_AUTO_TEST_CASE(), and CBlastDBExtractor::ExtractPig().
Get a CBioseq for a given PIG.
This builds and returns the header and sequence data corresponding to the indicated PIG (a numeric identifier used for proteins) as a CBioseq.
pig | The protein identifier group id of the sequence. |
Definition at line 1004 of file seqdb.cpp.
References CSeqDBImpl::GetBioseq(), m_Impl, NULL, CSeqDBImpl::PigToOid(), and ZERO_GI.
Referenced by BOOST_AUTO_TEST_CASE().
Translate a PIG to a GI.
Definition at line 837 of file seqdb.cpp.
References m_Impl, CSeqDBImpl::OidToGi(), and CSeqDBImpl::PigToOid().
Referenced by BOOST_AUTO_TEST_CASE().
Translate a PIG to an OID.
Definition at line 781 of file seqdb.cpp.
References m_Impl, and CSeqDBImpl::PigToOid().
Referenced by BOOST_AUTO_TEST_CASE(), CBlastDBExtractor::SetSeqId(), CBlastDBCmdApp::x_ProcessBatchPig(), and CBlastDBCmdApp::x_ProcessEntry().
void CSeqDB::RemoveOffsetRanges | ( | int | oid | ) |
Remove any offset ranges for the given OID.
oid | OID of the sequence. |
Definition at line 1310 of file seqdb.cpp.
References ctll::empty(), and SetOffsetRanges().
void CSeqDB::ResetInternalChunkBookmark | ( | ) |
Resets this object's internal chunk bookmark, which is used when the oid_state argument to GetNextOIDChunk is NULL.
This allows for several iterations to be performed over the same CSeqDB object
Definition at line 755 of file seqdb.cpp.
References m_Impl, and CSeqDBImpl::ResetInternalChunkBookmark().
Referenced by BOOST_AUTO_TEST_CASE(), and s_SeqDbResetChunkIterator().
void CSeqDB::RetAmbigSeq | ( | const char ** | buffer | ) | const |
Returns any resources associated with the sequence.
Calls to GetAmbigSeq (but not GetBioseq()) either increment a counter corresponding to a section of the database where the sequence data lives, or allocate a buffer to return to the user. This method decrements that counter or frees the allocated buffer, so that the memory can be used by other processes. Each allocating call should be paired with a returning call. Note that this does not apply to GetBioseq(), or GetHdr(), for example.
buffer | A pointer to the sequence data to release. |
Definition at line 563 of file seqdb.cpp.
References buffer, m_Impl, and CSeqDBImpl::RetAmbigSeq().
Referenced by BOOST_AUTO_TEST_CASE(), CLocalBlastDbAdapter::GetSequence(), GetSequenceAsString(), s_TestPartialAmbigRange(), and CSeqDBDemo_Thread::x_UseOID().
void CSeqDB::RetSequence | ( | const char ** | buffer | ) | const |
Returns any resources associated with the sequence.
Calls to GetSequence (but not GetBioseq()) either increment a counter corresponding to a section of the database where the sequence data lives, or allocate a buffer to return to the user. This method decrements that counter or frees the allocated buffer, so that the memory can be used by other processes. Each allocating call should be paired with a returning call. Note that this does not apply to GetBioseq(), or GetHdr(), for example.
buffer | A pointer to the sequence data to release. |
Definition at line 523 of file seqdb.cpp.
References buffer, m_Impl, and CSeqDBImpl::RetSequence().
Referenced by BOOST_AUTO_TEST_CASE(), CRawSeqDBSource::ClearSequence(), CRawSeqDBSource::GetNext(), CClusterDBSource::GetNext(), CLocalBlastDbAdapter::GetSequence(), CBlastSequenceSource::RetSequence(), s_SeqDbReleaseSequence(), CTestAction::TestOID(), CElementaryMatching::x_CompartVolume(), CElementaryMatching::x_CreateIndex(), CElementaryMatching::x_InitFilteringVector(), CSeqDBIter::x_RetSeq(), CSeqDBDemo_Thread::x_UseOID(), CRawSeqDBSource::~CRawSeqDBSource(), CSeqDBSequence::~CSeqDBSequence(), and CSequenceReturn::~CSequenceReturn().
Get a CBioseq for a given Seq-id.
This builds and returns the header and sequence data corresponding to the indicated Seq-id as a CBioseq. Note that certain forms of Seq-id map to more than one OID. If this is the case for the provided Seq-id, the first matching OID will be used.
seqid | The Seq-id identifier of the sequence. |
Definition at line 1021 of file seqdb.cpp.
References CSeqDBImpl::GetBioseq(), m_Impl, CSeqDBImpl::SeqidToOids(), and ZERO_GI.
Referenced by BOOST_AUTO_TEST_CASE(), CSeqDBDemo_SeqidToBioseq::Run(), and CIgBlast::x_AnnotateDomain().
Translate a Seq-id to any matching OID.
Definition at line 903 of file seqdb.cpp.
References m_Impl, and CSeqDBImpl::SeqidToOids().
Referenced by BOOST_AUTO_TEST_CASE(), CCddInputData::CHit::FillData(), CSeqAlignFilter::FilterByTaxonomy(), s_DupIdsBioseq(), s_DupIdsRaw(), s_GetSequencesIntoScope(), CLocalBlastDbAdapter::SeqidToOid(), CTestAction::TestOID(), CMultiAligner::x_FindRPSHits(), CBlastDbBioseqSource::x_GetOidFromSeqID(), and CMultiAligner::x_SetDomainHits().
Translate a Seq-id to a list of OIDs.
Definition at line 896 of file seqdb.cpp.
References m_Impl, and CSeqDBImpl::SeqidToOids().
Referenced by BOOST_AUTO_TEST_CASE(), CTracebackSearchTestFixture::x_GetSampleHspStream(), and CTracebackSearchTestFixture::x_GetSelfHitHspStream().
Set Iteration Range.
This method sets the iteration range as a pair of OIDs. Iteration proceeds from begin, up to but not including end. End will be adjusted to the number of OIDs in the case that it is 0, negative, or greater than the number of OIDs.
oid_begin | Iterator will skip OIDs less than this value. Only OIDs found in the OID lists (if any) will be returned. |
oid_end | Iterator will return up to (but not including) this OID. |
Definition at line 1093 of file seqdb.cpp.
References m_Impl, and CSeqDBImpl::SetIterationRange().
Referenced by BOOST_AUTO_TEST_CASE(), and s_SeqDbSrcNew().
Setting the number of threads.
This should be called by the master thread, before and after multiple threads run.
num_threads | Number of threads |
Definition at line 1321 of file seqdb.cpp.
References m_Impl, and CSeqDBImpl::SetNumberOfThreads().
Referenced by CBlastKmerBuildIndex::Build(), CMagicBlastThread::Main(), CSeqDBDemo_Threaded::Run(), CBlastKmerSearch::Run(), s_MTSeqDBTest2(), s_MTSeqDBTest3(), and s_SeqDbSetNumberOfThreads().
void CSeqDB::SetOffsetRanges | ( | int | oid, |
const TRangeList & | offset_ranges, | ||
bool | append_ranges, | ||
bool | cache_data | ||
) |
Apply a range of offsets to a database sequence.
The GetAmbigSeq() method requires an amount of work (and I/O) which is proportional to the size of the sequence data (more if ambiguities are present). In some cases, only certain subranges of this data will be utilized. This method allows the user to specify which parts of a sequence are actually needed by the user. (Care should be taken if one SeqDB object is shared by several program components.) (Note that offsets above the length of the sequence will not generate an error, and are replaced by the sequence length.)
If ranges are specified for a sequence, data areas in specified sequences will be accurate, but data outside the specified ranges should not be accessed, and no guarantees are made about what data they will contain. If the append_ranges flag is true, the range will be added to existing ranges. If false, existing ranges will be flushed and replaced by new ranges. To remove ranges, call this method with an empty list of ranges (and append_ranges == false); future calls will then return the complete sequence.
If the cache_data flag is set, data for this sequence will be kept for the duration of SeqDB's lifetime. To disable caching (and flush cached data) for this sequence, call the method again, but specify cache_data to be false.
oid | OID of the sequence. |
offset_ranges | Ranges of sequence data to return. |
append_ranges | Append new ranges to existing list. |
cache_data | Keep sequence data for future callers. |
Definition at line 1295 of file seqdb.cpp.
References m_Impl, and CSeqDBImpl::SetOffsetRanges().
Referenced by CSubjectRangesSet::ApplyRanges(), RemoveOffsetRanges(), and s_SeqDbSetRanges().
void CSeqDB::SetVolsMemBit | ( | int | mbit | ) |
Set the membership of all volumes.
Definition at line 1345 of file seqdb.cpp.
References m_Impl, and CSeqDBImpl::SetVolsMemBit().
Referenced by BlastdbCopyApplication::x_MakeDBwIDList().
void CSeqDB::TaxIdsToOids | ( | set< TTaxId > & | tax_ids, |
vector< blastdb::TOid > & | rv | ||
) | const |
Get Oid list for input tax ids.
tax_ids | taxonomy ids, return only tax ids found in db |
Definition at line 257 of file seqdb.cpp.
References m_Impl, and CSeqDBImpl::TaxIdsToOids().
Referenced by CBlastDBCmdApp::x_PrintBlastDatabaseTaxInformation(), and CBlastDBCmdApp::x_ProcessTaxIdList().
Translate a TI to an OID.
Definition at line 799 of file seqdb.cpp.
References m_Impl, and CSeqDBImpl::TiToOid().
Validates the algorithm IDs passed to this function, returning a vector of those algorithm IDs not present in this object.
Definition at line 1242 of file seqdb.cpp.
References copy(), GetAvailableMaskAlgorithms(), and ITERATE.
Referenced by CSeqFormatter::CSeqFormatter(), and s_IsMaskAlgoIdValid().
|
protected |
Definition at line 1610 of file seqdb.cpp.
References _ASSERT, a, eBDB_Version4, eBDB_Version5, eProtein, ERR_POST, Error(), CFile::Exists(), f, file, CSeqDBImpl::FindVolumePaths(), GetBlastDbVersion(), CFile::GetLength(), CSeqDBImpl::GetLMDBFileNames(), CDirEntry::GetName(), CDirEntry::GetPath(), GetSequenceType(), ITERATE, l(), m_Impl, NULL, SeqDB_GetFileExtensions(), and ct::sort().
Referenced by GetDBMetaData().
Format string for the date returned by CSeqDB::GetDate.
Definition at line 851 of file seqdb.hpp.
Referenced by CSeqDBImpl::GetDate(), s_ConvertV4toV5(), and WriteBlastSeqidlistFile().
String containing the error message in exceptions thrown when a given OID cannot be found.
Definition at line 316 of file seqdb.hpp.
Referenced by CSeqDBImpl::GetAmbigPartialSeq(), CSeqDBImpl::GetAmbigSeq(), CSeqDBImpl::GetBioseq(), CSeqDBImpl::GetMaskData(), CSeqDBVol::GetRawSeqAndAmbig(), CSeqDBImpl::GetRawSeqAndAmbig(), CSeqDBImpl::GetSeqData(), CSeqDBImpl::GetSeqIDs(), CSeqDBImpl::GetSeqLengthApprox(), CSeqDBImpl::GetSequence(), CSeqDBImpl::OidToGi(), CSeqDBImpl::OidToPig(), CSeqDBImpl::SetOffsetRanges(), CSeqDBImpl::x_FillSeqBuffer(), CSeqDBImpl::x_GetHdr(), CSeqDBImpl::x_GetSeqGI(), and CSeqDBImpl::x_GetSeqLength().
|
protected |
Implementation details are hidden. (See seqdbimpl.hpp).
Definition at line 1529 of file seqdb.hpp.
Referenced by AccessionsToOids(), AccessionToOids(), CheckOrFindOID(), CSeqDB(), DebugDump(), FindVolumePaths(), FlushOffsetRangeCache(), GetAliasFileValues(), GetAllTaxIDs(), GetAmbigPartialSeq(), GetAmbigSeq(), GetAmbigSeqAlloc(), GetAvailableMaskAlgorithmDescriptions(), GetAvailableMaskAlgorithms(), GetBioseq(), GetBioseqNoData(), GetBlastDbVersion(), GetColumnBlob(), GetColumnId(), GetColumnMetaData(), GetDate(), GetDBMetaData(), GetDBNameList(), GetDBTaxIds(), GetExactTotalLength(), CSeqDBExpert::GetGiBounds(), GetGiList(), GetHdr(), GetIdSet(), GetLeafTaxIDs(), GetMaskAlgorithmDetails(), GetMaskAlgorithmId(), GetMaskData(), GetMaxLength(), GetMinLength(), GetNextOIDChunk(), GetNumOIDs(), GetNumSeqs(), GetNumSeqsStats(), GetOidAtOffset(), CSeqDBExpert::GetPigBounds(), CSeqDBExpert::GetRawSeqAndAmbig(), GetSeqData(), GetSeqGI(), GetSeqIDs(), GetSeqLength(), GetSeqLengthApprox(), GetSequence(), CSeqDBExpert::GetSequenceHash(), GetSequenceType(), CSeqDBExpert::GetStringBounds(), GetTaxIDs(), GetTaxIdsForAccession(), GetTaxIdsForOids(), GetTaxIdsForSeqId(), GetTitle(), GetTotalLength(), GetTotalLengthStats(), GetTotals(), GetVolumeLength(), GiToBioseq(), GiToOid(), GiToOidwFilterCheck(), GiToPig(), CSeqDBExpert::HashToOids(), ListColumns(), OidToGi(), OidToPig(), PigToBioseq(), PigToGi(), PigToOid(), ResetInternalChunkBookmark(), RetAmbigSeq(), RetSequence(), SeqidToBioseq(), SeqidToOid(), SeqidToOids(), SetIterationRange(), SetNumberOfThreads(), SetOffsetRanges(), SetVolsMemBit(), TaxIdsToOids(), TiToOid(), x_GetDBFilesMetaData(), and ~CSeqDB().