33 #ifndef C_DB_INDEX_HPP
34 #define C_DB_INDEX_HPP
71 extern
unsigned long GetCodeBits(
unsigned long stride );
103 switch( GetErrCode() ) {
104 case eFile:
return "access failure";
105 case eRead:
return "read failure";
106 case eWrite:
return "write failure";
107 case eEndian:
return "endianness mismatch";
108 case eVersion:
return "unknown index format version";
109 case eSize:
return "wrong header size";
126 static const Uint4 INDEX_FORMAT_VERSION_0 = 0;
129 static const Uint4 INDEX_FORMAT_VERSION_1 = 1;
135 static Uint4 GetSystemEndianness(
void );
193 static const size_t COMMON_SIZE = 2*
sizeof(
Uint4 );
202 void Save( std::ostream & os,
const std::string & fname );
271 static const size_t EXPECTED_SIZE = COMMON_SIZE + 2*
sizeof(
Uint4 );
316 template<
typename T >
425 virtual const char * GetErrCodeString()
const override;
441 static const unsigned long CR = 4;
447 static const unsigned long STRIDE = 5;
455 static const unsigned long MIN_OFFSET = 64;
460 static const unsigned long CODE_BITS = 3;
463 static const unsigned char VERSION = (
unsigned char)5;
515 unsigned long word_size,
517 const TWord *
map,
size_t map_size )
518 : word_size_( word_size ), start_( start ), results_(
size, 0 )
520 for(
size_t i = 0;
i < map_size; ++
i ) {
521 map_.push_back(
map[
i] );
531 if( seq == 0 )
return 0;
532 else if( seq - start_ - 1 >= results_.size() )
return 0;
533 else return results_[seq - start_ - 1];
552 if( subj >= map_.size() )
return 0;
553 return (
TSeqNum)(map_[subj]) + chunk;
564 {
return GetResults( MapSubject( subj, chunk ) ); }
575 if( subj >= map_.size() )
return false;
578 TSeqNum start = MapSubject( subj, 0 );
579 TSeqNum end = MapSubject( subj + 1, 0 );
580 if( end == 0 ) end = start_ +
static_cast<TSeqNum>(results_.size()) + 1;
582 for(
TSeqNum chunk = start; chunk < end; ++chunk ) {
583 if( GetResults( chunk ) != 0 ) {
599 if( seq > 0 && seq - start_ - 1 < results_.size() ) {
600 results_[seq - start_ - 1] = res;
607 for( TResults::iterator it = results_.begin();
608 it != results_.end(); ++it ) {
661 static void MakeIndex(
681 { MakeIndex( fname, oname, start, 0, stop, stop_chunk, options ); }
690 static void MakeIndex(
713 static void MakeIndex(
733 { MakeIndex(
input, oname, start, 0, stop, stop_chunk, options ); }
742 static void MakeIndex(
766 const SSearchOptions & search_options
804 "GetSeqLen() is not supported in this index version." );
818 "GetSeqData() is not supported in this index version." );
843 template<
bool LEGACY >
874 ASSERT( oid >= getStartOId() );
875 return oid - getStartOId();
880 ASSERT( sid <= getStopOId() - getStartOId() );
881 return sid + getStartOId();
885 unsigned long getStride()
const {
return header_.stride_; }
886 unsigned long getWSHint()
const {
return header_.ws_hint_; }
896 pair< TSeqNum, TSeqNum > getSRCId(
TSeqNum cid )
const;
900 {
return getChunkLength( getCId( sid, rcid ) ); }
903 {
return getSIdByCId( getCIdByLRCId( lid, rcid ) ); }
904 pair< TSeqNum, TSeqPos > getRCIdOffByLIdOff(
TSeqNum lid,
TSeqPos loff )
const;
908 pair< TSeqNum, TSeqPos >
t = getRCIdOffByLIdOff( lid, loff );
909 return make_pair( getCIdByLRCId( lid,
t.first ),
t.second );
916 pair< TSeqNum, TSeqNum >
t = getSRCId( cid );
917 return make_pair(
t.first, getSOff(
t.first,
t.second, coff ) );
922 pair< TSeqNum, TSeqPos >
t = getCIdOffByLIdOff( lid, loff );
923 return getSIdOffByCIdOff(
t.first,
t.second );
926 TSeqNum getNumSubjects()
const;
932 TSeqNum getLId(
const TOffsetValue & v )
const;
933 TSeqPos getLOff(
const TOffsetValue & v )
const;
937 if( sid < idmap_.size() )
return idmap_[sid];
938 else return "unknown";
941 const vector< string > &
getIdMap()
const {
return idmap_; }
983 unsigned long stride,
unsigned long ws_hint );
1065 unsigned long stride );
1079 unsigned long stride );
1113 return *(ptr + 1) - *ptr;
1152 return std::make_pair(
1188 TWord lid_start = *ptr;
1194 ASSERT( siter != eiter );
1195 TChunksIter res = std::upper_bound( siter, eiter, abs_offset );
1199 return std::make_pair(
1201 (
TSeqPos)(soff - (*res - lid_start)*
CR) );
1215 return start + lchunk;
1296 return start + rcid;
1309 TWord lid_start = *ptr;
1316 ASSERT( siter != eiter );
1317 TChunksIter res = std::upper_bound( siter, eiter, abs_offset );
1321 return std::make_pair(
1323 (
TSeqPos)(loff - (*res - lid_start)*
CR) );
1397 inline pair< CDbIndex::TSeqNum, CDbIndex::TSeqNum >
1408 inline pair< CDbIndex::TSeqNum, TSeqPos >
Definitions used throughout BLAST.
#define NCBI_XBLAST_EXPORT
NULL operations for other cases.
Ungapped extension structures that are common to nucleotide and protein extension routines.
BlastInitHitList * BLAST_InitHitListFree(BlastInitHitList *init_hitlist)
Free memory for the BlastInitHitList structure.
This class represents a set of seeds obtained by searching all subjects represented by the index.
unsigned long GetWordSize() const
Get the search word size.
void SetResults(TSeqNum seq, BlastInitHitList *res)
Set the result set for a given logical subject.
TSeqNum start_
Starting logical subject number.
CSearchResults(unsigned long word_size, TSeqNum start, TSeqNum size, const TWord *map, size_t map_size)
Object constructor.
BlastInitHitList * GetResults(TSeqNum subj, TSeqNum chunk) const
Get the result set for a particular subject and chunk.
vector< Uint8 > map_
(subject,chunk)->(logical id) map.
~CSearchResults()
Object destructor.
TSeqNum MapSubject(TSeqNum subj, TSeqNum chunk) const
Map a subject sequence and a chunk number to internal logical id.
CDbIndex::TWord TWord
Convenience declaration.
unsigned long word_size_
Word size used for the search.
bool CheckResults(TSeqNum subj) const
Check if any results are available for a given subject sequence.
BlastInitHitList * GetResults(TSeqNum seq) const
Get the result set for a particular logical subject.
vector< BlastInitHitList * > TResults
Each vector item points to results for a particular logical subject.
TResults results_
The combined result set.
TSeqNum NumSeq() const
Get the number of logical sequences in the results set.
Types of exception the indexing library can throw.
EErrCode
Numerical error codes.
@ eBadOption
Bad index creation/search option.
@ eBadVersion
Wrong index version.
@ eBadData
Bad index data.
@ eBadSequence
Bad input sequence data.
NCBI_EXCEPTION_DEFAULT(CDbIndex_Exception, CException)
Base class providing high level interface to index objects.
TSeqNum StartSeq() const
Get the OID of the first sequence in the index.
pair< TSeqNum, TSeqPos > getSIdOffByCIdOff(TSeqNum cid, TSeqPos coff) const
TSeqNum getNumSubjects() const
static CRef< CDbIndex > LoadIndex(CNcbiIstream &is)
Load index from an open stream.
const Uint1 * getSeqData(TSeqNum sid) const
TSeqNum StopChunk() const
Get the number of the last chunk of the last sequence in the index.
TSeqNum getSIdByLRCId(TSeqNum lid, TSeqNum rcid) const
TSeqNum getSIdByOId(TSeqNum oid) const
static void MakeIndex(CSequenceIStream &input, const std::string &oname, TSeqNum start, TSeqNum &stop, TSeqNum &stop_chunk, const SOptions &options)
Create an index object.
TSeqPos getSOff(TSeqNum sid, TSeqNum rcid, TSeqPos coff) const
pair< TSeqNum, TSeqPos > getRCIdOffByLIdOff(TSeqNum lid, TSeqPos loff) const
TSeqNum getCIdByLRCId(TSeqNum lid, TSeqNum rcid) const
const vector< string > & getIdMap() const
TSeqNum getCId(TSeqNum sid, TSeqNum rcid) const
pair< TSeqNum, TSeqNum > getSRCId(TSeqNum cid) const
virtual ~CDbIndex()
Index object destructor.
pair< TSeqNum, TSeqPos > getCIdOffByLIdOff(TSeqNum lid, TSeqPos loff) const
const string getBioseqIdBySId(TSeqNum sid) const
TSeqNum start_
OID of the first sequence in the index.
SOffsetValue TOffsetValue
unsigned long getChunkOverlap() const
unsigned long getHKeyWidth() const
TSeqNum getCId(TSeqNum sid) const
unsigned long getMaxChunkSize() const
virtual const Uint1 * GetSeqData(TSeqNum) const
Get the sequence data of the subject sequence.
TSeqNum stop_chunk_
Number of the last chunk of the last sequence.
Uint4 TWord
Type representing main memory unit of the index structure.
virtual TSeqPos GetSeqLen(TSeqNum) const
Get the length of the subject sequence.
SIndexHeader header_
The index header structure.
TSeqNum start_chunk_
Number of the first chunk of the first sequence.
unsigned long getWSHint() const
static void MakeIndex(const std::string &fname, const std::string &oname, TSeqNum start, TSeqNum &stop, TSeqNum &stop_chunk, const SOptions &options)
Create an index object.
TSeqNum getLId(const TOffsetValue &v) const
TWord getChunkLength(TSeqNum cid) const
TSeqPos getLOff(const TOffsetValue &v) const
TSubjectMap * subject_map_
The subject map object.
TWord getChunkLength(TSeqNum sid, TSeqNum rcid) const
TSeqNum getSIdByCId(TSeqNum cid) const
unsigned long getStride() const
TSeqNum StartChunk() const
Get the number of the first chunk of the first sequence in the index.
vector< string > idmap_
Mapping from source ids to bioseq ids.
TSeqNum getNumChunks() const
virtual CConstRef< CSearchResults > DoSearch(const BLAST_SequenceBlk *, const BlastSeqLoc *, const SSearchOptions &)
Actual implementation of seed searching.
pair< TSeqNum, TSeqPos > getSIdOffByLIdOff(TSeqNum lid, TSeqPos loff) const
static const unsigned long CR
Letters per byte in the sequence store.
virtual void Remap()
If possible, reduce the index footprint by unmapping the portion that does not contain sequence data.
TSeqNum stop_
OID of the last sequence in the index.
CSequenceIStream::TStreamPos TSeqNum
Type used to enumerate sequences in the index.
TWord getSubjectLength(TSeqNum sid) const
TSeqNum getStopOId() const
TSeqNum getStartOId() const
TSeqNum getOIdBySId(TSeqNum sid) const
TSeqNum StopSeq() const
Get the OID of the last sequence in the index.
Class representing index hash table and offset list database.
THashTable hash_table_
The hash table (mapping from Nmer values to the lists of offsets).
CVectorWrap< TWord > THashTable
The type of the hash table.
COffsetData_Base(TWord **map, unsigned long hkey_width, unsigned long stride, unsigned long ws_hint)
Object constructor.
TWord total_
Auxiliary data member used for importing the offset list data.
unsigned long min_offset_
Minimum offset value used by the index.
unsigned long stride_
Stride value used by the index.
unsigned long getMinOffset() const
Accessor for minimum offset value.
CDbIndex::TWord TWord
Index word type (public to support Solaris).
unsigned long ws_hint_
ws_hint value used by the index.
unsigned long hkey_width_
Hash key width in bp.
unsigned long hkey_width() const
Get the width of the hash key in base pairs.
CDbIndex::SOffsetValue TOffsetValue
unsigned long getWSHint() const
Accessor for ws_hint value.
unsigned long getStride() const
Accessor for stride value.
Iterator for 0-terminated pre-ordered offset lists.
Class used to abstract reading nucleotide sequences from various sources.
Uint4 TStreamPos
Type used to represent positions within a sequence stream.
Type representing subject map data.
TSeqNum getLId(const TOffsetValue &v) const
TLengths lengths_
Subject lengths storage.
TWord getSubjectLength(TSeqNum sid) const
const Uint1 * GetSeqStoreBase() const
Return the start of the raw storage for compressed subject sequence data.
TSeqNum GetNumChunks(TSeqNum lid) const
Get number of chunks combined into a given logical sequence.
unsigned long GetStride() const
Accessor for stride value.
CVectorWrap< TWord > TChunks
Type for storing the chunk data.
CDbIndex::TOffsetValue TOffsetValue
TWord offset_mask_
Mask to extract offsets.
pair< TSeqNum, TSeqNum > TSCPair
CVectorWrap< TWord > TLIdMap
Local id -> chunks map storage type.
unsigned long stride_
Index stride value.
TSCPair getSRCId(TSeqNum cid) const
std::pair< TSeqNum, TSeqPos > DecodeOffset(TWord offset) const
Decode offset.
pair< TSeqNum, TSeqPos > TSOPair
TSeqNum getCIdByLRCId(TSeqNum lid, TSeqNum rcid) const
TSeqNum NumSubjects() const
Get the total number of logical sequences in the map.
unsigned long max_chunk_size_
TSeqNum MapSubject(TSeqNum subject, TSeqNum chunk) const
Get the logical sequence id from the database oid and the chunk number.
void Load(TWord **map, TSeqNum start, TSeqNum stop, unsigned long stride)
Loads index by mapping to the memory segment.
unsigned long min_offset_
Minimum offset used by the index.
const TWord * GetSubjectMap() const
Provides a mapping from real subject ids and chunk numbers to internal logical subject ids.
Uint1 offset_bits_
Number of bits used to encode offset.
TWord total_
Size in bytes of the raw sequence storage.
TLIdMap lid_map_
Local id -> chunk map storage.
TSeqNum NumChunks() const
Get the total number of sequence chunks in the map.
std::pair< TSeqNum, TSeqPos > MapSubjOff(TSeqNum lid, TSeqPos soff) const
Map logical sequence id and logical sequence offset to relative chunk number and chunk offset.
TSubjects subjects_
Mapping from database oids to the chunk info.
TSeqPos getSOff(TSeqNum sid, TSeqNum rcid, TSeqPos coff) const
TChunks chunks_
Collection of individual chunk descriptors.
TWord GetSeqStoreSize() const
Return the size in bytes of the raw sequence storage.
TSeqStore seq_store_
Storage for the raw subject sequence data.
CVectorWrap< TWord > TLengths
Subject lengths storage type.
TSeqPos GetSeqLen(TSeqNum oid) const
Get the length of the subject sequence.
unsigned long chunk_overlap_
TSeqNum getNumChunks(TSeqNum sid) const
void SetSeqDataFromMap(TWord **map)
Set up the sequence store from the memory segment.
const Uint1 * getSeqData(TSeqNum sid) const
void SetSubjInfo(TSeqNum subj, TWord &start, TWord &end) const
Return the subject information based on the given logical subject id.
const Uint1 * GetSeqData(TSeqNum oid) const
Get the sequence data of the subject sequence.
TSeqNum MapLId2Chunk(TSeqNum lid, TSeqNum lchunk) const
Map logical id and relative chunk to absolute chunk id.
CSubjectMap()
Trivial constructor.
TSeqPos getLOff(const TOffsetValue &v) const
TSeqNum getNumChunks() const
TSOPair getRCIdOffByLIdOff(TSeqNum lid, TSeqPos loff) const
TSCPairMap c2s_map_
CId -> (SId, RCId) map.
CVectorWrap< TWord > TSubjects
Type used to map database oids to the chunk info.
CVectorWrap< Uint1 > TSeqStore
Type used for compressed subject sequence data storage.
CDbIndex::TSeqNum TSeqNum
vector< TSCPair > TSCPairMap
TSeqNum getCId(TSeqNum sid, TSeqNum rcid) const
TWord getChunkLength(TSeqNum cid) const
TSeqNum getNumSubjects() const
A vector or pointer based sequence wrapper.
void resize(size_type n, T v=T())
Change the size of the sequence.
TVector::reference reference
bool vec_
Flag indicating whether it is a wrapper or a holder of external sequence.
std::vector< T > TVector
Sequence type being wrapped.
TVector::size_type size_type
T * base_
Pointer to the first element of the sequence.
const T * const_iterator
Iterator type pointing to const data.
TVector data_
std::vector object wrapped by this object.
void SetPtr(T *base, size_type sz)
Make the object hold an external sequence.
size_type size() const
Get the sequence size.
TVector::const_reference const_reference
TVector::value_type value_type
const_reference operator[](size_type n) const
Indexing operator.
const_iterator begin() const
Get the start of the sequence.
CVectorWrap(size_type sz=0, T v=T())
Object constructor.
reference operator[](size_type n)
Indexing operator.
size_type size_
Size of the external sequence.
const_iterator end() const
Get the end of the sequence.
const unsigned long WIDTH_32
32-bit index.
const unsigned long OFFSET_COMBINED
Combination of chunk number and chunk-based offset.
CRef< CIndexSuperHeader_Base > GetIndexSuperHeader(const std::string &fname)
Read superheader structure from the file.
const unsigned long TWO_HIT
Use two-hit search.
const unsigned long REPORT_QUIET
No progress reporting.
const unsigned long REPORT_NORMAL
Normal reporting.
unsigned long GetMinOffset(unsigned long stride)
Compute the minimum offset value needed to encode offsets based on stride.
const unsigned long ONE_HIT
Use one-hit search (normal).
const unsigned long UNCOMPRESSED
No compression.
const unsigned long REPORT_VERBOSE
Verbose reporting.
unsigned long GetCodeBits(unsigned long stride)
Compute the number of bits to encode special offsets based on stride.
size_t GetIdxVolNumOIDs(const std::string &fname)
Read the index header information from the given file.
static const unsigned long CR
CDbIndex::TSeqNum TSeqNum
Forwarding declarations for convenience.
unsigned int TSeqPos
Type for sequence locations and lengths.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
virtual const char * GetErrCodeString(void) const
Get error code interpreted as text.
uint8_t Uint1
1-byte (8-bit) unsigned integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
double value_type
The numeric datatype used by the parser.
const struct ncbi::grid::netcache::search::fields::SIZE size
#define ASSERT
macro for assert.
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
Structure to hold a sequence.
Structure to hold all initial HSPs for a given subject sequence.
Used to hold a set of positions, mostly used for filtering.
Simple record type used to specify index creation parameters.
bool legacy
Indicator of the legacy index format.
unsigned long report_level
Verbose index creation.
unsigned long max_index_size
Maximum index size in megabytes.
unsigned long chunk_size
Long sequences are split into chunks of this size.
std::string stat_file_name
File to write index statistics into.
unsigned long ws_hint
Most likely word size to use for searches.
unsigned long chunk_overlap
Amount by which individual chunks overlap.
bool idmap
Indicator of the index map creation.
unsigned long hkey_width
Width of the hash key in bits.
unsigned long stride
Stride to use for stored database locations.
Simple record type used to specify index search parameters.
unsigned long two_hits
Window for two-hit method (see megablast docs).
unsigned long word_size
Target seed length.