33 #ifndef C_DB_INDEX_SP_HPP
34 #define C_DB_INDEX_SP_HPP
42 #define INLINE NCBI_INLINE
54 template<
typename word_t >
67 template<
bool LEGACY >
77 template<
bool LEGACY >
87 template<
bool LEGACY >
91 template<
typename iterator_t >
166 template<
typename iterator_t >
187 unsigned long stride,
unsigned long ws_hint );
197 unsigned long stride,
unsigned long ws_hint );
216 unsigned long h = offset_data.
hkey_width() - 1;
217 unsigned long s = offset_data.
getStride();
218 unsigned long w = offset_data.
getWSHint();
257 if(
curr_ == 0 )
return false;
302 {
return more_ != 0; }
305 template<
typename iterator_t >
307 TWord **
map,
unsigned long hkey_width,
308 unsigned long stride,
unsigned long ws_hint )
309 :
TBase(
map, hkey_width, stride, ws_hint )
322 template<
bool LEGACY >
331 template<
bool LEGACY >
359 const vector< string > & idmap,
440 virtual void Remap();
454 const SSearchOptions & search_options );
466 template<
bool LEGACY >
469 const vector< string > & idmap,
TWord *
data )
470 : mapfile_(
map ), map_start_( 0 ), version_(
VERSION ),
486 stride_, GetIndexWSHint< LEGACY >( header ) );
491 else if(
data != 0 ) {
497 stride_, GetIndexWSHint< LEGACY >( header ) );
504 template<
bool LEGACY >
507 if( mapfile_ != 0 ) {
508 delete subject_map_; subject_map_ = 0;
509 delete offset_data_; offset_data_ = 0;
511 map_ = (
TWord *)(mapfile_->Map( subject_map_offset_ ));
512 subject_map_ =
new TSubjectMap( &map_, start_, stop_, stride_ );
517 template<
bool LEGACY >
530 template<
bool LEGACY >
534 vector< string > idmap;
535 string idmap_fname = fname +
".map";
538 while( idmap_stream ) {
540 idmap_stream >> line;
541 idmap.push_back( line );
557 ERR_POST(
"not enough memory for index" );
560 "not enough memory for index" );
563 s.read( (
char *)
data,
l );
564 header = ReadIndexHeader< LEGACY >(
data );
569 header = ReadIndexHeader< LEGACY >(
map->GetPtr() );
Types of exception the indexing library can throw.
Implementation of the BLAST database index.
const Uint1 * GetSeqStoreBase() const
Get the start of compressed raw sequence data.
size_t subject_map_offset_
Offset of the subject map in the index file.
unsigned long version_
Index format version.
virtual CConstRef< CSearchResults > DoSearch(const BLAST_SequenceBlk *query, const BlastSeqLoc *locs, const SSearchOptions &search_options)
The search procedure for this specialized index implementation.
TOffsetData * offset_data_
Offset lists.
static const unsigned long HEADER_SIZE
Size of the index file header for index format version >= 2.
TTraits::TOffsetData TOffsetData
CMemoryFile * mapfile_
Memory mapped file.
TWord * map_
Start of memory mapped file data.
TOffsetData::TIterator TOffsetIterator
TWord * map_start_
Start of the index data, when not mapped.
const TOffsetIterator OffsetIterator(TWord nmer, unsigned long mod) const
Create an offset list iterator corresponding to the given Nmer value.
TTraits::TSubjectMap TSubjectMap
virtual TSeqPos GetSeqLen(TSeqNum oid) const
Get the length of the subject sequence.
TSeqNum NumChunks() const
Get the total number of sequence chunks in the index.
const TSubjectMap & GetSubjectMap() const
Get the subject map instance from the index object.
CDbIndex_Traits< LEGACY > TTraits
Offset data and subject map types computer.
virtual void Remap()
If possible, reduce the index footprint by unmapping the portion that does not contain sequence data.
~CDbIndex_Impl()
Object destructor.
CDbIndex_Impl(CMemoryFile *map, const SIndexHeader &header, const vector< string > &idmap, TWord *data=0)
Create an index object from mapped memory segment.
TSeqNum NumSubjects() const
Get the total number of logical sequences in the index.
unsigned long stride_
Stride value used during index creation.
virtual unsigned long Version() const
Get the index format version.
virtual const Uint1 * GetSeqData(TSeqNum oid) const
Get the sequence data of the subject sequence.
unsigned long hkey_width() const
Get the hash key width of the index.
Base class providing high level interface to index objects.
static CRef< CDbIndex > LoadIndex(CNcbiIstream &is)
Load index from an open stream.
TSeqNum start_
OID of the first sequence in the index.
TSeqNum stop_chunk_
Number of the last chunk of the last sequence.
Uint4 TWord
Type representing main memory unit of the index structure.
SIndexHeader header_
The index header structure.
TSeqNum start_chunk_
Number of the first chunk of the first sequence.
TSubjectMap * subject_map_
The subject map object.
vector< string > idmap_
Mapping from source ids to bioseq ids.
TSeqNum stop_
OID of the last sequence in the index.
CSequenceIStream::TStreamPos TSeqNum
Type used to enumerate sequences in the index.
Class representing index hash table and offset list database.
THashTable hash_table_
The hash table (mapping from Nmer values to the lists of offsets).
TWord total_
Auxiliary data member used for importing the offset list data.
unsigned long getMinOffset() const
Accessor for minimum offset value.
unsigned long hkey_width() const
Get the width of the hash key in base pairs.
unsigned long getWSHint() const
Accessor for ws_hint value.
unsigned long getStride() const
Accessor for stride value.
Iterator specific functionality of offset list manager class.
TWord * data_start_
Start of the offset data.
TOffsets offsets_
Concatenated offset list data.
COffsetData_Base TBase
Base class alias.
CVectorWrap< TWord > TOffsets
Type used to store offset lists.
iterator_t TIterator
Type used to iterate over an offset list.
COffsetData(CNcbiIstream &is, unsigned long hkey_width, unsigned long stride, unsigned long ws_hint)
Construct the object from the data in the given input stream.
Iterator for 0-terminated pre-ordered offset lists.
const TWord * curr_
Current position in the offset list.
TOffsetValue getOffsetValue() const
bool More()
Check if more data is available in the iterator.
bool boundary_
Flag indicating that the current offset is actually extra information for boundary cases.
COffsetData_Base::TOffsetValue TOffsetValue
CPreOrderedOffsetIterator()
unsigned long more_
Flag indicating that more values are available.
bool Next()
Advance the iterator.
unsigned long mod_
Determines which offsets to skip.
COffsetData< CPreOrderedOffsetIterator > TOffsetData
Type of offset data class supported by this iterator.
TWord offset_
Current cached offset value.
unsigned long min_offset_
Minimum offset used by the index.
CPreOrderedOffsetIterator(const TOffsetData &offset_data, TWord key, unsigned long ws)
Object constructor.
TWord Offset() const
Iterator dereference.
Type representing subject map data.
const Uint1 * GetSeqStoreBase() const
Return the start of the raw storage for compressed subject sequence data.
TSeqNum NumSubjects() const
Get the total number of logical sequences in the map.
TSeqNum NumChunks() const
Get the total number of sequence chunks in the map.
TSeqPos GetSeqLen(TSeqNum oid) const
Get the length of the subject sequence.
const Uint1 * GetSeqData(TSeqNum oid) const
Get the sequence data of the subject sequence.
TVector::size_type size_type
void SetPtr(T *base, size_type sz)
Make the object hold an external sequence.
void ReadWord(CNcbiIstream &is, word_t &data)
Read a word from the input stream.
unsigned long GetIndexStride(const SIndexHeader &header)
Get the stride value associated with the index.
const SIndexHeader ReadIndexHeader(void *map)
Read the index header information from the given memory area (pointer to the start of the index data).
CDbIndex::TSeqNum TSeqNum
Forwarding declarations for convenience.
CMemoryFile * MapFile(const std::string &fname)
Memory map a file and return a pointer to the mapped area.
unsigned long GetIndexWSHint(const SIndexHeader &header)
Get the ws_hint value associated with the index.
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Int8 GetLength(void) const
Get size of file.
bool Unmap(void)
Unmap file if mapped.
void * GetPtr(void) const
Get pointer to beginning of data.
uint8_t Uint1
1-byte (8-bit) unsigned integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
int64_t Int8
8-byte (64-bit) signed integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
const struct ncbi::grid::netcache::search::fields::KEY key
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
Structure to hold a sequence.
Used to hold a set of positions, mostly used for filtering.
Simple record type used to specify index search parameters.
Some computed type definitions.
COffsetData< CPreOrderedOffsetIterator > TOffsetData