75 catch( ... ) {
result = 0; }
86 "Index memory mapping failed.\n"
87 "It is possible that an index volume is missing or is too large.\n"
88 "Please, consider using -volsize option of makeindex utility to\n"
89 "reduce the size of index volumes." );
110 unsigned long hkey_width,
148 unsigned long hkey_width,
const Uint1 *
query,
151 pos_( start ), stop_( stop ), nmer_( 0 ), count_( 0 ),
152 hkey_width_( hkey_width )
333 : num_subjects_( num_subjects ), subj_roots_len_bits_( 7 ),
334 roots_( 0 ), rinfo_( 0 ), total_( 0 )
396 template<
unsigned long NHITS >
412 : qoff_( qoff ), soff_( soff ), len_(
len ), qright_( qright )
434 : qoff_( qoff ), soff_( soff ), len_(
len ), qright_( qright ),
448 template<
unsigned long NHITS >
457 typedef std::list< TTrackedSeed >
TSeeds;
458 typedef typename TSeeds::iterator
TIter;
533 template<
unsigned long NHITS >
536 { it_ = seeds_.begin(); }
576 template<
unsigned long NHITS >
580 if(
seed.len_ > 0 ) {
583 std::pair< TSeqNum, TSeqPos > mapval =
584 subject_map_->MapSubjOff( lid_, soff );
594 TSeqNum chunk = subject_map_->MapLId2Chunk( lid_, mapval.first );
595 cerr <<
"SEED: " << qoff <<
"\t" << mapval.second <<
"\t"
596 <<
seed.len_ <<
"\t" << chunk <<
"\n";
602 template<
unsigned long NHITS >
605 { seeds_.insert( it_,
seed ); }
608 template<
unsigned long NHITS >
613 if( it_ != seeds_.begin() ) {
614 TIter tmp_it = it_; tmp_it--;
616 TSeqPos bs_soff_corr = tmp_it->soff_ + step;
618 if( bs_soff_corr ==
seed.soff_ ) {
619 if(
seed.qright_ < tmp_it->qright_ ) {
620 if( tmp_it->len_ > 0 ) {
621 tmp_it->len_ -= (tmp_it->qright_ -
seed.qright_ );
624 if( tmp_it->len_ < word_size ) {
625 seeds_.erase( tmp_it );
627 tmp_it->qright_ =
seed.qright_;
630 }
else if(
seed.len_ >= word_size ) {
631 seeds_.insert( it_,
seed );
633 }
else if(
seed.len_ >= word_size ) {
634 seeds_.insert( it_,
seed );
642 template<
unsigned long NHITS >
667 :
TBase( subject_map )
683 bool EvalAndUpdate(
const TTrackedSeed &
seed );
695 for( TSeeds::const_iterator cit = this->seeds_.begin();
696 cit != this->seeds_.end(); ++cit ) {
705 while( this->it_ != this->seeds_.end() ) {
707 TSeqPos it_soff_corr = this->it_->soff_ + step;
709 if( it_soff_corr >
seed.soff_ ) {
713 if( this->it_->qright_ <
seed.qoff_ ) {
714 SaveSeed( *this->it_ );
715 this->it_ = this->seeds_.erase( this->it_ );
720 if( it_soff_corr ==
seed.soff_ ) {
751 :
TBase( subject_map ),
752 window_( options.two_hits ),
753 contig_len_( 2*options.word_size ),
754 word_size_( options.word_size ),
755 stride_( subject_map.GetStride() )
764 bool EvalAndUpdate( TTrackedSeed &
seed );
779 bool CheckAndSaveSeed(
const TTrackedSeed &
seed );
793 if( (
seed.second_hit_ > 0 &&
795 seed.qright_ <=
seed.second_hit_ +
seed.len_ + window_ ) ||
796 seed.len_ >= contig_len_ ) {
807 for( TSeeds::const_iterator cit = this->seeds_.begin();
808 cit != this->seeds_.end(); ++cit ) {
809 CheckAndSaveSeed( *cit );
817 while( this->it_ != this->seeds_.end() ) {
819 TSeqPos it_soff_corr = this->it_->soff_ + step;
820 if( it_soff_corr >
seed.soff_ )
return true;
822 if( this->it_->qright_ +
seed.len_ + window_ + 3*stride_
824 CheckAndSaveSeed( *this->it_ );
825 this->it_ = this->seeds_.erase( this->it_ );
827 else if( this->it_->qright_ <
seed.qoff_ ) {
828 if( CheckAndSaveSeed( *this->it_ ) ) {
829 this->it_ = this->seeds_.erase( this->it_ );
831 else if( it_soff_corr ==
seed.soff_ &&
832 this->it_->len_ > 0 ) {
833 seed.second_hit_ = this->it_->qright_;
836 else { ++this->it_; }
840 if( it_soff_corr ==
seed.soff_ )
return false;
856 template<
bool LEGACY,
unsigned long NHITS,
typename derived_t >
975 template<
bool LEGACY,
unsigned long NHITS,
typename derived_t >
981 : index_impl_( index_impl ), query_(
query ), locs_( locs ),
982 options_( options ), subject_( 0 ), subj_end_off_( 0 ),
983 roots_( index_impl_.NumSubjects() ),
984 code_bits_(
GetCodeBits( index_impl.GetSubjectMap().GetStride() ) ),
985 min_offset_(
GetMinOffset( index_impl.GetSubjectMap().GetStride() ) )
990 for(
typename TTrackedSeedsSet::size_type
i = 0;
i <
seeds_.size(); ++
i ) {
996 template<
bool LEGACY,
unsigned long NHITS,
typename derived_t >
1003 unsigned long hkey_width = index_impl_.hkey_width();
1004 const Uint1 * sstart = index_impl_.GetSeqStoreBase() + subj_start_;
1005 const Uint1 * spos = sstart + (
seed.soff_ - (hkey_width - 1))/
CR;
1006 const Uint1 * qstart = query_->sequence;
1007 const Uint1 * qpos = qstart +
seed.qoff_ - (hkey_width - 1);
1008 unsigned int incomplete = (
seed.soff_ - (hkey_width - 1))%
CR;
1011 nmax = nmax < options_.word_size - hkey_width ?
1012 nmax :
static_cast<TSeqPos>(options_.word_size - hkey_width);
1014 while( nmax > 0 && incomplete > 0 && qpos > qstart ) {
1015 Uint1 sbyte = (((*spos)>>(2*(
CR - incomplete--)))&0x3);
1016 if( *--qpos != sbyte )
return;
1021 nmax = (nmax < (
TSeqPos)(qpos - qstart))
1022 ? nmax : (
TSeqPos)(qpos - qstart);
1023 nmax = (nmax < (
TSeqPos)(
CR*(spos - sstart)))
1027 while( nmax >=
CR ) {
1028 Uint1 sbyte = *spos--;
1031 bool ambig(
false );
1033 for( ;
i <
CR; ++
i ) {
1034 qbyte = qbyte + ((*--qpos)<<(2*
i));
1047 if( sbyte != qbyte ){
1060 Uint1 sbyte = (((*spos)>>(2*(
i++)))&0x3);
1061 if( sbyte != *--qpos )
return;
1068 template<
bool LEGACY,
unsigned long NHITS,
typename derived_t >
1075 const Uint1 * sbase = index_impl_.GetSeqStoreBase();
1076 const Uint1 * send = sbase + subj_end_;
1077 const Uint1 * spos = sbase + subj_start_ +
seed.soff_/
CR;
1078 const Uint1 * qend = query_->sequence + qstop_;
1079 const Uint1 * qpos = query_->sequence +
seed.qoff_ + 1;
1080 unsigned int incomplete =
seed.soff_%
CR;
1082 while( nmax > 0 && (++incomplete)%
CR != 0 && qpos < qend ) {
1083 Uint1 sbyte = (((*spos)>>(6 - 2*incomplete))&0x3);
1084 if( *qpos++ != sbyte )
return;
1091 nmax = (nmax < (
TSeqPos)(qend - qpos)) ?
1092 nmax : (
TSeqPos)(qend - qpos);
1093 nmax = (nmax <= (send - spos)*
CR) ?
1096 while( nmax >=
CR ) {
1097 Uint1 sbyte = *spos++;
1099 bool ambig(
false );
1101 for(
unsigned int i = 0;
i <
CR; ++
i ) {
1110 qbyte = (qbyte<<2) + *qpos++;
1115 if( sbyte != qbyte ) {
1126 unsigned int i = 2*(
CR - 1);
1128 while( nmax-- > 0 ) {
1129 Uint1 sbyte = (((*spos)>>
i)&0x3);
1130 if( sbyte != *qpos++ )
break;
1140 template<
bool LEGACY,
unsigned long NHITS,
typename derived_t >
1150 subj_seeds.EvalAndUpdate(
seed );
1152 if( nmaxleft > 0 ) {
1153 ExtendLeft(
seed, nmaxleft - 1 );
1158 if( nmaxright > 0 ) {
1159 ExtendRight(
seed, nmaxright - 1 );
1161 ExtendRight(
seed );
1166 seed.len_ < options_.word_size ) {
1168 subj_seeds.AppendSimple(
seed );
1170 subj_seeds.Append(
seed, options_.word_size );
1175 template<
bool LEGACY,
unsigned long NHITS,
typename derived_t >
1183 if( subj_seeds.EvalAndUpdate(
seed ) ) {
1185 ExtendRight(
seed );
1186 if(
seed.len_ >= options_.word_size )
1187 subj_seeds.AppendSimple(
seed );
1192 template<
bool LEGACY,
unsigned long NHITS,
typename derived_t >
1197 if( qoff_ != root->
qoff_ ) {
1199 qoff_ = root->
qoff_;
1200 }
else if( root->
soff_ >= min_offset_ &&
1201 root->
soff_ < soff_ ) {
1208 if( root->
soff_ < min_offset_ ) {
1209 TSeqPos boundary = (root++)->soff_;
1210 ProcessBoundaryOffset( root->
soff_ -
static_cast<unsigned int>(min_offset_), boundary );
1212 soff_ = root->
soff_;
1215 ProcessOffset( root->
soff_ -
static_cast<unsigned int>(min_offset_) );
1216 soff_ = root->
soff_;
1222 template<
bool LEGACY,
unsigned long NHITS,
typename derived_t >
1226 TSeqNum num_subjects = index_impl_.NumSubjects() - 1;
1228 for( subject_ = 0; subject_ < num_subjects; ++subject_ ) {
1230 self->SetSubjInfo();
1234 if( rinfo.
len_ > 0 ) {
1235 const SSeedRoot * roots = roots_.GetSubjRoots( subject_ );
1238 for(
unsigned long j = 0; j < rinfo.
len_; ) {
1239 j += ProcessRoot( seeds, roots + j );
1246 for( TRoots::size_type j = 0;
1248 j += ProcessRoot( seeds, roots + j );
1258 template<
bool LEGACY,
unsigned long NHITS,
typename derived_t >
1263 index_impl_.hkey_width(), query_->sequence, qstart_, qstop_ );
1265 while( nmer_it.
Next() ) {
1267 index_impl_.OffsetIterator(
1268 nmer_it.
Nmer(), options_.word_size ) );
1269 qoff_ = nmer_it.
Pos();
1271 while( off_it.More() ) {
1275 while( off_it.Next() ) {
1279 if(
offset < min_offset_ ) {
1281 TWord real_offset = off_it.Offset();
1282 TSeqPos soff =
self->DecodeOffset( real_offset );
1285 roots_.Add2(
r1,
r2, subject_ );
1288 SSeedRoot r = { qoff_, soff, qstart_, qstop_ };
1289 roots_.Add(
r, subject_ );
1294 if( roots_.Overflow() ) {
1301 qstart_ = old_qstart;
1308 template<
bool LEGACY,
unsigned long NHITS,
typename derived_t >
1314 while( curloc != 0 ) {
1315 if( curloc->
ssr != 0 ) {
1322 curloc = curloc->
next;
1331 index_impl_.StopSeq() - index_impl_.StartSeq() ) );
1333 for(
typename TTrackedSeedsSet::size_type
i = 0, k = 1;
1334 i < seeds_.size(); ++
i ) {
1335 seeds_[
i].Finalize();
1338 for(
TSeqNum j = 0; j < nchunks; ++j ) {
1340 (
TSeqNum)(k++), seeds_[
i].GetHitList( j ) );
1349 template<
bool LEGACY,
unsigned long NHITS >
1353 template<
bool LEGACY,
unsigned long NHITS >
1355 :
public CSearch_Base< LEGACY, NHITS, CSearch< LEGACY, NHITS > >
1402 std::pair< TSeqNum, TSeqPos > decoded =
1406 return decoded.second;
Ungapped extension structures that are common to nucleotide and protein extension routines.
BlastInitHitList * BLAST_InitHitListNew(void)
Allocate memory for the BlastInitHitList structure.
Boolean BLAST_SaveInitialHit(BlastInitHitList *init_hitlist, Int4 q_off, Int4 s_off, BlastUngappedData *ungapped_data)
Save the initial hit data into the initial hit list structure.
Structures and functions prototypes used for BLAST gapped extension.
Structures and API used for saving BLAST hits.
This class represents a set of seeds obtained by searching all subjects represented by the index.
Implementation of the BLAST database index.
TOffsetData::TIterator TOffsetIterator
const TSubjectMap & GetSubjectMap() const
Get the subject map instance from the index object.
TSeqNum NumSubjects() const
Get the total number of logical sequences in the index.
Base class providing high level interface to index objects.
CConstRef< CSearchResults > Search(const BLAST_SequenceBlk *query, const BlastSeqLoc *locs, const SSearchOptions &search_options)
Search the index.
SIndexHeader header_
The index header structure.
static const unsigned long CR
Letters per byte in the sequence store.
Type used to iterate over the consecutive Nmer values of the query sequence.
bool Next()
Advance the iterator.
TWord nmer_
Nmer value reported by Nmer().
bool state_
false, if the end of the sequence has been reached.
TSeqPos stop_
One past the last position in the query.
TSeqPos pos_
Position returned by Pos().
unsigned long hkey_width_
Hash key width (in base pairs).
TSeqPos count_
Auxiliary member used to determine the next valid position.
TWord hkey_mask_
Hash key mask.
TSeqPos Pos() const
Get the current position in the query sequence.
TWord Nmer() const
Get the Nmer value corresponding to the current state of the iterator object.
const Uint1 * query_
The query data (BLASTNA encoded).
CNmerIterator(unsigned long hkey_width, const Uint1 *query, TSeqPos start, TSeqPos stop)
Object constructor.
unsigned long min_offset_
Minimum offset used by the index.
void ProcessBoundaryOffset(TWord offset, TWord bounds)
Process a seed candidate that is close to the masked-out or ambiguous region of the subject.
void SearchInt()
Helper method to search a particular segment of the query.
const BlastSeqLoc * locs_
Set of query locations to search.
std::vector< TTrackedSeeds > TTrackedSeedsSet
Representation of the set of currently tracked seeds for all subject sequences.
TWord subj_start_
Start position of subject_.
TWord subj_start_off_
Start offset of subject_.
STrackedSeed< NHITS > TTrackedSeed
Alias for convenience.
const BLAST_SequenceBlk * query_
The query sequence encoded in BLASTNA.
TWord subj_end_
One past the end position of subject_.
CSeedRoots roots_
Collection of initial soff/qoff pairs.
void ProcessOffset(TWord offset)
Process a regular seed candidate.
CConstRef< CDbIndex::CSearchResults > operator()()
Performs the search.
unsigned long ProcessRoot(TTrackedSeeds &seeds, const SSeedRoot *root)
Process a single root.
TTrackedSeedsSet seeds_
The set of currently tracked seeds.
TSeqNum subject_
Logical id of the subject sequence containing the offset value currently being considered.
TSeqPos qoff_
Current query offset.
const TIndex_Impl & index_impl_
The index implementation object.
CTrackedSeeds< NHITS > TTrackedSeeds
void ExtendRight(TTrackedSeed &seed, TSeqPos nmax=~(TSeqPos) 0) const
Extend a seed candidate to the right.
TSearchOptions options_
Search options.
TSeqPos soff_
Current subject offset.
void ComputeSeeds()
Compute the seeds after all roots are collected.
CSearch_Base(const TIndex_Impl &index_impl, const BLAST_SequenceBlk *query, const BlastSeqLoc *locs, const TSearchOptions &options)
Object constructor.
TSeqPos qstart_
Start of the current query segment.
TSeqPos qstop_
One past the end of the current query segment.
CDbIndex_Impl< LEGACY > TIndex_Impl
TIndex_Impl::TSubjectMap TSubjectMap
void ExtendLeft(TTrackedSeed &seed, TSeqPos nmax=~(TSeqPos) 0) const
Extend a seed candidate to the left.
unsigned long code_bits_
Number of bits to represent special offset prefix.
TWord subj_end_off_
End offset of subject_.
CDbIndex::SSearchOptions TSearchOptions
Alias for convenience.
TBase::TIndex_Impl TIndex_Impl
CSearch(const TIndex_Impl &index_impl, const BLAST_SequenceBlk *query, const BlastSeqLoc *locs, const TSearchOptions &options)
Object constructor.
CSearch_Base< LEGACY, NHITS, CSearch > TBase
TSeqPos DecodeOffset(TWord offset)
Decode offset value into subject position.
void SetSubjInfo()
Set the parameters of the current subject sequence.
TBase::TSearchOptions TSearchOptions
Seed roots container for all subjects.
CSeedRoots(TSeqNum num_subjects=0)
Object constructor.
unsigned long total_
Current total number of elements.
void Add(const SSeedRoot &root, TSeqNum subject)
Append a normal (non boundary) root to the container.
unsigned long subj_roots_len_bits_
Log_2 of n_subj_roots_.
~CSeedRoots()
Object destructor.
void Allocate()
Reallocate all the storage.
void Reset()
Reinitialize the structure.
static const unsigned long LIMIT_ROOTS
Max number of roots before triggering overflow.
void Add2(const SSeedRoot &root1, const SSeedRoot &root2, TSeqNum subject)
Append a boundary root (both parts) to the container.
bool Overflow() const
Check if the max number of elements is reached.
const SSubjRootsInfo & GetSubjInfo(TSeqNum subject) const
Get the set of roots for a particular subject.
SSubjRootsInfo * rinfo_
Array of root information structures for each subject.
SSubjRootsInfo::TRoots TRoots
Alias type for convenience.
SSeedRoot * roots_
Roots array preallocated for all subjects.
void CleanUp()
Clean up all the dynamically allocated memory.
TSeqNum num_subjects_
Number of subjects in the index.
static const unsigned long TOTAL_CACHE
Assumption about the amount of cache in the system.
const SSeedRoot * GetSubjRoots(TSeqNum subject) const
Return the preallocated array of roots for a particular subject.
unsigned long n_subj_roots_
Space is preallocated for this number of roots per subject.
unsigned long total_roots_
Max number of roots in preallocated storage.
Type representing subject map data.
TSeqNum GetNumChunks(TSeqNum lid) const
Get number of chunks combined into a given logical sequence.
std::pair< TSeqNum, TSeqPos > DecodeOffset(TWord offset) const
Decode offset.
const TWord * GetSubjectMap() const
Provides a mapping from real subject ids and chunk numbers to internal logical subject ids.
void SetSubjInfo(TSeqNum subj, TWord &start, TWord &end) const
Return the subject information based on the given logical subject id.
CTrackedSeeds(const TSubjectMap &subject_map, const CDbIndex::SSearchOptions &options)
Object constructor.
TBase::TSubjectMap TSubjectMap
TBase::TTrackedSeed TTrackedSeed
CTrackedSeeds(const CTrackedSeeds &rhs)
Object copy constructor.
CTrackedSeeds_Base< ONE_HIT > TBase
unsigned long word_size_
Target word size.
CTrackedSeeds_Base< TWO_HIT > TBase
TBase::TTrackedSeed TTrackedSeed
TBase::TSubjectMap TSubjectMap
unsigned long contig_len_
Min continuous length to save unconditionally.
unsigned long window_
Window for two-hit based search.
CTrackedSeeds(const TSubjectMap &subject_map, const CDbIndex::SSearchOptions &options)
Object constructor.
unsigned long stride_
Stride value used by the index.
Representation of a collection of tracked seeds for a specific subject sequence.
TSeeds seeds_
List of seed candidates.
TIter it_
Iterator pointing to the tracked seed that is about to be inspected.
STrackedSeed< NHITS > TTrackedSeed
BlastInitHitList * GetHitList(TSeqNum num) const
Get the list of saved seeds.
const TSubjectMap * subject_map_
The subject map object.
CTrackedSeeds_Base(const TSubjectMap &subject_map)
Object constructor.
void AppendSimple(const TTrackedSeed &seed)
Add a seed to the set of tracked seeds.
TSeqNum lid_
Logical sequence number.
void SetLId(TSeqNum lid)
Set the correspondence between this object and a logical sequence.
CTrackedSeeds_Base(const CTrackedSeeds_Base &rhs)
Object copy constructor.
void Append(const TTrackedSeed &seed, unsigned long word_size)
Add a seed to the set of tracked seeds.
THitLists hitlists_
The result sets (one per chunk).
std::vector< BlastInitHitList * > THitLists
std::list< TTrackedSeed > TSeeds
void SaveSeed(const TTrackedSeed &seed)
Save the tracked seed for reporting in the search result set.
void Reset()
Prepare for processing of the next query position.
CTrackedSeeds functionality that is different depending on whether a one-hit or two-hit based search ...
static const char * bounds[]
const unsigned long TWO_HIT
Use two-hit search.
unsigned long GetMinOffset(unsigned long stride)
Compute the minimum offset value needed to encode offsets based on stride.
const unsigned long ONE_HIT
Use one-hit search (normal).
unsigned long GetCodeBits(unsigned long stride)
Compute the number of bits to encode special offsets based on stride.
static const unsigned long CR
CMemoryFile * MapFile(const std::string &fname)
Memory map a file and return a pointer to the mapped area.
CDbIndex::TSeqNum TSeqNum
Forwarding declarations for convenience.
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int32_t Int4
4-byte (32-bit) signed integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static const sljit_gpr r1
static const sljit_gpr r2
Structure to hold a sequence.
Structure to hold all initial HSPs for a given subject sequence.
Used to hold a set of positions, mostly used for filtering.
SSeqRange * ssr
location data on the sequence.
struct BlastSeqLoc * next
next in linked list
Simple record type used to specify index search parameters.
unsigned long two_hits
Window for two-hit method (see megablast docs).
Representation of a seed root.
TSeqPos qstop_
1 + end of the corresponding query interval.
TSeqPos qstart_
Start of the corresponding query interval.
TSeqPos soff_
Corresponding subject offset.
TSeqPos qoff_
Query offset.
Int4 left
left endpoint of range (zero based)
Int4 right
right endpoint of range (zero based)
SSeedRoot container for one subject.
TRoots * extra_roots_
Storage for extra roots.
void CleanUp()
Clean up extra allocated memory.
unsigned int len_
Current number of stored roots.
std::vector< SSeedRoot > TRoots
Container implementation type.
TSeqPos qoff_
Query offset of the seed's origin.
TSeqPos qright_
Offset of the rightmost position of the seed in the query.
STrackedSeed(TSeqPos qoff, TSeqPos soff, TSeqPos len, TSeqPos qright)
Instance constructor.
TSeqPos len_
Length of the seed.
TSeqPos soff_
Subject offset of the seed's origin.
TSeqPos second_hit_
Right end of the first hit.
TSeqPos qoff_
Query offset of the seed's origin.
TSeqPos soff_
Subject offset of the seed's origin.
TSeqPos qright_
Offset of the rightmost position of the seed in the query.
TSeqPos len_
Length of the seed.
STrackedSeed(TSeqPos qoff, TSeqPos soff, TSeqPos len, TSeqPos qright)
Instance constructor.
Representation of a seed being tracked by the search algorithm.
static Uint4 letter(char c)
static bool ambig(char c)