NCBI C++ ToolKit
dbindex_search.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: dbindex_search.cpp 100101 2023-06-15 14:10:29Z merezhuk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksandr Morgulis
27  *
28  * File Description:
29  * Implementation of index search functionality.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 
35 #include <list>
36 #include <algorithm>
37 
38 #include <corelib/ncbifile.hpp>
39 
43 
44 #ifdef LOCAL_SVN
45 #include "dbindex.hpp"
46 #else
48 #endif
49 
51 
53 BEGIN_SCOPE( blastdbindex )
54 
55 // Comment this out for production.
56 // #define SEEDDEBUG 1
57 // #define PRINTSUBJMAP 1
58 
59 /** Forwarding declarations for convenience. */
61 typedef CDbIndex::TWord TWord;
62 
63 //-------------------------------------------------------------------------
64 /** Memory map a file and return a pointer to the mapped area.
65  @param fname [I] file name
66  @return pointer to the start of the mapped memory area
67 */
68 CMemoryFile * MapFile( const std::string & fname )
69 {
70  CMemoryFile * result = 0;
71 
72  try {
73  result = new CMemoryFile( fname );
74  }
75  catch( ... ) { result = 0; }
76 
77  if( result ) {
78  if( !result->Map() ) {
79  delete result;
80  result = 0;
81  }
82  }
83 
84  if( result == 0 ) {
85  ERR_POST(
86  "Index memory mapping failed.\n"
87  "It is possible that an index volume is missing or is too large.\n"
88  "Please, consider using -volsize option of makeindex utility to\n"
89  "reduce the size of index volumes." );
90  }
91 
92  return result;
93 }
94 
95 //-------------------------------------------------------------------------
96 /** Type used to iterate over the consecutive Nmer values of the query
97  sequence.
98 */
100 {
101  public:
102 
103  /** Object constructor.
104  @param hkey_width [I] nmer width
105  @param query [I] query sequence in BLASTNA encoding
106  @param start [I] start position in the query
107  @param stop [I] one past the last position in the query
108  */
109  CNmerIterator(
110  unsigned long hkey_width,
111  const Uint1 * query, TSeqPos start, TSeqPos stop );
112 
113  /** Advance the iterator.
114  @return false if end of the sequence range has been reached,
115  true otherwise
116  */
117  bool Next();
118 
119  /** Get the current position in the query sequence.
120  The position returned corresponds to the last base of the
121  current Nmer value.
122  @return position in the query corresponding to the current
123  state of the iterator object
124  */
125  TSeqPos Pos() const { return pos_ - 1; }
126 
127  /** Get the Nmer value corresponding to the current state of the
128  iterator object.
129  @return current Nmer value
130  */
131  TWord Nmer() const { return nmer_; }
132 
133  public:
134 
135  const Uint1 * query_; /**< The query data (BLASTNA encoded). */
136  bool state_; /**< false, if the end of the sequence has been reached. */
137  TSeqPos pos_; /**< Position returned by Pos(). */
138  TSeqPos stop_; /**< One past the last position in the query. */
139  TWord nmer_; /**< Nmer value reported by Nmer(). */
140  TSeqPos count_; /**< Auxiliary member used to determine the next valid position. */
141  unsigned long hkey_width_; /**< Hash key width (in base pairs). */
142  TWord hkey_mask_; /**< Hash key mask. */
143 };
144 
145 //-------------------------------------------------------------------------
146 INLINE
148  unsigned long hkey_width, const Uint1 * query,
149  TSeqPos start, TSeqPos stop )
150  : query_( query ), state_( true ),
151  pos_( start ), stop_( stop ), nmer_( 0 ), count_( 0 ),
152  hkey_width_( hkey_width )
153 {
154  hkey_mask_ = (1<<(2*hkey_width_)) - 1;
155  query_ += pos_;
156 }
157 
158 //-------------------------------------------------------------------------
159 INLINE
161 {
162  if( state_ ) {
163  while( pos_ < stop_ ) {
164  TWord letter = (TWord)*(query_++);
165  ++pos_;
166 
167  if( letter < 4 ) {
168  nmer_ = ((nmer_<<2)&hkey_mask_) + letter;
169  ++count_;
170  if( count_ >= hkey_width_ ) return true;
171  }else {
172  count_ = 0;
173  nmer_ = 0;
174  }
175  }
176 
177  state_ = false;
178  }
179 
180  return state_;
181 }
182 
183 //-------------------------------------------------------------------------
184 /** Representation of a seed root.
185  A seed root is the initial match of Nmers found by a hash table lookup.
186 */
187 struct SSeedRoot
188 {
189  TSeqPos qoff_; /**< Query offset. */
190  TSeqPos soff_; /**< Corresponding subject offset. */
191  TSeqPos qstart_; /**< Start of the corresponding query interval. */
192  TSeqPos qstop_; /**< 1 + end of the corresponding query interval. */
193 };
194 
195 /** SSeedRoot container for one subject. */
197 {
198  /** Container implementation type. */
199  typedef std::vector< SSeedRoot > TRoots;
200 
201  /** Clean up extra allocated memory. */
202  void CleanUp()
203  {
204  if( extra_roots_ != 0 ) {
205  delete extra_roots_;
206  }
207  }
208 
209  unsigned int len_; /**< Current number of stored roots. */
210  TRoots * extra_roots_; /**< Storage for extra roots. Allocated
211  only if preallocated storage
212  overfills. */
213 };
214 
215 /** Seed roots container for all subjects. */
217 {
218  /** Alias type for convenience. */
220 
221  public:
222 
223  /** Object constructor.
224  @param num_subjects [I] number of subjects sequences
225  */
226  CSeedRoots( TSeqNum num_subjects = 0 );
227 
228  /** Object destructor. */
230 
231  /** Append a normal (non boundary) root to the container.
232  @param root [I] root to append
233  @param subject [I] subject sequence containing
234  root.soff_ of the root
235  */
236  void Add( const SSeedRoot & root, TSeqNum subject );
237 
238  /** Append a boundary root (both parts) to the container.
239  @param root1 [I] boundary root structure
240  @param root2 [I] real root data
241  @param subject [I] subject sequence containing
242  root2.soff_.
243  */
244  void Add2(
245  const SSeedRoot & root1, const SSeedRoot & root2,
246  TSeqNum subject );
247 
248  /** Get the set of roots for a particular subject.
249  @param subject [I] local subject id
250  @return reference to SSubjRootsInfo describing the given
251  subject
252  */
254  { return rinfo_[subject]; }
255 
256  /** Return the preallocated array of roots for a particular
257  subject.
258  @param subject [I] local subject id
259  @return preallocated array for storing roots for the
260  given subject
261  */
263  { return roots_ + (subject<<subj_roots_len_bits_); }
264 
265  /** Check if the max number of elements is reached.
266  @return true if LIM_ROOTS is exceeded, false otherwise
267  */
268  bool Overflow() const { return total_ > LIMIT_ROOTS; }
269 
270  /** Reinitialize the structure. */
271  void Reset();
272 
273  private:
274 
275  /** Assumption on the amound of cache in the system.
276  (overly optimistic)
277  */
278  static const unsigned long TOTAL_CACHE = 4*1024*1024;
279 
280  /** Max number of roots before triggering overflow. */
281  static const unsigned long LIMIT_ROOTS = 16*1024*1024;
282 
283  /** Clean up all the dynamically allocated memory. */
284  void CleanUp()
285  {
286  for( TSeqNum i = 0; i < num_subjects_; ++i ) {
287  rinfo_[i].CleanUp();
288  }
289 
290  delete[] rinfo_;
291  delete[] roots_;
292  }
293 
294  /** Reallocate all the storage. Used by constructor and
295  Reset().
296  */
297  void Allocate();
298 
299  TSeqNum num_subjects_; /**< Number of subjects in the index. */
300  unsigned long subj_roots_len_bits_; /**< Log_2 of n_subj_roots_. */
301  unsigned long n_subj_roots_; /**< Space is preallocated for this number of roots per subject. */
302  SSeedRoot * roots_; /**< Roots array preallocated for all subjects. */
303  SSubjRootsInfo * rinfo_; /**< Array of root information structures for each subject.
304  Dynamically allocated. */
305  unsigned long total_; /**< Currenr total number of elements. */
306  unsigned long total_roots_; /**< Max number of roots in preallocated storage. */
307 };
308 
310 {
311  try {
314 
315  for( TSeqNum i = 0; i < num_subjects_; ++i ) {
316  SSubjRootsInfo t = { 0, 0 };
317  rinfo_[i] = t;
318  }
319  }catch( ... ) {
320  CleanUp();
321  throw;
322  }
323 }
324 
326 {
327  CleanUp();
328  roots_ = 0; rinfo_ = 0; total_ = 0;
329  Allocate();
330 }
331 
333  : num_subjects_( num_subjects ), subj_roots_len_bits_( 7 ),
334  roots_( 0 ), rinfo_( 0 ), total_( 0 )
335 {
337 
338  while( total_roots_*sizeof( SSeedRoot ) < TOTAL_CACHE ) {
340  total_roots_ <<= 1;
341  }
342 
344  Allocate();
345 }
346 
347 INLINE
349 {
350  SSubjRootsInfo & rinfo = rinfo_[subject];
351 
352  if( rinfo.len_ < n_subj_roots_ - 1 ) {
353  *(roots_ + (subject<<subj_roots_len_bits_) + (rinfo.len_++))
354  = root;
355  }else {
356  if( rinfo.extra_roots_ == 0 ) {
357  rinfo.extra_roots_ = new TRoots;
358  rinfo.extra_roots_->reserve( n_subj_roots_<<2 );
359  }
360 
361  rinfo.extra_roots_->push_back( root );
362  }
363 
364  ++total_;
365 }
366 
367 INLINE
369  const SSeedRoot & root1,
370  const SSeedRoot & root2,
371  TSeqNum subject )
372 {
373  SSubjRootsInfo & rinfo = rinfo_[subject];
374 
375  if( rinfo.len_ < n_subj_roots_ - 1 ) {
376  *(roots_ + (subject<<subj_roots_len_bits_) + (rinfo.len_++))
377  = root1;
378  *(roots_ + (subject<<subj_roots_len_bits_) + (rinfo.len_++))
379  = root2;
380  }else {
381  if( rinfo.extra_roots_ == 0 ) {
382  rinfo.extra_roots_ = new TRoots;
383  rinfo.extra_roots_->reserve( n_subj_roots_<<2 );
384  }
385 
386  rinfo.extra_roots_->push_back( root1 );
387  rinfo.extra_roots_->push_back( root2 );
388  }
389 
390  total_ += 2;
391 }
392 
393 //-------------------------------------------------------------------------
394 /** Representation of a seed being tracked by the search algorithm.
395 */
396 template< unsigned long NHITS >
398 
399 /** Specialization for one-hit based search. */
400 template<>
402 {
403  /** Instance constructor.
404 
405  @param qoff Query offset.
406  @param soff Subject offset.
407  @param len Seed length.
408  @param qright Rightmost position of the seed in query's coordinates.
409  */
411  TSeqPos qoff, TSeqPos soff, TSeqPos len, TSeqPos qright )
412  : qoff_( qoff ), soff_( soff ), len_( len ), qright_( qright )
413  {}
414 
415  TSeqPos qoff_; /**< Query offset of the seed's origin. */
416  TSeqPos soff_; /**< Subject offset of the seed's origin. */
417  TSeqPos len_; /**< Length of the seed. */
418  TSeqPos qright_; /**< Offset of the rightmost position of the seed in the query. */
419 };
420 
421 /** Specializarion for two-hit based search. */
422 template<>
424 {
425  /** Instance constructor.
426 
427  @param qoff Query offset.
428  @param soff Subject offset.
429  @param len Seed length.
430  @param qright Rightmost position of the seed in query's coordinates.
431  */
433  TSeqPos qoff, TSeqPos soff, TSeqPos len, TSeqPos qright )
434  : qoff_( qoff ), soff_( soff ), len_( len ), qright_( qright ),
435  second_hit_( 0 )
436  {}
437 
438  TSeqPos qoff_; /**< Query offset of the seed's origin. */
439  TSeqPos soff_; /**< Subject offset of the seed's origin. */
440  TSeqPos len_; /**< Length of the seed. */
441  TSeqPos qright_; /**< Offset of the rightmost position of the seed in the query. */
442  TSeqPos second_hit_; /**< Right end of the first hit. */
443 };
444 
445 /** Representation of a collection of tacked seeds for a specific subject
446  sequence.
447 */
448 template< unsigned long NHITS >
450 {
451  protected:
452 
453  /**@name Some convenience type declaration. */
454  /**@{*/
457  typedef std::list< TTrackedSeed > TSeeds;
458  typedef typename TSeeds::iterator TIter;
459  typedef std::vector< BlastInitHitList * > THitLists;
460  /**@}*/
461 
462  public:
463 
464  /** Object constructor.
465 
466  @param subject_map The subject map instance.
467  */
468  CTrackedSeeds_Base( const TSubjectMap & subject_map )
469  : subject_map_( &subject_map ), lid_( 0 )
470  { it_ = seeds_.begin(); }
471 
472  /** Object copy constructor.
473  @param rhs [I] source object to copy
474  */
476  : hitlists_( rhs.hitlists_ ),
477  seeds_( rhs.seeds_ ), subject_map_( rhs.subject_map_ ),
478  lid_( rhs.lid_ )
479  { it_ = seeds_.begin(); }
480 
481  /** Set the correspondence between this object and a
482  logical sequence.
483 
484  @param lid The logical sequence id.
485  */
486  void SetLId( TSeqNum lid )
487  {
488  lid_ = lid;
489  hitlists_.resize( subject_map_->GetNumChunks( lid_ ), 0 );
490  }
491 
492  /** Prepare for processing of the next query position. */
493  void Reset();
494 
495  /** Add a seed to the set of tracked seeds.
496  @param seed [I] seed to add
497  @param word_size [I] minimum size of a valid seed
498  */
499  void Append( const TTrackedSeed & seed, unsigned long word_size );
500 
501  /** Add a seed to the set of tracked seeds.
502  No check for word size is performed.
503  @param seed [I] seed to add
504  */
505  void AppendSimple( const TTrackedSeed & seed );
506 
507  /** Save the tracked seed for reporting in the search result set.
508  @param seed [I] seed to save
509  */
510  void SaveSeed( const TTrackedSeed & seed );
511 
512  /** Get the list of saved seeds.
513 
514  @param num The relative chunk number.
515 
516  @return the results set for the subject sequence to which
517  this object corresponds
518  */
520  { return hitlists_[num]; }
521 
522  protected:
523 
524  THitLists hitlists_; /**< The result sets (one per chunk). */
525  TSeeds seeds_; /**< List of seed candidates. */
526  TIter it_; /**< Iterator pointing to the tracked seed that
527  is about to be inspected. */
528  const TSubjectMap * subject_map_; /**< The subject map object. */
529  TSeqNum lid_; /**< Logical sequence number. */
530 };
531 
532 //-------------------------------------------------------------------------
533 template< unsigned long NHITS >
534 INLINE
536 { it_ = seeds_.begin(); }
537 
538 /* This code is for testing purposes only.
539 {
540  unsigned long soff = 0, qoff = 0;
541  bool good = true;
542 
543  for( TSeeds::iterator i = seeds_.begin(); i != seeds_.end(); ++i ) {
544  if( i != seeds_.begin() ) {
545  unsigned long s;
546 
547  if( i->qoff_ > qoff ) {
548  unsigned long step = i->qoff_ - qoff;
549  s = soff + step;
550  if( s > i->soff_ ) { good = false; break; }
551  }else {
552  unsigned long step = qoff - i->qoff_;
553  s = i->soff_ + step;
554  if( s < soff ) { good = false; break; }
555  }
556  }
557 
558  soff = i->soff_;
559  qoff = i->qoff_;
560  }
561 
562  if( !good ) {
563  cerr << "Bad List at " << qoff << " " << soff << endl;
564 
565  for( TSeeds::iterator i = seeds_.begin(); i != seeds_.end(); ++i ) {
566  cerr << i->qoff_ << " " << i->soff_ << " "
567  << i->qright_ << " " << i->len_ << endl;
568  }
569  }
570 
571  it_ = seeds_.begin();
572 }
573 */
574 
575 //-------------------------------------------------------------------------
576 template< unsigned long NHITS >
577 INLINE
579 {
580  if( seed.len_ > 0 ) {
581  TSeqPos qoff = seed.qright_ - seed.len_ + 1;
582  TSeqPos soff = seed.soff_ - (seed.qoff_ - qoff);
583  std::pair< TSeqNum, TSeqPos > mapval =
584  subject_map_->MapSubjOff( lid_, soff );
585  BlastInitHitList * hitlist = hitlists_[mapval.first];
586 
587  if( hitlist == 0 ) {
588  hitlists_[mapval.first] = hitlist = BLAST_InitHitListNew();
589  }
590 
591  BLAST_SaveInitialHit( hitlist, (Int4)qoff, (Int4)mapval.second, 0 );
592 
593 #ifdef SEEDDEBUG
594  TSeqNum chunk = subject_map_->MapLId2Chunk( lid_, mapval.first );
595  cerr << "SEED: " << qoff << "\t" << mapval.second << "\t"
596  << seed.len_ << "\t" << chunk << "\n";
597 #endif
598  }
599 }
600 
601 //-------------------------------------------------------------------------
602 template< unsigned long NHITS >
603 INLINE
605 { seeds_.insert( it_, seed ); }
606 
607 //-------------------------------------------------------------------------
608 template< unsigned long NHITS >
609 INLINE
611  const TTrackedSeed & seed, unsigned long word_size )
612 {
613  if( it_ != seeds_.begin() ) {
614  TIter tmp_it = it_; tmp_it--;
615  TSeqPos step = seed.qoff_ - tmp_it->qoff_;
616  TSeqPos bs_soff_corr = tmp_it->soff_ + step;
617 
618  if( bs_soff_corr == seed.soff_ ) {
619  if( seed.qright_ < tmp_it->qright_ ) {
620  if( tmp_it->len_ > 0 ) {
621  tmp_it->len_ -= (tmp_it->qright_ - seed.qright_ );
622  }
623 
624  if( tmp_it->len_ < word_size ) {
625  seeds_.erase( tmp_it );
626  }else {
627  tmp_it->qright_ = seed.qright_;
628  }
629  }
630  }else if( seed.len_ >= word_size ) {
631  seeds_.insert( it_, seed );
632  }
633  }else if( seed.len_ >= word_size ) {
634  seeds_.insert( it_, seed );
635  }
636 }
637 
638 //-------------------------------------------------------------------------
639 /** CTrackedSeeds functionality that is different depending on
640  whether a one-hit or two-hit based search is used.
641 */
642 template< unsigned long NHITS >
644 
645 //-------------------------------------------------------------------------
646 /** Specialization for one-hit searches. */
647 template<>
648 class CTrackedSeeds< ONE_HIT > : public CTrackedSeeds_Base< ONE_HIT >
649 {
650  /** @name Types forwarded from the base class. */
651  /**@{*/
654  typedef TBase::TTrackedSeed TTrackedSeed;
656  /**@}*/
657 
658  public:
659 
660  /** Object constructor.
661 
662  @param subject_map The subject map instance.
663  */
665  const TSubjectMap & subject_map,
666  const CDbIndex::SSearchOptions & options )
667  : TBase( subject_map )
668  {}
669 
670  /** Object copy constructor.
671  @param rhs [I] source object to copy
672  */
674  : TBase( rhs )
675  {}
676 
677  /** Process seeds on diagonals below or equal to the seed
678  given as the parameter.
679  @param seed [I] possible candidate for a 'tracked' seed
680  @return true if there is a tracked seed on the same diagonal
681  as seed; false otherwise
682  */
683  bool EvalAndUpdate( const TTrackedSeed & seed );
684 
685  /** Save the remaining valid tracked seeds and clean up the
686  structure.
687  */
688  void Finalize();
689 };
690 
691 //-------------------------------------------------------------------------
692 INLINE
694 {
695  for( TSeeds::const_iterator cit = this->seeds_.begin();
696  cit != this->seeds_.end(); ++cit ) {
697  SaveSeed( *cit );
698  }
699 }
700 
701 //-------------------------------------------------------------------------
702 INLINE
704 {
705  while( this->it_ != this->seeds_.end() ) {
706  TSeqPos step = seed.qoff_ - this->it_->qoff_;
707  TSeqPos it_soff_corr = this->it_->soff_ + step;
708 
709  if( it_soff_corr > seed.soff_ ) {
710  return true;
711  }
712 
713  if( this->it_->qright_ < seed.qoff_ ) {
714  SaveSeed( *this->it_ );
715  this->it_ = this->seeds_.erase( this->it_ );
716  }
717  else {
718  ++this->it_;
719 
720  if( it_soff_corr == seed.soff_ ) {
721  return false;
722  }
723  }
724  }
725 
726  return true;
727 }
728 
729 //-------------------------------------------------------------------------
730 /** Specialization for two-hit searches. */
731 template<>
732 class CTrackedSeeds< TWO_HIT > : public CTrackedSeeds_Base< TWO_HIT >
733 {
734  /** @name Types forwarded from the base class. */
735  /**@{*/
738  typedef TBase::TTrackedSeed TTrackedSeed;
740  /**@}*/
741 
742  public:
743 
744  /** Object constructor.
745 
746  @param subject_map The subject map instance.
747  */
749  const TSubjectMap & subject_map,
750  const CDbIndex::SSearchOptions & options )
751  : TBase( subject_map ),
752  window_( options.two_hits ),
753  contig_len_( 2*options.word_size ),
754  word_size_( options.word_size ),
755  stride_( subject_map.GetStride() )
756  {}
757 
758  /** Process seeds on diagonals below or equal to the seed
759  given as the parameter.
760  @param seed [I] possible candidate for a 'tracked' seed
761  @return true if there is a tracked seed on the same diagonal
762  as seed; false otherwise
763  */
764  bool EvalAndUpdate( TTrackedSeed & seed );
765 
766  /** Save the remaining valid tracked seeds and clean up the
767  structure.
768  */
769  void Finalize();
770 
771  private:
772 
773  /** Verify two-seed criterion and save the seed if it is satisfied.
774 
775  @param seed Seed to check and save.
776 
777  @return true if seed was saved; false otherwise.
778  */
779  bool CheckAndSaveSeed( const TTrackedSeed & seed );
780 
781  unsigned long window_; /**< Window for two-hit based search. */
782  unsigned long contig_len_; /**< Min continuous length to save unconditionally. */
783  unsigned long word_size_; /**< Target word size. */
784  unsigned long stride_; /**< Stride value used by the index. */
785 };
786 
787 
788 //-------------------------------------------------------------------------
789 INLINE
791  const TTrackedSeed & seed )
792 {
793  if( (seed.second_hit_ > 0 &&
794  seed.qright_ >= seed.second_hit_ + seed.len_ &&
795  seed.qright_ <= seed.second_hit_ + seed.len_ + window_ ) ||
796  seed.len_ >= contig_len_ ) {
797  SaveSeed( seed );
798  return true;
799  }
800  else return false;
801 }
802 
803 //-------------------------------------------------------------------------
804 INLINE
806 {
807  for( TSeeds::const_iterator cit = this->seeds_.begin();
808  cit != this->seeds_.end(); ++cit ) {
809  CheckAndSaveSeed( *cit );
810  }
811 }
812 
813 //-------------------------------------------------------------------------
814 INLINE
816 {
817  while( this->it_ != this->seeds_.end() ) {
818  TSeqPos step = seed.qoff_ - this->it_->qoff_;
819  TSeqPos it_soff_corr = this->it_->soff_ + step;
820  if( it_soff_corr > seed.soff_ ) return true;
821 
822  if( this->it_->qright_ + seed.len_ + window_ + 3*stride_
823  < seed.qright_ ) {
824  CheckAndSaveSeed( *this->it_ );
825  this->it_ = this->seeds_.erase( this->it_ );
826  }
827  else if( this->it_->qright_ < seed.qoff_ ) {
828  if( CheckAndSaveSeed( *this->it_ ) ) {
829  this->it_ = this->seeds_.erase( this->it_ );
830  }
831  else if( it_soff_corr == seed.soff_ &&
832  this->it_->len_ > 0 ) {
833  seed.second_hit_ = this->it_->qright_;
834  ++this->it_;
835  }
836  else { ++this->it_; }
837  }
838  else {
839  ++this->it_;
840  if( it_soff_corr == seed.soff_ ) return false;
841  }
842  }
843 
844  return true;
845 }
846 
847 //-------------------------------------------------------------------------
848 // Forward declaration.
849 //
850 template< bool LEGACY > class CDbIndex_Impl;
851 
852 /** This is the object representing the state of a search over the index.
853  Use of a separate class for searches allows for multiple simultaneous
854  searches against the same index.
855 */
856 template< bool LEGACY, unsigned long NHITS, typename derived_t >
857 class CSearch_Base
858 {
859  protected:
860 
861  typedef CDbIndex::SSearchOptions TSearchOptions; /**< Alias for convenience. */
862 
863  public:
864 
865  /** @name Aliases for convenience. */
866  /**@{*/
870  typedef derived_t TDerived;
871  /**@}*/
872 
873  /** Object constructor.
874  @param index_impl [I] the index implementation object
875  @param query [I] query data encoded in BLASTNA
876  @param locs [I] set of query locations to search
877  @param options [I] search options
878  */
880  const TIndex_Impl & index_impl,
881  const BLAST_SequenceBlk * query,
882  const BlastSeqLoc * locs,
883  const TSearchOptions & options );
884 
885  /** Performs the search.
886  @return the set of seeds matching the query to the sequences
887  present in the index
888  */
890 
891  protected:
892 
893  typedef STrackedSeed< NHITS > TTrackedSeed; /**< Alias for convenience. */
894 
895  /** Representation of the set of currently tracked seeds for
896  all subject sequences.
897  */
898  typedef std::vector< TTrackedSeeds > TTrackedSeedsSet;
899 
900  /** Helper method to search a particular segment of the query.
901  The segment is taken from state of the search object.
902  */
903  void SearchInt();
904 
905  /** Process a seed candidate that is close to the masked out
906  or ambigous region of the subject.
907  The second parameter is encoded as follows: bits 3-5 (0-2) '
908  is the distance to the left (right) boundary of the valid
909  subject region plus 1. Value 0 in either field indicates that
910  the corresponding distance is greater than 5.
911  @param offset [I] uncompressed offset value
912  @param bounds [I] distance to the left and/or right
913  boundary of the valid subject
914  region.
915  */
917 
918  /** Process a regular seed candidate.
919  @param offset [I] uncompressed offset value
920  */
922 
923  /** Extend a seed candidate to the left.
924  No more than word_length - hkey_width positions are inspected.
925  @param seed [I] the seed candidate
926  @param nmax [I] if non-zero - additional restriction for
927  the number of positions to consider
928  */
929  void ExtendLeft(
930  TTrackedSeed & seed, TSeqPos nmax = ~(TSeqPos)0 ) const;
931 
932  /** Extend a seed candidate to the right.
933  Extends as far right as possible, unless nmax parameter is
934  non-zeroA
935  @param seed [I] the seed candidate
936  @param nmax [I] if non-zero - search no more than this
937  many positions
938  */
939  void ExtendRight(
940  TTrackedSeed & seed, TSeqPos nmax = ~(TSeqPos)0 ) const;
941 
942  /** Compute the seeds after all roots are collected. */
943  void ComputeSeeds();
944 
945  /** Process a single root.
946  @param seeds [I/O] information on currently tracked seeds
947  @param root [I] root to process
948  @return 1 for normal offsets, 2 for boundary offsets
949  */
950  unsigned long ProcessRoot( TTrackedSeeds & seeds, const SSeedRoot * root );
951 
952  const TIndex_Impl & index_impl_; /**< The index implementation object. */
953  const BLAST_SequenceBlk * query_; /**< The query sequence encoded in BLASTNA. */
954  const BlastSeqLoc * locs_; /**< Set of query locations to search. */
955  TSearchOptions options_; /**< Search options. */
956 
957  TTrackedSeedsSet seeds_; /**< The set of currently tracked seeds. */
958  TSeqNum subject_; /**< Logical id of the subject sequence containing the offset
959  value currently being considered. */
960  TWord subj_start_off_; /**< Start offset of subject_. */
961  TWord subj_end_off_; /**< End offset of subject_. */
962  TWord subj_start_; /**< Start position of subject_. */
963  TWord subj_end_; /**< One past the end position of subject_. */
964  TSeqPos qoff_; /**< Current query offset. */
965  TSeqPos soff_; /**< Current subject offset. */
966  TSeqPos qstart_; /**< Start of the current query segment. */
967  TSeqPos qstop_; /**< One past the end of the current query segment. */
968  CSeedRoots roots_; /**< Collection of initial soff/qoff pairs. */
969 
970  unsigned long code_bits_; /**< Number of bits to represent special offset prefix. */
971  unsigned long min_offset_; /**< Minumum offset used by the index. */
972 };
973 
974 //-------------------------------------------------------------------------
975 template< bool LEGACY, unsigned long NHITS, typename derived_t >
977  const TIndex_Impl & index_impl,
978  const BLAST_SequenceBlk * query,
979  const BlastSeqLoc * locs,
980  const TSearchOptions & options )
981  : index_impl_( index_impl ), query_( query ), locs_( locs ),
982  options_( options ), subject_( 0 ), subj_end_off_( 0 ),
983  roots_( index_impl_.NumSubjects() ),
984  code_bits_( GetCodeBits( index_impl.GetSubjectMap().GetStride() ) ),
985  min_offset_( GetMinOffset( index_impl.GetSubjectMap().GetStride() ) )
986 {
987  seeds_.resize(
988  index_impl_.NumSubjects() - 1,
989  TTrackedSeeds( index_impl_.GetSubjectMap(), options ) );
990  for( typename TTrackedSeedsSet::size_type i = 0; i < seeds_.size(); ++i ) {
991  seeds_[i].SetLId( (TSeqNum)i );
992  }
993 }
994 
995 //-------------------------------------------------------------------------
996 template< bool LEGACY, unsigned long NHITS, typename derived_t >
997 INLINE
999  TTrackedSeed & seed, TSeqPos nmax ) const
1000 {
1001  static const unsigned long CR = CDbIndex::CR;
1002 
1003  unsigned long hkey_width = index_impl_.hkey_width();
1004  const Uint1 * sstart = index_impl_.GetSeqStoreBase() + subj_start_;
1005  const Uint1 * spos = sstart + (seed.soff_ - (hkey_width - 1))/CR;
1006  const Uint1 * qstart = query_->sequence;
1007  const Uint1 * qpos = qstart + seed.qoff_ - (hkey_width - 1);
1008  unsigned int incomplete = (seed.soff_ - (hkey_width - 1))%CR;
1009 
1010  qstart += qstart_;
1011  nmax = nmax < options_.word_size - hkey_width ?
1012  nmax : static_cast<TSeqPos>(options_.word_size - hkey_width);
1013 
1014  while( nmax > 0 && incomplete > 0 && qpos > qstart ) {
1015  Uint1 sbyte = (((*spos)>>(2*(CR - incomplete--)))&0x3);
1016  if( *--qpos != sbyte ) return;
1017  --nmax;
1018  ++seed.len_;
1019  }
1020 
1021  nmax = (nmax < (TSeqPos)(qpos - qstart))
1022  ? nmax : (TSeqPos)(qpos - qstart);
1023  nmax = (nmax < (TSeqPos)(CR*(spos - sstart)))
1024  ? nmax : (TSeqPos)(CR*(spos - sstart));
1025  --spos;
1026 
1027  while( nmax >= CR ) {
1028  Uint1 sbyte = *spos--;
1029  Uint1 qbyte = 0;
1030  unsigned int i = 0;
1031  bool ambig( false );
1032 
1033  for( ; i < CR; ++i ) {
1034  qbyte = qbyte + ((*--qpos)<<(2*i));
1035 
1036  if( *qpos > 3 ) {
1037  ++spos;
1038  qpos += i + 1;
1039  nmax = i;
1040  ambig = true;
1041  break;
1042  }
1043  }
1044 
1045  if( ambig ) break;
1046 
1047  if( sbyte != qbyte ){
1048  ++spos;
1049  qpos += i;
1050  break;
1051  }
1052 
1053  nmax -= CR;
1054  seed.len_ += CR;
1055  }
1056 
1057  unsigned int i = 0;
1058 
1059  while( nmax > 0 ) {
1060  Uint1 sbyte = (((*spos)>>(2*(i++)))&0x3);
1061  if( sbyte != *--qpos ) return;
1062  ++seed.len_;
1063  --nmax;
1064  }
1065 }
1066 
1067 //-------------------------------------------------------------------------
1068 template< bool LEGACY, unsigned long NHITS, typename derived_t >
1069 INLINE
1071  TTrackedSeed & seed, TSeqPos nmax ) const
1072 {
1073  static const unsigned long CR = CDbIndex::CR;
1074 
1075  const Uint1 * sbase = index_impl_.GetSeqStoreBase();
1076  const Uint1 * send = sbase + subj_end_;
1077  const Uint1 * spos = sbase + subj_start_ + seed.soff_/CR;
1078  const Uint1 * qend = query_->sequence + qstop_;
1079  const Uint1 * qpos = query_->sequence + seed.qoff_ + 1;
1080  unsigned int incomplete = seed.soff_%CR;
1081 
1082  while( nmax > 0 && (++incomplete)%CR != 0 && qpos < qend ) {
1083  Uint1 sbyte = (((*spos)>>(6 - 2*incomplete))&0x3);
1084  if( *qpos++ != sbyte ) return;
1085  ++seed.len_;
1086  ++seed.qright_;
1087  --nmax;
1088  }
1089 
1090  ++spos;
1091  nmax = (nmax < (TSeqPos)(qend - qpos)) ?
1092  nmax : (TSeqPos)(qend - qpos);
1093  nmax = (nmax <= (send - spos)*CR) ?
1094  nmax : (TSeqPos)((send - spos)*CR);
1095 
1096  while( nmax >= CR ) {
1097  Uint1 sbyte = *spos++;
1098  Uint1 qbyte = 0;
1099  bool ambig( false );
1100 
1101  for( unsigned int i = 0; i < CR; ++i ) {
1102  if( *qpos > 3 ) {
1103  nmax = i;
1104  qpos -= i;
1105  --spos;
1106  ambig = true;
1107  break;
1108  }
1109 
1110  qbyte = (qbyte<<2) + *qpos++;
1111  }
1112 
1113  if( ambig ) break;
1114 
1115  if( sbyte != qbyte ) {
1116  --spos;
1117  qpos -= CR;
1118  break;
1119  }
1120 
1121  seed.len_ += CR;
1122  seed.qright_ += CR;
1123  nmax -= CR;
1124  }
1125 
1126  unsigned int i = 2*(CR - 1);
1127 
1128  while( nmax-- > 0 ) {
1129  Uint1 sbyte = (((*spos)>>i)&0x3);
1130  if( sbyte != *qpos++ ) break;
1131  ++seed.len_;
1132  ++seed.qright_;
1133  i -= 2;
1134  }
1135 
1136  return;
1137 }
1138 
1139 //-------------------------------------------------------------------------
1140 template< bool LEGACY, unsigned long NHITS, typename derived_t >
1141 INLINE
1144 {
1145  TSeqPos nmaxleft = (TSeqPos)(bounds>>code_bits_);
1146  TSeqPos nmaxright = (TSeqPos)(bounds&((1<<code_bits_) - 1));
1147  TTrackedSeed seed(
1148  qoff_, (TSeqPos)offset, (TSeqPos)index_impl_.hkey_width(), qoff_ );
1149  TTrackedSeeds & subj_seeds = seeds_[subject_];
1150  subj_seeds.EvalAndUpdate( seed );
1151 
1152  if( nmaxleft > 0 ) {
1153  ExtendLeft( seed, nmaxleft - 1 );
1154  }else {
1155  ExtendLeft( seed );
1156  }
1157 
1158  if( nmaxright > 0 ) {
1159  ExtendRight( seed, nmaxright - 1 );
1160  }else {
1161  ExtendRight( seed );
1162  }
1163 
1164  if( nmaxleft > 0 &&
1165  nmaxright == 0 &&
1166  seed.len_ < options_.word_size ) {
1167  seed.len_ = 0;
1168  subj_seeds.AppendSimple( seed );
1169  }else {
1170  subj_seeds.Append( seed, options_.word_size );
1171  }
1172 }
1173 
1174 //-------------------------------------------------------------------------
1175 template< bool LEGACY, unsigned long NHITS, typename derived_t >
1176 INLINE
1178 {
1180  qoff_, (TSeqPos)offset, (TSeqPos)index_impl_.hkey_width(), qoff_ );
1181  TTrackedSeeds & subj_seeds = seeds_[subject_];
1182 
1183  if( subj_seeds.EvalAndUpdate( seed ) ) {
1184  ExtendLeft( seed );
1185  ExtendRight( seed );
1186  if( seed.len_ >= options_.word_size )
1187  subj_seeds.AppendSimple( seed );
1188  }
1189 }
1190 
1191 //-------------------------------------------------------------------------
1192 template< bool LEGACY, unsigned long NHITS, typename derived_t >
1193 INLINE
1195  TTrackedSeeds & seeds, const SSeedRoot * root )
1196 {
1197  if( qoff_ != root->qoff_ ) {
1198  seeds.Reset();
1199  qoff_ = root->qoff_;
1200  }else if( root->soff_ >= min_offset_ &&
1201  root->soff_ < soff_ ) {
1202  seeds.Reset();
1203  }
1204 
1205  qstart_ = root->qstart_;
1206  qstop_ = root->qstop_;
1207 
1208  if( root->soff_ < min_offset_ ) {
1209  TSeqPos boundary = (root++)->soff_;
1210  ProcessBoundaryOffset( root->soff_ - static_cast<unsigned int>(min_offset_), boundary );
1211  // root->soff_ - CDbIndex::MIN_OFFSET, boundary );
1212  soff_ = root->soff_;
1213  return 2;
1214  }else {
1215  ProcessOffset( root->soff_ - static_cast<unsigned int>(min_offset_) );
1216  soff_ = root->soff_;
1217  return 1;
1218  }
1219 }
1220 
1221 //-------------------------------------------------------------------------
1222 template< bool LEGACY, unsigned long NHITS, typename derived_t >
1223 INLINE
1225 {
1226  TSeqNum num_subjects = index_impl_.NumSubjects() - 1;
1227 
1228  for( subject_ = 0; subject_ < num_subjects; ++subject_ ) {
1229  TDerived * self = static_cast< TDerived * >( this );
1230  self->SetSubjInfo();
1231  TTrackedSeeds & seeds = seeds_[subject_];
1232  const SSubjRootsInfo & rinfo = roots_.GetSubjInfo( subject_ );
1233 
1234  if( rinfo.len_ > 0 ) {
1235  const SSeedRoot * roots = roots_.GetSubjRoots( subject_ );
1236  qoff_ = 0;
1237 
1238  for( unsigned long j = 0; j < rinfo.len_; ) {
1239  j += ProcessRoot( seeds, roots + j );
1240  }
1241 
1242  if( rinfo.extra_roots_ != 0 ) {
1243  typedef SSubjRootsInfo::TRoots TRoots;
1244  roots = &(*rinfo.extra_roots_)[0];
1245 
1246  for( TRoots::size_type j = 0;
1247  j < rinfo.extra_roots_->size(); ) {
1248  j += ProcessRoot( seeds, roots + j );
1249  }
1250  }
1251  }
1252 
1253  seeds.Reset();
1254  }
1255 }
1256 
1257 //-------------------------------------------------------------------------
1258 template< bool LEGACY, unsigned long NHITS, typename derived_t >
1259 INLINE
1261 {
1262  CNmerIterator nmer_it(
1263  index_impl_.hkey_width(), query_->sequence, qstart_, qstop_ );
1264 
1265  while( nmer_it.Next() ) {
1266  typename TIndex_Impl::TOffsetIterator off_it(
1267  index_impl_.OffsetIterator(
1268  nmer_it.Nmer(), options_.word_size ) );
1269  qoff_ = nmer_it.Pos();
1270 
1271  while( off_it.More() ) {
1272  subject_ = 0;
1273  subj_end_off_ = 0;
1274 
1275  while( off_it.Next() ) {
1276  TWord offset = off_it.Offset();
1277  TDerived * self = static_cast< TDerived * >( this );
1278 
1279  if( offset < min_offset_ ) {
1280  off_it.Next();
1281  TWord real_offset = off_it.Offset();
1282  TSeqPos soff = self->DecodeOffset( real_offset );
1283  SSeedRoot r1 = { qoff_, (TSeqPos)offset, qstart_, qstop_ };
1284  SSeedRoot r2 = { qoff_, soff, qstart_, qstop_ };
1285  roots_.Add2( r1, r2, subject_ );
1286  }else {
1287  TSeqPos soff = self->DecodeOffset( offset );
1288  SSeedRoot r = { qoff_, soff, qstart_, qstop_ };
1289  roots_.Add( r, subject_ );
1290  }
1291  }
1292  }
1293 
1294  if( roots_.Overflow() ) {
1295  TSeqPos old_qstart = qstart_;
1296  TSeqPos old_qstop = qstop_;
1297 
1298  ComputeSeeds();
1299  roots_.Reset();
1300 
1301  qstart_ = old_qstart;
1302  qstop_ = old_qstop;
1303  }
1304  }
1305 }
1306 
1307 //-------------------------------------------------------------------------
1308 template< bool LEGACY, unsigned long NHITS, typename derived_t >
1311 {
1312  const BlastSeqLoc * curloc = locs_;
1313 
1314  while( curloc != 0 ) {
1315  if( curloc->ssr != 0 ) {
1316  qstart_ = curloc->ssr->left;
1317  qstop_ = curloc->ssr->right + 1;
1318  // cerr << "SEGMENT: " << qstart_ << " - " << qstop_ << endl;
1319  SearchInt();
1320  }
1321 
1322  curloc = curloc->next;
1323  }
1324 
1325  ComputeSeeds();
1326  const TSubjectMap & subject_map = index_impl_.GetSubjectMap();
1329  options_.word_size,
1330  0, index_impl_.NumChunks(), subject_map.GetSubjectMap(),
1331  index_impl_.StopSeq() - index_impl_.StartSeq() ) );
1332 
1333  for( typename TTrackedSeedsSet::size_type i = 0, k = 1;
1334  i < seeds_.size(); ++i ) {
1335  seeds_[i].Finalize();
1336  TSeqNum nchunks = subject_map.GetNumChunks( (TSeqNum)i );
1337 
1338  for( TSeqNum j = 0; j < nchunks; ++j ) {
1339  result->SetResults(
1340  (TSeqNum)(k++), seeds_[i].GetHitList( j ) );
1341  }
1342  }
1343 
1344  return result;
1345 }
1346 
1347 //-------------------------------------------------------------------------
1348 /** CSearch CRTP (to be removed). */
1349 template< bool LEGACY, unsigned long NHITS >
1350 class CSearch;
1351 
1352 //-------------------------------------------------------------------------
1353 template< bool LEGACY, unsigned long NHITS >
1354 class CSearch
1355  : public CSearch_Base< LEGACY, NHITS, CSearch< LEGACY, NHITS > >
1356 {
1357  /** @name Convenience declarations. */
1358  /**@{*/
1362  /**@}*/
1363 
1364  public:
1365 
1366  /** Object constructor.
1367  @param index_impl [I] the index implementation object
1368  @param query [I] query data encoded in BLASTNA
1369  @param locs [I] set of query locations to search
1370  @param options [I] search options
1371  */
1373  const TIndex_Impl & index_impl,
1374  const BLAST_SequenceBlk * query,
1375  const BlastSeqLoc * locs,
1376  const TSearchOptions & options )
1377  : TBase( index_impl, query, locs, options )
1378  {}
1379 
1380 
1381  /** Set the parameters of the current subject sequence. */
1383  {
1384  typedef typename TIndex_Impl::TSubjectMap TSubjectMap;
1385  const TSubjectMap & subject_map =
1386  this->index_impl_.GetSubjectMap();
1387  subject_map.SetSubjInfo(
1388  this->subject_, this->subj_start_, this->subj_end_ );
1389  }
1390 
1391  /** Decode offset value into subject position.
1392 
1393  @param offset Offset value.
1394 
1395  @return Corresponding position in the subject.
1396  */
1398  {
1399  typedef typename TIndex_Impl::TSubjectMap TSubjectMap;
1400  const TSubjectMap & subject_map =
1401  this->index_impl_.GetSubjectMap();
1402  std::pair< TSeqNum, TSeqPos > decoded =
1403  subject_map.DecodeOffset( offset );
1404  this->subject_ = decoded.first;
1405  SetSubjInfo();
1406  return decoded.second;
1407  }
1408 };
1409 
1410 //-------------------------------------------------------------------------
1412  const BLAST_SequenceBlk * query, const BlastSeqLoc * locs,
1413  const SSearchOptions & search_options )
1414 {
1415  if( search_options.two_hits == 0 )
1416  if( header_.legacy_ ) {
1417  CSearch< true, ONE_HIT > searcher(
1418  dynamic_cast< CDbIndex_Impl< true > & >(*this), query, locs, search_options );
1419  return searcher();
1420  }
1421  else {
1422  CSearch< false, ONE_HIT > searcher(
1423  dynamic_cast< CDbIndex_Impl< false > & >(*this), query, locs, search_options );
1424  return searcher();
1425  }
1426  else
1427  if( header_.legacy_ ) {
1428  CSearch< true, TWO_HIT > searcher(
1429  dynamic_cast< CDbIndex_Impl< true > & >(*this), query, locs, search_options );
1430  return searcher();
1431  }
1432  else {
1433  CSearch< false, TWO_HIT > searcher(
1434  dynamic_cast< CDbIndex_Impl< false > & >(*this), query, locs, search_options );
1435  return searcher();
1436  }
1437 }
1438 
1439 END_SCOPE( blastdbindex )
1441 
Ungapped extension structures that are common to nucleotide and protein extension routines.
BlastInitHitList * BLAST_InitHitListNew(void)
Allocate memory for the BlastInitHitList structure.
Definition: blast_extend.c:216
Boolean BLAST_SaveInitialHit(BlastInitHitList *init_hitlist, Int4 q_off, Int4 s_off, BlastUngappedData *ungapped_data)
Save the initial hit data into the initial hit list structure.
Definition: blast_extend.c:325
Structures and functions prototypes used for BLAST gapped extension.
Structures and API used for saving BLAST hits.
This class represents a set of seeds obtained by searching all subjects represented by the index.
Definition: dbindex.hpp:493
Implementation of the BLAST database index.
Definition: dbindex_sp.hpp:333
TOffsetData::TIterator TOffsetIterator
Definition: dbindex_sp.hpp:343
const TSubjectMap & GetSubjectMap() const
Get the subject map instance from the index object.
Definition: dbindex_sp.hpp:400
TSeqNum NumSubjects() const
Get the total number of logical sequences in the index.
Definition: dbindex_sp.hpp:394
Base class providing high level interface to index objects.
Definition: dbindex.hpp:435
CConstRef< CSearchResults > Search(const BLAST_SequenceBlk *query, const BlastSeqLoc *locs, const SSearchOptions &search_options)
Search the index.
SIndexHeader header_
The index header structure.
Definition: dbindex.hpp:950
static const unsigned long CR
Letters per byte in the sequence store.
Definition: dbindex.hpp:441
CMemoryFile –.
Definition: ncbifile.hpp:2861
Type used to iterate over the consecutive Nmer values of the query sequence.
bool Next()
Advance the iterator.
TWord nmer_
Nmer value reported by Nmer().
bool state_
false, if the end of the sequence has been reached.
TSeqPos stop_
One past the last position in the query.
TSeqPos pos_
Position returned by Pos().
unsigned long hkey_width_
Hash key width (in base pairs).
TSeqPos count_
Auxiliary member used to determine the next valid position.
TWord hkey_mask_
Hash key mask.
TSeqPos Pos() const
Get the current position in the query sequence.
TWord Nmer() const
Get the Nmer value corresponding to the current state of the iterator object.
const Uint1 * query_
The query data (BLASTNA encoded).
CNmerIterator(unsigned long hkey_width, const Uint1 *query, TSeqPos start, TSeqPos stop)
Object constructor.
CRef –.
Definition: ncbiobj.hpp:618
CSearch_Base –.
Definition: Search_.hpp:86
unsigned long min_offset_
Minumum offset used by the index.
void ProcessBoundaryOffset(TWord offset, TWord bounds)
Process a seed candidate that is close to the masked out or ambigous region of the subject.
void SearchInt()
Helper method to search a particular segment of the query.
const BlastSeqLoc * locs_
Set of query locations to search.
std::vector< TTrackedSeeds > TTrackedSeedsSet
Representation of the set of currently tracked seeds for all subject sequences.
TWord subj_start_
Start position of subject_.
TWord subj_start_off_
Start offset of subject_.
STrackedSeed< NHITS > TTrackedSeed
Alias for convenience.
const BLAST_SequenceBlk * query_
The query sequence encoded in BLASTNA.
TWord subj_end_
One past the end position of subject_.
derived_t TDerived
CSeedRoots roots_
Collection of initial soff/qoff pairs.
void ProcessOffset(TWord offset)
Process a regular seed candidate.
CConstRef< CDbIndex::CSearchResults > operator()()
Performs the search.
unsigned long ProcessRoot(TTrackedSeeds &seeds, const SSeedRoot *root)
Process a single root.
TTrackedSeedsSet seeds_
The set of currently tracked seeds.
TSeqNum subject_
Logical id of the subject sequence containing the offset value currently being considered.
TSeqPos qoff_
Current query offset.
const TIndex_Impl & index_impl_
The index implementation object.
CTrackedSeeds< NHITS > TTrackedSeeds
void ExtendRight(TTrackedSeed &seed, TSeqPos nmax=~(TSeqPos) 0) const
Extend a seed candidate to the right.
TSearchOptions options_
Search options.
TSeqPos soff_
Current subject offset.
void ComputeSeeds()
Compute the seeds after all roots are collected.
CSearch_Base(const TIndex_Impl &index_impl, const BLAST_SequenceBlk *query, const BlastSeqLoc *locs, const TSearchOptions &options)
Object constructor.
TSeqPos qstart_
Start of the current query segment.
TSeqPos qstop_
One past the end of the current query segment.
CDbIndex_Impl< LEGACY > TIndex_Impl
TIndex_Impl::TSubjectMap TSubjectMap
void ExtendLeft(TTrackedSeed &seed, TSeqPos nmax=~(TSeqPos) 0) const
Extend a seed candidate to the left.
unsigned long code_bits_
Number of bits to represent special offset prefix.
TWord subj_end_off_
End offset of subject_.
CDbIndex::SSearchOptions TSearchOptions
Alias for convenience.
CSearch –.
Definition: Search.hpp:68
TBase::TIndex_Impl TIndex_Impl
CSearch(const TIndex_Impl &index_impl, const BLAST_SequenceBlk *query, const BlastSeqLoc *locs, const TSearchOptions &options)
Object constructor.
CSearch_Base< LEGACY, NHITS, CSearch > TBase
TSeqPos DecodeOffset(TWord offset)
Decode offset value into subject position.
void SetSubjInfo()
Set the parameters of the current subject sequence.
TBase::TSearchOptions TSearchOptions
Seed roots container for all subjects.
CSeedRoots(TSeqNum num_subjects=0)
Object constructor.
unsigned long total_
Currenr total number of elements.
void Add(const SSeedRoot &root, TSeqNum subject)
Append a normal (non boundary) root to the container.
unsigned long subj_roots_len_bits_
Log_2 of n_subj_roots_.
~CSeedRoots()
Object destructor.
void Allocate()
Reallocate all the storage.
void Reset()
Reinitialize the structure.
static const unsigned long LIMIT_ROOTS
Max number of roots before triggering overflow.
void Add2(const SSeedRoot &root1, const SSeedRoot &root2, TSeqNum subject)
Append a boundary root (both parts) to the container.
bool Overflow() const
Check if the max number of elements is reached.
const SSubjRootsInfo & GetSubjInfo(TSeqNum subject) const
Get the set of roots for a particular subject.
SSubjRootsInfo * rinfo_
Array of root information structures for each subject.
SSubjRootsInfo::TRoots TRoots
Alias type for convenience.
SSeedRoot * roots_
Roots array preallocated for all subjects.
void CleanUp()
Clean up all the dynamically allocated memory.
TSeqNum num_subjects_
Number of subjects in the index.
static const unsigned long TOTAL_CACHE
Assumption on the amound of cache in the system.
const SSeedRoot * GetSubjRoots(TSeqNum subject) const
Return the preallocated array of roots for a particular subject.
unsigned long n_subj_roots_
Space is preallocated for this number of roots per subject.
unsigned long total_roots_
Max number of roots in preallocated storage.
Type representing subject map data.
Definition: dbindex.hpp:1028
TSeqNum GetNumChunks(TSeqNum lid) const
Get number of chunks combined into a given logical sequence.
Definition: dbindex.hpp:1110
std::pair< TSeqNum, TSeqPos > DecodeOffset(TWord offset) const
Decode offset.
Definition: dbindex.hpp:1149
const TWord * GetSubjectMap() const
Provides a mapping from real subject ids and chunk numbers to internal logical subject ids.
Definition: dbindex.hpp:1085
void SetSubjInfo(TSeqNum subj, TWord &start, TWord &end) const
Return the subject information based on the given logical subject id.
Definition: dbindex.hpp:1164
CTrackedSeeds(const TSubjectMap &subject_map, const CDbIndex::SSearchOptions &options)
Object constructor.
TBase::TSubjectMap TSubjectMap
TBase::TTrackedSeed TTrackedSeed
CTrackedSeeds(const CTrackedSeeds &rhs)
Object copy constructor.
CTrackedSeeds_Base< ONE_HIT > TBase
unsigned long word_size_
Target word size.
CTrackedSeeds_Base< TWO_HIT > TBase
TBase::TTrackedSeed TTrackedSeed
TBase::TSubjectMap TSubjectMap
unsigned long contig_len_
Min continuous length to save unconditionally.
unsigned long window_
Window for two-hit based search.
CTrackedSeeds(const TSubjectMap &subject_map, const CDbIndex::SSearchOptions &options)
Object constructor.
unsigned long stride_
Stride value used by the index.
Representation of a collection of tacked seeds for a specific subject sequence.
TSeeds::iterator TIter
TSeeds seeds_
List of seed candidates.
TIter it_
Iterator pointing to the tracked seed that is about to be inspected.
STrackedSeed< NHITS > TTrackedSeed
BlastInitHitList * GetHitList(TSeqNum num) const
Get the list of saved seeds.
const TSubjectMap * subject_map_
The subject map object.
CTrackedSeeds_Base(const TSubjectMap &subject_map)
Object constructor.
void AppendSimple(const TTrackedSeed &seed)
Add a seed to the set of tracked seeds.
TSeqNum lid_
Logical sequence number.
void SetLId(TSeqNum lid)
Set the correspondence between this object and a logical sequence.
CTrackedSeeds_Base(const CTrackedSeeds_Base &rhs)
Object copy constructor.
void Append(const TTrackedSeed &seed, unsigned long word_size)
Add a seed to the set of tracked seeds.
THitLists hitlists_
The result sets (one per chunk).
std::vector< BlastInitHitList * > THitLists
std::list< TTrackedSeed > TSeeds
void SaveSeed(const TTrackedSeed &seed)
Save the tracked seed for reporting in the search result set.
void Reset()
Prepare for processing of the next query position.
CTrackedSeeds functionality that is different depending on whether a one-hit or two-hit based search ...
static const char * bounds[]
const unsigned long TWO_HIT
Use two-hit search.
Definition: dbindex.hpp:58
unsigned long GetMinOffset(unsigned long stride)
Compute the minimum offset value needed encode offsets based on stride.
Definition: dbindex.cpp:446
const unsigned long ONE_HIT
Use one-hit search (normal).
Definition: dbindex.hpp:57
unsigned long GetCodeBits(unsigned long stride)
Compute the number of bits to encode special offsets based on stride.
Definition: dbindex.cpp:438
static const unsigned long CR
CDbIndex::TWord TWord
CMemoryFile * MapFile(const std::string &fname)
Memory map a file and return a pointer to the mapped area.
#define INLINE
Definition: dbindex_sp.hpp:42
CDbIndex::TSeqNum TSeqNum
Forwarding declarations for convenience.
Definition: dbindex_sp.hpp:45
CDbIndex::TWord TWord
Definition: dbindex_sp.hpp:46
#define true
Definition: bool.h:35
int offset
Definition: replacements.h:160
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
CSearch_Base(void)
Definition: Search_.cpp:130
int i
int len
EIPRangeType t
Definition: ncbi_localip.c:101
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static const sljit_gpr r1
static const sljit_gpr r2
Structure to hold a sequence.
Definition: blast_def.h:242
Structure to hold all initial HSPs for a given subject sequence.
Definition: blast_extend.h:158
Used to hold a set of positions, mostly used for filtering.
Definition: blast_def.h:204
SSeqRange * ssr
location data on the sequence.
Definition: blast_def.h:206
struct BlastSeqLoc * next
next in linked list
Definition: blast_def.h:205
Simple record type used to specify index search parameters.
Definition: dbindex.hpp:639
unsigned long two_hits
Window for two-hit method (see megablast docs).
Definition: dbindex.hpp:641
bool legacy_
This is a legacy index format.
Definition: dbindex.hpp:291
Representation of a seed root.
TSeqPos qstop_
1 + end of the corresponding query interval.
TSeqPos qstart_
Start of the corresponding query interval.
TSeqPos soff_
Corresponding subject offset.
TSeqPos qoff_
Query offset.
Int4 left
left endpoint of range (zero based)
Definition: blast_def.h:156
Int4 right
right endpoint of range (zero based)
Definition: blast_def.h:157
SSeedRoot container for one subject.
TRoots * extra_roots_
Storage for extra roots.
void CleanUp()
Clean up extra allocated memory.
unsigned int len_
Current number of stored roots.
std::vector< SSeedRoot > TRoots
Container implementation type.
TSeqPos qoff_
Query offset of the seed's origin.
TSeqPos qright_
Offset of the rightmost position of the seed in the query.
STrackedSeed(TSeqPos qoff, TSeqPos soff, TSeqPos len, TSeqPos qright)
Instance constructor.
TSeqPos len_
Length of the seed.
TSeqPos soff_
Subject offset of the seed's origin.
TSeqPos second_hit_
Right end of the first hit.
TSeqPos qoff_
Query offset of the seed's origin.
TSeqPos soff_
Subject offset of the seed's origin.
TSeqPos qright_
Offset of the rightmost position of the seed in the query.
TSeqPos len_
Length of the seed.
STrackedSeed(TSeqPos qoff, TSeqPos soff, TSeqPos len, TSeqPos qright)
Instance constructor.
Representation of a seed being tracked by the search algorithm.
static string subject
static string query
static int seed
Definition: test_table.cpp:132
else result
Definition: token2.c:20
static Uint4 letter(char c)
static bool ambig(char c)
#define const
Definition: zconf.h:232
Modified on Wed Sep 04 15:05:03 2024 by modify_doxy.py rev. 669887