NCBI C++ ToolKit
dbindex.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: dbindex.hpp 100101 2023-06-15 14:10:29Z merezhuk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksandr Morgulis
27  *
28  * File Description:
29  * Header file for CDbIndex and some related classes.
30  *
31  */
32 
33 #ifndef C_DB_INDEX_HPP
34 #define C_DB_INDEX_HPP
35 
36 #include <corelib/ncbiobj.hpp>
39 
40 #include "sequence_istream.hpp"
41 
43 BEGIN_SCOPE( blastdbindex )
44 
45 // Compression types.
46 const unsigned long UNCOMPRESSED = 0UL; /**< No compression. */
47 
48 // Encoding of entries in offset lists.
49 const unsigned long OFFSET_COMBINED = 1UL; /**< Combination of chunk
50  number and chunk-based
51  offset. */
52 
53 // Index bit width.
54 const unsigned long WIDTH_32 = 0UL; /**< 32-bit index. */
55 
56 // Switching between one-hit and two-hit searches.
57 const unsigned long ONE_HIT = 0UL; /**< Use one-hit search (normal). */
58 const unsigned long TWO_HIT = 1UL; /**< Use two-hit search. */
59 
60 // Level of progress reporting.
61 const unsigned long REPORT_QUIET = 0UL; /**< No progress reporting. */
62 const unsigned long REPORT_NORMAL = 1UL; /**< Normal reporting. */
63 const unsigned long REPORT_VERBOSE = 2UL; /**< Verbose reporting. */
64 
65 /** Compute the number of bits to encode special offsets based on stride.
66 
67  @param stride the value of stride
68 
69  @return number of bits necessary to encode numbers [0 - stride].
70  */
71 extern unsigned long GetCodeBits( unsigned long stride );
72 
73 /** Compute the minimum offset value needed to encode offsets based on stride.
74 
75  @param stride the value of stride
76 
77  @return minimum offset used by an index with the given stride
78  */
79 unsigned long GetMinOffset( unsigned long stride );
80 
81 /** Exceptions that superheader objects can throw. */
83 {
84 public:
85 
86  /** Numerical error codes. */
87  enum EErrCode
88  {
89  eFile, ///< filesystem error
90  eRead, ///< stream reading error
91  eWrite, ///< stream writing error
92  eEndian, ///< wrong index endianness
93  eVersion, ///< unrecognized index format version
94  eSize ///< wrong header size
95  };
96 
97  /** Get a human readable description of the exception type.
98 
99  @return string describing the exception type
100  */
101  virtual const char * GetErrCodeString() const override
102  {
103  switch( GetErrCode() ) {
104  case eFile: return "access failure";
105  case eRead: return "read failure";
106  case eWrite: return "write failure";
107  case eEndian: return "endianness mismatch";
108  case eVersion: return "unknown index format version";
109  case eSize: return "wrong header size";
110  default: return CException::GetErrCodeString();
111  }
112  }
113 
115 };
116 
117 /** Base class for index superheaders. */
119 {
120 public:
121 
122  /** Old style index without superheader.
123 
124  This should never appear in the 'version' field of superheader.
125  */
126  static const Uint4 INDEX_FORMAT_VERSION_0 = 0;
127 
128  /** Old style index with superheader. */
129  static const Uint4 INDEX_FORMAT_VERSION_1 = 1;
130 
131  /** Symbolic values for endianness. */
132  enum EEndianness { eLittleEndian = 0, eBigEndian };
133 
134  /** Get the endianness of the host system. */
135  static Uint4 GetSystemEndianness( void );
136 
137  /** Generate index volume file name from the index base name.
138 
139  @param idxname index base name
140  @param volume volume ordinal number
141 
142  @return corresponding index volume file name
143  */
144  static std::string GenerateIndexVolumeName(
145  const std::string & idxname, size_t volume );
146 
147  /** Object constructor.
148 
149  Reads the superheader structure from the file.
150 
151  @param size actual size of the superheader file
152  @param endianness superheader file endianness
153  @param version index format version
154 
155  @throw CIndexSuperHeader_Exception
156  */
157  CIndexSuperHeader_Base( size_t size, Uint4 endianness, Uint4 version );
158 
159  /** Object constructor.
160 
161  Used to create a superheader object for saving.
162 
163  @param version index format version
164  */
166 
167  /** Object destructor. */
169 
170  /** Get the endianness of the superheader. */
172 
173  /** Get the index format version. */
174  Uint4 GetVersion( void );
175 
176  /** Get number of sequences in the index (total of all volumes). */
177  virtual Uint4 GetNumSeq( void ) const = 0;
178 
179  /** Get number of volumes in the index. */
180  virtual Uint4 GetNumVol( void ) const = 0;
181 
182  /** Save the superheader into the file.
183 
184  @param fname output file name
185 
186  @throw CIndexSuperHeader_Exception
187  */
188  virtual void Save( const std::string & fname ) = 0;
189 
190 protected:
191 
192  /// Size in bytes of the common part of superheader file for all versions.
193  static const size_t COMMON_SIZE = 2*sizeof( Uint4 );
194 
195  /** Save common part to the given stream.
196 
197  @param os output stream
198  @param fname file name (for reporting)
199 
200  @throw CIndexSuperHeader_Exception
201  */
202  void Save( std::ostream & os, const std::string & fname );
203 
204  size_t actual_size_; ///< superheader file size reported by OS
205 
206 private:
207 
208  Uint4 endianness_; ///< superheader endianness
209  Uint4 version_; ///< index format version
210 };
211 
212 /** Superheader derived classes parametrized by index format version. */
213 template< Uint4 INDEX_FORMAT_VERSION > class NCBI_XBLAST_EXPORT CIndexSuperHeader;
214 
215 /** Superheader for old style indices. */
216 template<> class NCBI_XBLAST_EXPORT
217 CIndexSuperHeader< CIndexSuperHeader_Base::INDEX_FORMAT_VERSION_1 >
218  : public CIndexSuperHeader_Base
219 {
220 public:
221 
222  /** Object constructor.
223 
224  Reads the superheader structure from the file.
225 
226  @param size actual size of the superheader file
227  @param endianness superheader file endianness
228  @param version index format version
229  @param fname index superheader file name
230  @param is input stream corresponding to superheader file
231 
232  @throw CIndexSuperHeader_Exception
233  */
235  size_t size, Uint4 endianness, Uint4 version,
236  const std::string & fname, std::istream & is );
237 
238  /** Object constructor.
239 
240  Used to create a superheader object for saving.
241 
242  @param n_seq number of sequences in the database volume
243  @param n_vol number of index volumes in the index for a given
244  database volume.
245 
246  @throw CIndexSuperHeader_Exception
247  */
248  CIndexSuperHeader( Uint4 n_seq, Uint4 n_vol );
249 
250  /** Get number of sequences in the index (total of all volumes).
251 
252  @note Overrides CIndexSuperHeader_Base::GetNumSeq().
253  */
254  virtual Uint4 GetNumSeq( void ) const { return num_seq_; }
255 
256  /** Get number of volumes in the index (overrides CIndexSuperHeader_Base::GetNumVol()).
257  */
258  virtual Uint4 GetNumVol( void ) const { return num_vol_; }
259 
260  /** Save the superheader into the file.
261 
262  @param fname output file name
263 
264  @throw CIndexSuperHeader_Exception
265  */
266  virtual void Save( const std::string & fname );
267 
268 private:
269 
270  /// Expected size of the superheader file.
271  static const size_t EXPECTED_SIZE = COMMON_SIZE + 2*sizeof( Uint4 );
272 
273  Uint4 num_seq_; ///< total number of sequences in all index volumes
274  Uint4 num_vol_; ///< total number of volumes in the index
275 };
276 
277 /** Read superheader structure from the file.
278 
279  @param fname superheader file name
280 
281  @return shared pointer to the superheader object
282 
283  @throw CIndexSuperHeader_Exception
284 */
286 GetIndexSuperHeader( const std::string & fname );
287 
288 /** Structure into which an index header is loaded. */
290 {
291  bool legacy_; /**< This is a legacy index format. */
292 
293  unsigned long hkey_width_; /**< Size in bp of the Nmer used as a hash key. */
294  unsigned long stride_; /**< Stride used to index database locations. */
295  unsigned long ws_hint_; /**< Word size hint used during index creation. */
296 
297  unsigned long max_chunk_size_; /**< Chunk size used to split subjects. */
298  unsigned long chunk_overlap_; /**< Overlap of neighboring chunks. */
299 
300  CSequenceIStream::TStreamPos start_; /**< OID of the first sequence in the index. */
301  CSequenceIStream::TStreamPos start_chunk_; /**< Number of the first chunk of the first sequence in the index. */
302  CSequenceIStream::TStreamPos stop_; /**< OID of the last sequence in the index. */
303  CSequenceIStream::TStreamPos stop_chunk_; /**< Number of the last chunk of the last sequence in the index. */
304 };
305 
306 /** Read the index header information from the given file.
307  @param fname [I] name of the index volume file
308  @return the number of subjects in the index volume (from the volume header)
309 */
310 size_t GetIdxVolNumOIDs( const std::string & fname );
311 
/** A vector or pointer based sequence wrapper.

    The object works in one of two modes:
      - vector mode (the state after construction): the elements live in
        an internal std::vector and the sequence can be resized;
      - external mode (after SetPtr()): the object only refers to a
        constant size sequence reachable through an external pointer.

    @note The implicitly generated copy operations copy the element
          pointer verbatim, so a copy of an object in vector mode keeps
          pointing at the storage of the object it was copied from.
*/
template< typename T >
class CVectorWrap
{
    typedef std::vector< T > TVector; /**< Sequence type being wrapped. */

    public:

        /**@name Declarations forwarded from TVector. */
        /**@{*/
        typedef typename TVector::size_type size_type;
        typedef typename TVector::value_type value_type;
        typedef typename TVector::reference reference;
        typedef typename TVector::const_reference const_reference;
        /**@}*/

        /** Iterator type pointing to const data. */
        typedef const T * const_iterator;

        /** Object constructor.
            Initializes the object as a std::vector wrapper.
            @param sz [I] initial size
            @param v  [I] initial element value
        */
        CVectorWrap( size_type sz = 0, T v = T() )
            : base_( 0 ), data_( sz, v ), vec_( true ), size_( 0 )
        { SyncBase(); }

        /** Make the object hold an external sequence.
            After this call the object is in external mode and resize()
            has no effect.
            @param base [I] pointer to the external sequence
            @param sz   [I] size of the external sequence
        */
        void SetPtr( T * base, size_type sz )
        {
            base_ = base;
            vec_ = false;
            size_ = sz;
        }

        /** Indexing operator (no bounds checking).
            @param n [I] index
            @return reference to the n-th element
        */
        reference operator[]( size_type n )
        { return base_[n]; }

        /** Indexing operator (no bounds checking).
            @param n [I] index
            @return reference to constant value of the n-th element.
        */
        const_reference operator[]( size_type n ) const
        { return base_[n]; }

        /** Change the size of the sequence.
            Only works when the object holds a std::vector; in external
            mode the call is ignored.
            @param n [I] new sequence size
            @param v [I] initial value for newly created elements
        */
        void resize( size_type n, T v = T() )
        {
            if( vec_ ) {
                data_.resize( n, v );
                // The vector may have been reallocated or emptied.
                SyncBase();
            }
        }

        /** Get the sequence size.
            @return length of the sequence
        */
        size_type size() const
        { return vec_ ? data_.size() : size_; }

        /** Get the start of the sequence.
            @return iterator pointing to the beginning of the sequence.
        */
        const_iterator begin() const { return base_; }

        /** Get the end of the sequence.
            @return iterator pointing to past the end of the sequence.
        */
        const_iterator end() const
        { return base_ + size(); }

    private:

        /** Point base_ at the first element of data_.
            Indexing an empty std::vector is undefined, so an empty
            vector is represented by a null base_.
        */
        void SyncBase()
        { base_ = data_.empty() ? 0 : &data_[0]; }

        T * base_;       /**< Pointer to the first element of the sequence. */
        TVector data_;   /**< std::vector object wrapped by this object. */
        bool vec_;       /**< Flag indicating whether it is a wrapper or a holder of external sequence. */
        size_type size_; /**< Size of the external sequence. */
};
405 
406 /** Types of exception the indexing library can throw.
407  */
409 {
410  public:
411 
412  /** Numerical error codes. */
413  enum EErrCode
414  {
415  eBadOption, /**< Bad index creation/search option. */
416  eBadSequence, /**< Bad input sequence data. */
417  eBadVersion, /**< Wrong index version. */
418  eBadData, /**< Bad index data. */
419  eIO /**< I/O error. */
420  };
421 
422  /** Get a human readable description of the exception type.
423  @return string describing the exception type
424  */
425  virtual const char * GetErrCodeString() const override;
426 
428 };
429 
430 class CSubjectMap;
431 
432 /** Base class providing high level interface to index objects.
433  */
435 {
436  public:
437 
438  /** Letters per byte in the sequence store.
439  Sequence data is stored in the index packed 4 bases per byte.
440  */
441  static const unsigned long CR = 4;
442 
443  /** Only process every STRIDEth nmer.
444  STRIDE value of 5 allows for search of contiguous seeds of
445  length >= 16.
446  */
447  static const unsigned long STRIDE = 5;
448 
449  /** Offsets below this are reserved for special purposes.
450  Bits 0-2 of such an offset represent the distance from
451  the start of the Nmer to the next invalid base to the left of
452  the Nmer. Bits 3-5 represent the distance from the end of the
453  Nmer to the next invalid base to the right of the Nmer.
454  */
455  static const unsigned long MIN_OFFSET = 64;
456 
457  /** How many bits are used for special codes for first/last nmers.
458  See comment to MIN_OFFSET.
459  */
460  static const unsigned long CODE_BITS = 3;
461 
462  /** Index version that this library handles. */
463  static const unsigned char VERSION = (unsigned char)5;
464 
465  /** Simple record type used to specify index creation parameters.
466  */
467  struct SOptions
468  {
469  bool idmap; /**< Indicator of the index map creation. */
470  bool legacy; /**< Indicator of the legacy index format. */
471  unsigned long stride; /**< Stride to use for stored database locations. */
472  unsigned long ws_hint; /**< Most likely word size to use for searches. */
473  unsigned long hkey_width; /**< Width of the hash key in bits. */
474  unsigned long chunk_size; /**< Long sequences are split into chunks
475  of this size. */
476  unsigned long chunk_overlap; /**< Amount by which individual chunks overlap. */
477  unsigned long report_level; /**< Verbose index creation. */
478  unsigned long max_index_size; /**< Maximum index size in megabytes. */
479 
480  std::string stat_file_name; /**< File to write index statistics into. */
481  };
482 
483  /** Type used to enumerate sequences in the index. */
485 
486  /** Type representing main memory unit of the index structure. */
487  typedef Uint4 TWord;
488 
489  /** This class represents a set of seeds obtained by searching
490  all subjects represented by the index.
491  */
492  class CSearchResults : public CObject
493  {
494  /** Each vector item points to results for a particular
495  logical subject.
496  */
497  typedef vector< BlastInitHitList * > TResults;
498 
499  public:
500 
501  /** Convenience declaration */
503 
504  /** Object constructor.
505  @param word_size [I] word size used for the search
506  @param start [I] logical subject corresponding to the
507  first element of the result set
508  @param size [I] number of logical subjects covered by
509  this result set
510  @param map [I] mapping from (subject, chunk) pairs to
511  logical sequence ids
512  @param map_size [I] number of elements in map
513  */
515  unsigned long word_size,
516  TSeqNum start, TSeqNum size,
517  const TWord * map, size_t map_size )
518  : word_size_( word_size ), start_( start ), results_( size, 0 )
519  {
520  for( size_t i = 0; i < map_size; ++i ) {
521  map_.push_back( map[i] );
522  }
523  }
524 
525  /** Get the result set for a particular logical subject.
526  @param seq [I] logical subject number
527  @return pointer to a C structure describing the set of seeds
528  */
530  {
531  if( seq == 0 ) return 0;
532  else if( seq - start_ - 1 >= results_.size() ) return 0;
533  else return results_[seq - start_ - 1];
534  }
535 
536  /** Get the search word size.
537 
538  @return Word size value used for the search.
539  */
540  unsigned long GetWordSize() const { return word_size_; }
541 
542  private:
543 
544  /** Map a subject sequence and a chunk number to
545  internal logical id.
546  @param subj The subject id.
547  @param chunk The chunk number.
548  @return Internal logical id of the given sequence.
549  */
550  TSeqNum MapSubject( TSeqNum subj, TSeqNum chunk ) const
551  {
552  if( subj >= map_.size() ) return 0;
553  return (TSeqNum)(map_[subj]) + chunk;
554  }
555 
556  public:
557 
558  /** Get the result set for a particular subject and chunk.
559  @param subj The subject id.
560  @param chunk The chunk number.
561  @return pointer to a C structure describing the set of seeds
562  */
564  { return GetResults( MapSubject( subj, chunk ) ); }
565 
566  /** Check if any results are available for a given subject sequence.
567 
568  @param subj The subject id.
569 
570  @return true if at least one logical id belonging to this subject
571  has a non-null result set, false otherwise.
572  */
573  bool CheckResults( TSeqNum subj ) const
574  {
575  if( subj >= map_.size() ) return false; // subject is not in the map
576  bool res = false;
577 
578  TSeqNum start = MapSubject( subj, 0 ); // logical id of the first chunk of subj
579  TSeqNum end = MapSubject( subj + 1, 0 ); // first logical id of the next subject; 0 when subj + 1 is outside the map
580  if( end == 0 ) end = start_ + static_cast<TSeqNum>(results_.size()) + 1; // one past the last logical id stored in results_
581 
582  for( TSeqNum chunk = start; chunk < end; ++chunk ) {
583  if( GetResults( chunk ) != 0 ) {
584  res = true;
585  break;
586  }
587  }
588 
589  return res;
590  }
591 
592  /** Set the result set for a given logical subject.
593  @param seq [I] logical subject number
594  @param res [I] pointer to the C structure describing
595  the set of seeds
596  */
598  {
599  if( seq > 0 && seq - start_ - 1 < results_.size() ) {
600  results_[seq - start_ - 1] = res;
601  }
602  }
603 
604  /** Object destructor. */
606  {
607  for( TResults::iterator it = results_.begin();
608  it != results_.end(); ++it ) {
609  if( *it ) {
610  BLAST_InitHitListFree( *it );
611  }
612  }
613  }
614 
615  /** Get the number of logical sequences in the results set.
616  @return number of sequences in the result set
617  */
618  TSeqNum NumSeq() const { return static_cast<TSeqNum>(results_.size()); }
619 
620  private:
621 
622  unsigned long word_size_; /**< Word size used for the search. */
623  TSeqNum start_; /**< Starting logical subject number. */
624  TResults results_; /**< The combined result set. */
625  vector< Uint8 > map_; /**< (subject,chunk)->(logical id) map. */
626  };
627 
628  /** Creates an SOptions instance initialized with default values.
629 
630  @return instance of SOptions filled with default option values
631  */
632  static SOptions DefaultSOptions();
633 
634  /** Simple record type used to specify index search parameters.
635  For description of template types see documentation for
636  CDbIndex::SOptions.
637  */
639  {
640  unsigned long word_size; /**< Target seed length. */
641  unsigned long two_hits; /**< Window for two-hit method (see megablast docs). */
642  };
643 
644  /** Create an index object.
645 
646  Creates an instance of CDbIndex using the named resource as input.
647  The name of the resource is given by the <TT>fname</TT> parameter.
648 
649  @param fname [I] input file name
650  @param oname [I] output file name
651  @param start [I] number of the first sequence in the index
652  @param start_chunk [I] number of the first chunk at which the starting
653  sequence should be processed
654  @param stop [I/O] number of the last sequence in the index;
655  returns the number of the actual last sequence
656  stored
657  @param stop_chunk [I/O] number of the last chunk of the last sequence
658  in the index
659  @param options [I] index creation parameters
660  */
661  static void MakeIndex(
662  const std::string & fname,
663  const std::string & oname,
664  TSeqNum start, TSeqNum start_chunk,
665  TSeqNum & stop, TSeqNum & stop_chunk,
666  const SOptions & options
667  );
668 
669  /** Create an index object.
670 
671  This function is the same as
672  CDbIndex::MakeIndex( fname, oname, start, start_chunk, stop, stop_chunk, options )
673  with start_chunk set to 0.
674  */
675  static void MakeIndex(
676  const std::string & fname,
677  const std::string & oname,
678  TSeqNum start,
679  TSeqNum & stop, TSeqNum & stop_chunk,
680  const SOptions & options )
681  { MakeIndex( fname, oname, start, 0, stop, stop_chunk, options ); }
682 
683  /** Create an index object.
684 
685  This function is the same as
686  CDbIndex::MakeIndex( fname, oname, start, stop, stop_chunk, options )
687  except that it does not need <TT>stop_chunk</TT> parameter and
688  can only be used to create indices containing whole sequences.
689  */
690  static void MakeIndex(
691  const std::string & fname,
692  const std::string & oname,
693  TSeqNum start, TSeqNum & stop,
694  const SOptions & options
695  );
696 
697  /** Create an index object.
698 
699  Creates an instance of CDbIndex using a given stream as input.
700 
701  @param input [I] stream for reading sequence and mask information
702  @param oname [I] output file name
703  @param start [I] number of the first sequence in the index
704  @param start_chunk [I] number of the first chunk at which the starting
705  sequence should be processed
706  @param stop [I/O] number of the last sequence in the index;
707  returns the number of the actual last sequence
708  stored
709  @param stop_chunk [I/O] number of the last chunk of the last sequence
710  in the index
711  @param options [I] index creation parameters
712  */
713  static void MakeIndex(
715  const std::string & oname,
716  TSeqNum start, TSeqNum start_chunk,
717  TSeqNum & stop, TSeqNum & stop_chunk,
718  const SOptions & options
719  );
720 
721  /** Create an index object.
722 
723  This function is the same as
724  CDbIndex::MakeIndex( input, oname, start, start_chunk, stop, stop_chunk, options )
725  with start_chunk set to 0.
726  */
727  static void MakeIndex(
729  const std::string & oname,
730  TSeqNum start,
731  TSeqNum & stop, TSeqNum & stop_chunk,
732  const SOptions & options )
733  { MakeIndex( input, oname, start, 0, stop, stop_chunk, options ); }
734 
735  /** Create an index object.
736 
737  This function is the same as
738  CDbIndex::MakeIndex( input, oname, start, stop, stop_chunk, options )
739  except that it does not need <TT>stop_chunk</TT> parameter and
740  can only be used to create indices containing whole sequences.
741  */
742  static void MakeIndex(
744  const std::string & oname,
745  TSeqNum start, TSeqNum & stop,
746  const SOptions & options
747  );
748 
749  /** Load index.
750 
751  @param fname [I] file containing index data
752 
753  @return CRef to the loaded index
754  */
755  static CRef< CDbIndex > Load( const std::string & fname, bool nomap = false );
756 
757  /** Search the index.
758 
759  @param query [I] the query sequence in BLASTNA format
760  @param locs [I] which parts of the query to search
761  @param search_options [I] search parameters
762  */
764  const BLAST_SequenceBlk * query,
765  const BlastSeqLoc * locs,
766  const SSearchOptions & search_options
767  );
768 
769  /** Index object destructor. */
770  virtual ~CDbIndex() {}
771 
772  /** Get the OID of the first sequence in the index.
773  @return OID of the first sequence in the index
774  */
775  TSeqNum StartSeq() const { return start_; }
776 
777  /** Get the number of the first chunk of the first sequence
778  in the index.
779  @return the number of the first sequence chunk in the index
780  */
781  TSeqNum StartChunk() const { return start_chunk_; }
782 
783  /** Get the OID of the last sequence in the index.
784  @return OID of the last sequence in the index
785  */
786  TSeqNum StopSeq() const { return stop_; }
787 
788  /** Get the number of the last chunk of the last sequence
789  in the index.
790  @return the number of the last sequence chunk in the index
791  */
792  TSeqNum StopChunk() const { return stop_chunk_; }
793 
794  /** Get the length of the subject sequence.
795 
796  @param oid Ordinal id of the subject sequence.
797  @throw CDbIndex_Exception with code eBadVersion; this default implementation only raises the error via NCBI_THROW.
798  @return Length of the sequence in bases.
799  */
800  virtual TSeqPos GetSeqLen( TSeqNum /*oid*/ ) const
801  {
802  NCBI_THROW(
803  CDbIndex_Exception, eBadVersion,
804  "GetSeqLen() is not supported in this index version." );
805  return 0;
806  }
807 
808  /** Get the sequence data of the subject sequence.
809 
810  @param oid Ordinal id of the subject sequence.
811  @throw CDbIndex_Exception with code eBadVersion; this default implementation only raises the error via NCBI_THROW.
812  @return Pointer to the sequence data.
813  */
814  virtual const Uint1 * GetSeqData( TSeqNum /*oid*/ ) const
815  {
816  NCBI_THROW(
817  CDbIndex_Exception, eBadVersion,
818  "GetSeqData() is not supported in this index version." );
819  return 0;
820  }
821 
822  /** If possible reduce the index footprint by unmapping
823  the portion that does not contain sequence data.
824  The default implementation does nothing. */
825  virtual void Remap() {}
826 
827  private:
828 
829  /** Load index from an open stream.
830  @param is [I] stream containing index data
831  @return object containing loaded index data
832  */
834 
835  /** Load index from a named file.
836  Usually this is used to memmap() the file data into
837  the index structure.
838  @param fname [I] index file name
839  @param nomap [I] if 'true', then read the file
840  instead of mmap()'ing it
841  @return object containing loaded index data
842  */
843  template< bool LEGACY >
844  static CRef< CDbIndex > LoadIndex(
845  const std::string & fname, bool nomap = false );
846 
847  /** Actual implementation of seed searching.
848  Must be implemented by child classes.
849  @sa Search
850  */
852  const BLAST_SequenceBlk *,
853  const BlastSeqLoc *,
854  const SSearchOptions & )
855  { return CConstRef< CSearchResults >( null ); }
856 
857  public:
858 
860  {
863  };
864 
866 
868 
869  TSeqNum getStartOId() const { return header_.start_; }
870  TSeqNum getStopOId() const { return header_.stop_; }
871 
873  {
874  ASSERT( oid >= getStartOId() );
875  return oid - getStartOId();
876  }
877 
879  {
880  ASSERT( sid <= getStopOId() - getStartOId() );
881  return sid + getStartOId();
882  }
883 
884  unsigned long getHKeyWidth() const { return header_.hkey_width_; }
885  unsigned long getStride() const { return header_.stride_; }
886  unsigned long getWSHint() const { return header_.ws_hint_; }
887 
888  unsigned long getMaxChunkSize() const { return header_.max_chunk_size_; }
889  unsigned long getChunkOverlap() const { return header_.chunk_overlap_; }
890 
891  bool isLegacy() const { return header_.legacy_; }
892 
893  TWord getSubjectLength( TSeqNum sid ) const;
894  TSeqNum getCId( TSeqNum sid, TSeqNum rcid ) const;
895  TSeqNum getCId( TSeqNum sid ) const { return getCId( sid, 0 ); }
896  pair< TSeqNum, TSeqNum > getSRCId( TSeqNum cid ) const;
897  TSeqNum getSIdByCId( TSeqNum cid ) const { return getSRCId( cid ).first; }
898  TWord getChunkLength( TSeqNum cid ) const;
899  TWord getChunkLength( TSeqNum sid, TSeqNum rcid ) const
900  { return getChunkLength( getCId( sid, rcid ) ); }
901  TSeqNum getCIdByLRCId( TSeqNum lid, TSeqNum rcid ) const;
903  { return getSIdByCId( getCIdByLRCId( lid, rcid ) ); }
904  pair< TSeqNum, TSeqPos > getRCIdOffByLIdOff( TSeqNum lid, TSeqPos loff ) const;
905 
906  pair< TSeqNum, TSeqPos > getCIdOffByLIdOff( TSeqNum lid, TSeqPos loff ) const
907  {
908  pair< TSeqNum, TSeqPos > t = getRCIdOffByLIdOff( lid, loff );
909  return make_pair( getCIdByLRCId( lid, t.first ), t.second );
910  }
911 
912  TSeqPos getSOff( TSeqNum sid, TSeqNum rcid, TSeqPos coff ) const;
913 
914  pair< TSeqNum, TSeqPos > getSIdOffByCIdOff( TSeqNum cid, TSeqPos coff ) const
915  {
916  pair< TSeqNum, TSeqNum > t = getSRCId( cid );
917  return make_pair( t.first, getSOff( t.first, t.second, coff ) );
918  }
919 
920  pair< TSeqNum, TSeqPos > getSIdOffByLIdOff( TSeqNum lid, TSeqPos loff ) const
921  {
922  pair< TSeqNum, TSeqPos > t = getCIdOffByLIdOff( lid, loff );
923  return getSIdOffByCIdOff( t.first, t.second );
924  }
925 
926  TSeqNum getNumSubjects() const;
927  TSeqNum getNumChunks() const;
928  TSeqNum getNumChunks( TSeqNum sid ) const;
929 
930  const Uint1 * getSeqData( TSeqNum sid ) const;
931 
932  TSeqNum getLId( const TOffsetValue & v ) const;
933  TSeqPos getLOff( const TOffsetValue & v ) const;
934 
935  const string getBioseqIdBySId( TSeqNum sid ) const
936  {
937  if( sid < idmap_.size() ) return idmap_[sid];
938  else return "unknown";
939  }
940 
941  const vector< string > & getIdMap() const { return idmap_; }
942 
943  protected:
944 
945  TSeqNum start_; /**< OID of the first sequence in the index. */
946  TSeqNum start_chunk_; /**< Number of the first chunk of the first sequence. */
947  TSeqNum stop_; /**< OID of the last sequence in the index. */
948  TSeqNum stop_chunk_; /**< Number of the last chunk of the last sequence. */
949 
950  SIndexHeader header_; /**< The index header structure. */
951  TSubjectMap * subject_map_; /**< The subject map object. */
952  vector< string > idmap_; /**< Mapping from source ids to bioseq ids. */
953 };
954 
955 /** Class representing index hash table and offset list database.
956 */
958 {
960 
961  public:
962 
963  /** Index word type (public to support Solaris). */
965 
967 
968  /** The type of the hash table.
969  The hash table implements the mapping from Nmer values to
970  the corresponding offset lists.
971  */
973 
974  /** Object constructor.
975  Creates the object by mapping data from a memory segment.
976  @param map [I/O] pointer to the memory segment
977  @param hkey_width [I] width in bp of the hash key
978  @param stride [I] stride of the index
979  @param ws_hint [I] ws_hint value of the index
980  */
982  TWord ** map, unsigned long hkey_width,
983  unsigned long stride, unsigned long ws_hint );
984 
985  /** Get the width of the hash key in base pairs.
986  @return hash key width
987  */
988  unsigned long hkey_width() const { return hkey_width_; }
989 
990  /** Accessor for minimum offset value.
991 
992  @return the minimum offset value
993  */
994  unsigned long getMinOffset() const { return min_offset_; }
995 
996  /** Accessor for stride value.
997 
998  @return the stride value
999  */
1000  unsigned long getStride() const { return stride_; }
1001 
1002  /** Accessor for ws_hint value.
1003 
1004  @return the ws_hint value
1005  */
1006  unsigned long getWSHint() const { return ws_hint_; }
1007 
1008  protected:
1009 
1010  /** Auxiliary data member used for importing the offset
1011  list data.
1012  */
1014 
1015  unsigned long hkey_width_; /**< Hash key width in bp. */
1016  unsigned long stride_; /**< Stride value used by the index. */
1017  unsigned long ws_hint_; /**< ws_hint values used by the index. */
1018  unsigned long min_offset_; /**< Minimum offset value used by the index. */
1019 
1020  THashTable hash_table_; /**< The hash table (mapping from
1021  Nmer values to the lists of
1022  offsets). */
1023 };
1024 
1025 /** Type representing subject map data.
1026 */
1028 {
1029  private:
1030 
1034 
1035  /** Type used to map database oids to the chunk info. */
1037 
1038  /** Type used for compressed subject sequence data storage. */
1040 
1041  /** Type for storing the chunk data.
1042  For raw offset encoding the offset into the vector serves also
1043  as the internal logical sequence id.
1044  */
1046 
1047  typedef CVectorWrap< TWord > TLengths; /**< Subject lengths storage type. */
1048  typedef CVectorWrap< TWord > TLIdMap; /**< Local id -> chunks map storage type. */
1049 
1050  public:
1051 
1052  /** Trivial constructor. */
1053  CSubjectMap() : total_( 0 ) {}
1054 
1055  /** Constructs object by mapping to the memory segment.
1056  @param map [I/O] pointer to the memory segment
1057  @param start [I] database oid of the first sequence
1058  in the map
1059  @param stop [I] database oid of the last sequence
1060  in the map
1061  @param stride [I] index stride value
1062  */
1063  CSubjectMap(
1064  TWord ** map, TSeqNum start, TSeqNum stop,
1065  unsigned long stride );
1066 
1067  CSubjectMap( TWord ** map, const SIndexHeader & header );
1068 
1069  /** Loads index by mapping to the memory segment.
1070  @param map [I/O] pointer to the memory segment
1071  @param start [I] database oid of the first sequence
1072  in the map
1073  @param stop [I] database oid of the last sequence
1074  in the map
1075  @param stride [I] index stride value
1076  */
1077  void Load(
1078  TWord ** map, TSeqNum start, TSeqNum stop,
1079  unsigned long stride );
1080 
1081  /** Provides a mapping from real subject ids and chunk numbers to
1082  internal logical subject ids.
1083  @return start of the (subject,chunk)->id mapping
1084  */
1085  const TWord * GetSubjectMap() const { return &subjects_[0]; }
1086 
1087  /** Return the start of the raw storage for compressed subject
1088  sequence data.
1089  @return start of the sequence data storage
1090  */
1091  const Uint1 * GetSeqStoreBase() const { return &seq_store_[0]; }
1092 
1093  /** Return the size in bytes of the raw sequence storage.
1094 
1095  @return Size of the sequence data storage.
1096  */
1097  TWord GetSeqStoreSize() const { return total_; }
1098 
1099  /** Get the total number of sequence chunks in the map.
1100  @return number of chunks in the map
1101  */
1102  TSeqNum NumChunks() const { return (TSeqNum)(chunks_.size()); }
1103 
1104  /** Get number of chunks combined into a given logical sequence.
1105 
1106  @param lid The logical sequence id.
1107 
1108  @return Corresponding number of chunks.
1109  */
1111  {
1112  TWord * ptr = (TWord *)&lid_map_[0] + (lid<<2);
1113  return *(ptr + 1) - *ptr;
1114  }
1115 
1116  /** Get the logical sequence id from the database oid and the
1117  chunk number.
1118  @param subject [I] database oid
1119  @param chunk [I] the chunk number
1120  @return logical sequence id corresponding to subject and chunk
1121  */
1123  {
1124  if( subject < subjects_.size() ) {
1125  TSeqNum result =
1126  (TSeqNum)(subjects_[subject]) + chunk;
1127 
1128  if( result < chunks_.size() ) {
1129  return result;
1130  }
1131  }
1132 
1133  return 0;
1134  }
1135 
1136  /** Accessor for stride value.
1137 
1138  @return the stride value used by the index
1139  */
1140  unsigned long GetStride() const { return stride_; }
1141 
1142  /** Decode offset.
1143 
1144  @param offset The encoded offset value.
1145 
1146  @return A pair with first element being the local subject sequence
1147  id and the second element being the subject offset.
1148  */
1149  std::pair< TSeqNum, TSeqPos > DecodeOffset( TWord offset ) const
1150  {
1151  offset -= min_offset_;
1152  return std::make_pair(
1154  (TSeqPos)(min_offset_ +
1156  }
1157 
1158  /** Return the subject information based on the given logical subject
1159  id.
1160  @param subj [I] logical subject id
1161  @param start [O] starting offset of subj in the sequence store
1162  @param end [O] 1 + ending offset of subj in the sequence store
1163  */
1165  TSeqNum subj, TWord & start, TWord & end ) const
1166  {
1167  TWord * ptr = (TWord *)&lid_map_[0] + (subj<<2) + 2;
1168  start = *ptr++;
1169  end = *ptr;
1170  }
1171 
1172  /** Map logical sequence id and logical sequence offset to
1173  relative chunk number and chunk offset.
1174 
1175  @param lid The logical sequence id.
1176  @param soff The logical sequence offset.
1177 
1178  @return Pair of relative chunk number and chunk offset.
1179  */
1180  std::pair< TSeqNum, TSeqPos > MapSubjOff(
1181  TSeqNum lid, TSeqPos soff ) const
1182  {
1183  static const unsigned long CR = CDbIndex::CR;
1184 
1185  TWord * ptr = (TWord *)&lid_map_[0] + (lid<<2);
1186  TSeqNum start = (TSeqNum)*ptr++;
1187  TSeqNum end = (TSeqNum)*ptr++;
1188  TWord lid_start = *ptr;
1189  TWord abs_offset = lid_start + (TWord)soff/CR;
1190 
1191  typedef TChunks::const_iterator TChunksIter;
1192  TChunksIter siter = chunks_.begin() + start;
1193  TChunksIter eiter = chunks_.begin() + end;
1194  ASSERT( siter != eiter );
1195  TChunksIter res = std::upper_bound( siter, eiter, abs_offset );
1196  ASSERT( res != siter );
1197  --res;
1198 
1199  return std::make_pair(
1200  (TSeqNum)(res - siter),
1201  (TSeqPos)(soff - (*res - lid_start)*CR) );
1202  }
1203 
1204  /** Map logical id and relative chunk to absolute chunk id.
1205 
1206  @param lid logical sequence id
1207  @param lchunk chunk number within the logical sequence
1208 
1209  @return chunk id of the corresponding chunk
1210  */
1211  TSeqNum MapLId2Chunk( TSeqNum lid, TSeqNum lchunk ) const
1212  {
1213  TWord * ptr = (TWord *)&lid_map_[0] + (lid<<2);
1214  TSeqNum start = (TSeqNum)*ptr++;
1215  return start + lchunk;
1216  }
1217 
1218  /** Get the total number of logical sequences in the map.
1219  @return number of logical sequences in the map
1220  */
1222  { return static_cast<TSeqNum>(1 + (lid_map_.size()>>2)); }
1223 
1224  /** Get the length of the subject sequence.
1225 
1226  @param oid Ordinal id of the subject sequence.
1227 
1228  @return Length of the sequence in bases.
1229  */
1231  { return lengths_[oid]; }
1232 
1233  /** Get the sequence data of the subject sequence.
1234 
1235  @param oid Ordinal id of the subject sequence.
1236 
1237  @return Pointer to the sequence data.
1238  */
1239  const Uint1 * GetSeqData( TSeqNum oid ) const
1240  {
1241  TWord chunk = subjects_[oid] - 1;
1242  TWord start_index = chunks_[chunk];
1243  return &seq_store_[0] + start_index;
1244  }
1245 
1247  {
1248  ASSERT( sid <= subjects_.size() );
1249  return lengths_[sid];
1250  }
1251 
1252  TSeqNum getCId( TSeqNum sid, TSeqNum rcid ) const
1253  {
1254  ASSERT( sid <= subjects_.size() );
1255  TSeqNum result = subjects_[sid] + rcid - 1;
1256  ASSERT( result <= chunks_.size() );
1257  return result;
1258  }
1259 
1260  typedef pair< TSeqNum, TSeqPos > TSOPair;
1261  typedef pair< TSeqNum, TSeqNum > TSCPair;
1262  typedef vector< TSCPair > TSCPairMap;
1263 
1264  TSCPair getSRCId( TSeqNum cid ) const
1265  {
1266  ASSERT( cid < chunks_.size() );
1267  return c2s_map_[cid];
1268  }
1269 
1271  {
1272  ASSERT( cid < chunks_.size() );
1273  if( cid < chunks_.size() - 1 ) {
1274  TSCPair t = getSRCId( cid );
1275 
1276  if( t.first < subjects_.size() - 1 ) {
1277  TSeqNum nc = subjects_[t.first + 1] - subjects_[t.first];
1278  return (t.second == nc - 1) ? static_cast<TWord>(max_chunk_size_) :
1279  getSubjectLength( t.first )%(
1280  static_cast<TWord>(max_chunk_size_ - chunk_overlap_ ));
1281  }
1282  else return static_cast<TWord>(max_chunk_size_);
1283  }
1284  else {
1285  return static_cast<TWord>( getSubjectLength( static_cast<TSeqNum>(subjects_.size() - 2) )%(
1287  }
1288  }
1289 
1291  {
1292  ASSERT( lid < lid_map_.size() );
1293  TWord * ptr = (TWord *)&lid_map_[0] + (lid<<2);
1294  TSeqNum start = (TSeqNum)*ptr++;
1295  ASSERT( rcid < (TSeqNum)*ptr - start );
1296  return start + rcid;
1297  }
1298 
1300  {
1301  ASSERT( lid < lid_map_.size() );
1302  static const unsigned long CR = CDbIndex::CR;
1303 
1304  TWord * ptr = (TWord *)&lid_map_[0] + (lid<<2);
1305  TSeqNum start = (TSeqNum)*ptr++;
1306  TSeqNum end = (TSeqNum)*ptr++;
1307  ASSERT( start < chunks_.size() );
1308  ASSERT( end <= chunks_.size() );
1309  TWord lid_start = *ptr;
1310  TWord abs_offset = lid_start + (TWord)loff/CR;
1311  ASSERT( abs_offset < seq_store_.size() );
1312 
1313  typedef TChunks::const_iterator TChunksIter;
1314  TChunksIter siter = chunks_.begin() + start;
1315  TChunksIter eiter = chunks_.begin() + end;
1316  ASSERT( siter != eiter );
1317  TChunksIter res = std::upper_bound( siter, eiter, abs_offset );
1318  ASSERT( res != siter );
1319  --res;
1320 
1321  return std::make_pair(
1322  (TSeqNum)(res - siter),
1323  (TSeqPos)(loff - (*res - lid_start)*CR) );
1324  }
1325 
1327  TSeqNum rcid, TSeqPos coff ) const
1328  {
1329  ASSERT( sid < subjects_.size() - 1 );
1330  ASSERT( subjects_[sid] - 1 + rcid < chunks_.size() );
1331  TSeqPos res = static_cast<TSeqPos>(rcid*(max_chunk_size_ - chunk_overlap_) + coff);
1332  ASSERT( res < lengths_[sid] );
1333  return res;
1334  }
1335 
1336  TSeqNum getNumSubjects() const { return static_cast<TSeqNum>(subjects_.size() - 1); }
1337  TSeqNum getNumChunks() const { return static_cast<TSeqNum>(chunks_.size()); }
1338 
1340  {
1341  ASSERT( sid < subjects_.size() -1 );
1342  if( sid < subjects_.size() - 2 ) {
1343  return subjects_[sid + 1] - subjects_[sid];
1344  }
1345  else return static_cast<TSeqNum>( chunks_.size() + 1 - subjects_[sid]);
1346  }
1347 
1348  const Uint1 * getSeqData( TSeqNum sid ) const
1349  {
1350  ASSERT( sid < subjects_.size() - 1 );
1351  TWord chunk = subjects_[sid] - 1;
1352  TWord start_index = chunks_[chunk];
1353  return &seq_store_[0] + start_index;
1354  }
1355 
1356  TSeqNum getLId( const TOffsetValue & v ) const
1357  { return (TSeqNum)(v.offset>>offset_bits_); }
1358 
1359  TSeqPos getLOff( const TOffsetValue & v ) const
1360  { return (TSeqPos)((v.offset&offset_mask_)*stride_); }
1361 
1362  private:
1363 
1364  /** Set up the sequence store from the memory segment.
1365  @param map [I/O] points to the memory segment
1366  */
1367  void SetSeqDataFromMap( TWord ** map );
1368 
1369  TSubjects subjects_; /**< Mapping from database oids to the chunk info. */
1370  TSeqStore seq_store_; /**< Storage for the raw subject sequence data. */
1371  TWord total_; /**< Size in bytes of the raw sequence storage.
1372  (only valid after the complete object has
1373  been constructed) */
1374  TChunks chunks_; /**< Collection of individual chunk descriptors. */
1375 
1376  unsigned long stride_; /**< Index stride value. */
1377  unsigned long min_offset_; /**< Minimum offset used by the index. */
1378 
1379  TLengths lengths_; /**< Subject lengths storage. */
1380  TLIdMap lid_map_; /**< Local id -> chunk map storage. */
1381  Uint1 offset_bits_; /**< Number of bits used to encode offset. */
1382  TWord offset_mask_; /**< Mask to extract offsets. */
1383  TSCPairMap c2s_map_; /**< CId -> (SId, RCId) map. */
1384 
1385  unsigned long max_chunk_size_;
1386  unsigned long chunk_overlap_;
1387 };
1388 
1389 inline CDbIndex::TWord
1391 { return subject_map_->getSubjectLength( sid ); }
1392 
1393 inline CDbIndex::TSeqNum
1395 { return subject_map_->getCId( sid, rcid ); }
1396 
1397 inline pair< CDbIndex::TSeqNum, CDbIndex::TSeqNum >
1399 { return subject_map_->getSRCId( cid ); }
1400 
1402 { return subject_map_->getChunkLength( cid ); }
1403 
1404 inline CDbIndex::TSeqNum
1406 { return subject_map_->getCIdByLRCId( lid, rcid ); }
1407 
1408 inline pair< CDbIndex::TSeqNum, TSeqPos >
1410 { return subject_map_->getRCIdOffByLIdOff( lid, loff ); }
1411 
1413  CDbIndex::TSeqNum sid, CDbIndex::TSeqNum rcid, TSeqPos coff ) const
1414 { return subject_map_->getSOff( sid, rcid, coff ); }
1415 
1417 { return subject_map_->getNumSubjects(); }
1418 
1420 { return subject_map_->getNumChunks(); }
1421 
1423 { return subject_map_->getNumChunks( sid ); }
1424 
1425 inline const Uint1 * CDbIndex::getSeqData( CDbIndex::TSeqNum sid ) const
1426 { return subject_map_->getSeqData( sid ); }
1427 
1429  const CDbIndex::TOffsetValue & v ) const
1430 { return subject_map_->getLId( v ); }
1431 
1433 { return subject_map_->getLOff( v ); }
1434 
1435 END_SCOPE( blastdbindex )
1437 
1438 #endif
1439 
Definitions used throughout BLAST.
#define NCBI_XBLAST_EXPORT
NULL operations for other cases.
Definition: blast_export.h:65
Ungapped extension structures that are common to nucleotide and protein extension routines.
BlastInitHitList * BLAST_InitHitListFree(BlastInitHitList *init_hitlist)
Free memory for the BlastInitList structure.
Definition: blast_extend.c:261
#define true
Definition: bool.h:35
CConstRef –.
Definition: ncbiobj.hpp:1266
This class represents a set of seeds obtained by searching all subjects represented by the index.
Definition: dbindex.hpp:493
unsigned long GetWordSize() const
Get the search word size.
Definition: dbindex.hpp:540
void SetResults(TSeqNum seq, BlastInitHitList *res)
Set the result set for a given logical subject.
Definition: dbindex.hpp:597
TSeqNum start_
Starting logical subject number.
Definition: dbindex.hpp:623
CSearchResults(unsigned long word_size, TSeqNum start, TSeqNum size, const TWord *map, size_t map_size)
Object constructor.
Definition: dbindex.hpp:514
BlastInitHitList * GetResults(TSeqNum subj, TSeqNum chunk) const
Get the result set for a particular subject and chunk.
Definition: dbindex.hpp:563
vector< Uint8 > map_
(subject,chunk)->(logical id) map.
Definition: dbindex.hpp:625
~CSearchResults()
Object destructor.
Definition: dbindex.hpp:605
TSeqNum MapSubject(TSeqNum subj, TSeqNum chunk) const
Map a subject sequence and a chunk number to internal logical id.
Definition: dbindex.hpp:550
CDbIndex::TWord TWord
Convenience declaration.
Definition: dbindex.hpp:502
unsigned long word_size_
Word size used for the search.
Definition: dbindex.hpp:622
bool CheckResults(TSeqNum subj) const
Check if any results are available for a given subject sequence.
Definition: dbindex.hpp:573
BlastInitHitList * GetResults(TSeqNum seq) const
Get the result set for a particular logical subject.
Definition: dbindex.hpp:529
vector< BlastInitHitList * > TResults
Each vector item points to results for a particular logical subject.
Definition: dbindex.hpp:497
TResults results_
The combined result set.
Definition: dbindex.hpp:624
TSeqNum NumSeq() const
Get the number of logical sequences in the results set.
Definition: dbindex.hpp:618
Types of exception the indexing library can throw.
Definition: dbindex.hpp:409
EErrCode
Numerical error codes.
Definition: dbindex.hpp:414
@ eBadOption
Bad index creation/search option.
Definition: dbindex.hpp:415
@ eBadVersion
Wrong index version.
Definition: dbindex.hpp:417
@ eBadData
Bad index data.
Definition: dbindex.hpp:418
@ eBadSequence
Bad input sequence data.
Definition: dbindex.hpp:416
NCBI_EXCEPTION_DEFAULT(CDbIndex_Exception, CException)
Base class providing high level interface to index objects.
Definition: dbindex.hpp:435
TSeqNum StartSeq() const
Get the OID of the first sequence in the index.
Definition: dbindex.hpp:775
pair< TSeqNum, TSeqPos > getSIdOffByCIdOff(TSeqNum cid, TSeqPos coff) const
Definition: dbindex.hpp:914
TSeqNum getNumSubjects() const
Definition: dbindex.hpp:1416
static CRef< CDbIndex > LoadIndex(CNcbiIstream &is)
Load index from an open stream.
const Uint1 * getSeqData(TSeqNum sid) const
Definition: dbindex.hpp:1425
TSeqNum StopChunk() const
Get the number of the last chunk of the last sequence in the index.
Definition: dbindex.hpp:792
TSeqNum getSIdByLRCId(TSeqNum lid, TSeqNum rcid) const
Definition: dbindex.hpp:902
TSeqNum getSIdByOId(TSeqNum oid) const
Definition: dbindex.hpp:872
bool isLegacy() const
Definition: dbindex.hpp:891
static void MakeIndex(CSequenceIStream &input, const std::string &oname, TSeqNum start, TSeqNum &stop, TSeqNum &stop_chunk, const SOptions &options)
Create an index object.
Definition: dbindex.hpp:727
CSubjectMap TSubjectMap
Definition: dbindex.hpp:867
TSeqPos getSOff(TSeqNum sid, TSeqNum rcid, TSeqPos coff) const
Definition: dbindex.hpp:1412
pair< TSeqNum, TSeqPos > getRCIdOffByLIdOff(TSeqNum lid, TSeqPos loff) const
Definition: dbindex.hpp:1409
TSeqNum getCIdByLRCId(TSeqNum lid, TSeqNum rcid) const
Definition: dbindex.hpp:1405
const vector< string > & getIdMap() const
Definition: dbindex.hpp:941
TSeqNum getCId(TSeqNum sid, TSeqNum rcid) const
Definition: dbindex.hpp:1394
pair< TSeqNum, TSeqNum > getSRCId(TSeqNum cid) const
Definition: dbindex.hpp:1398
virtual ~CDbIndex()
Index object destructor.
Definition: dbindex.hpp:770
pair< TSeqNum, TSeqPos > getCIdOffByLIdOff(TSeqNum lid, TSeqPos loff) const
Definition: dbindex.hpp:906
const string getBioseqIdBySId(TSeqNum sid) const
Definition: dbindex.hpp:935
TSeqNum start_
OID of the first sequence in the index.
Definition: dbindex.hpp:945
SOffsetValue TOffsetValue
Definition: dbindex.hpp:865
unsigned long getChunkOverlap() const
Definition: dbindex.hpp:889
unsigned long getHKeyWidth() const
Definition: dbindex.hpp:884
TSeqNum getCId(TSeqNum sid) const
Definition: dbindex.hpp:895
unsigned long getMaxChunkSize() const
Definition: dbindex.hpp:888
virtual const Uint1 * GetSeqData(TSeqNum) const
Get the sequence data of the subject sequence.
Definition: dbindex.hpp:814
TSeqNum stop_chunk_
Number of the last chunk of the last sequence.
Definition: dbindex.hpp:948
Uint4 TWord
Type representing main memory unit of the index structure.
Definition: dbindex.hpp:487
virtual TSeqPos GetSeqLen(TSeqNum) const
Get the length of the subject sequence.
Definition: dbindex.hpp:800
SIndexHeader header_
The index header structure.
Definition: dbindex.hpp:950
TSeqNum start_chunk_
Number of the first chunk of the first sequence.
Definition: dbindex.hpp:946
unsigned long getWSHint() const
Definition: dbindex.hpp:886
static void MakeIndex(const std::string &fname, const std::string &oname, TSeqNum start, TSeqNum &stop, TSeqNum &stop_chunk, const SOptions &options)
Create an index object.
Definition: dbindex.hpp:675
TSeqNum getLId(const TOffsetValue &v) const
Definition: dbindex.hpp:1428
TWord getChunkLength(TSeqNum cid) const
Definition: dbindex.hpp:1401
TSeqPos getLOff(const TOffsetValue &v) const
Definition: dbindex.hpp:1432
TSubjectMap * subject_map_
The subject map object.
Definition: dbindex.hpp:951
TWord getChunkLength(TSeqNum sid, TSeqNum rcid) const
Definition: dbindex.hpp:899
TSeqNum getSIdByCId(TSeqNum cid) const
Definition: dbindex.hpp:897
unsigned long getStride() const
Definition: dbindex.hpp:885
TSeqNum StartChunk() const
Get the number of the first chunk of the first sequence in the index.
Definition: dbindex.hpp:781
vector< string > idmap_
Mapping from source ids to bioseq ids.
Definition: dbindex.hpp:952
TSeqNum getNumChunks() const
Definition: dbindex.hpp:1419
virtual CConstRef< CSearchResults > DoSearch(const BLAST_SequenceBlk *, const BlastSeqLoc *, const SSearchOptions &)
Actual implementation of seed searching.
Definition: dbindex.hpp:851
pair< TSeqNum, TSeqPos > getSIdOffByLIdOff(TSeqNum lid, TSeqPos loff) const
Definition: dbindex.hpp:920
static const unsigned long CR
Letters per byte in the sequence store.
Definition: dbindex.hpp:441
virtual void Remap()
If possible, reduce the index footprint by unmapping the portion that does not contain sequence data.
Definition: dbindex.hpp:825
TSeqNum stop_
OID of the last sequence in the index.
Definition: dbindex.hpp:947
CSequenceIStream::TStreamPos TSeqNum
Type used to enumerate sequences in the index.
Definition: dbindex.hpp:484
TWord getSubjectLength(TSeqNum sid) const
Definition: dbindex.hpp:1390
TSeqNum getStopOId() const
Definition: dbindex.hpp:870
TSeqNum getStartOId() const
Definition: dbindex.hpp:869
TSeqNum getOIdBySId(TSeqNum sid) const
Definition: dbindex.hpp:878
TSeqNum StopSeq() const
Get the OID of the last sequence in the index.
Definition: dbindex.hpp:786
virtual Uint4 GetNumVol(void) const
Get number of volumes in the index.
Definition: dbindex.hpp:258
virtual Uint4 GetNumSeq(void) const
Get number of sequences in the index (total of all volumes).
Definition: dbindex.hpp:254
Base class for index superheaders.
Definition: dbindex.hpp:119
Uint4 GetVersion(void)
Get the index format version.
virtual Uint4 GetNumSeq(void) const =0
Get number of sequences in the index (total of all volumes).
virtual void Save(const std::string &fname)=0
Save the superheader into the file.
virtual ~CIndexSuperHeader_Base()
Object destructor.
Definition: dbindex.hpp:168
Uint4 GetEndianness(void)
Get the endianness of the superheader.
virtual Uint4 GetNumVol(void) const =0
Get number of volumes in the index.
EEndianness
Symbolic values for endianness.
Definition: dbindex.hpp:132
Exceptions that superheader objects can throw.
Definition: dbindex.hpp:83
EErrCode
Numerical error codes.
Definition: dbindex.hpp:88
@ eWrite
stream writing error
Definition: dbindex.hpp:91
@ eVersion
unrecognized index format version
Definition: dbindex.hpp:93
@ eEndian
wrong index endianness
Definition: dbindex.hpp:92
@ eRead
stream reading error
Definition: dbindex.hpp:90
@ eFile
filesystem error
Definition: dbindex.hpp:89
virtual const char * GetErrCodeString() const override
Get a human readable description of the exception type.
Definition: dbindex.hpp:101
NCBI_EXCEPTION_DEFAULT(CIndexSuperHeader_Exception, CException)
Superheader derived classes parametrized by index format version.
Definition: dbindex.hpp:213
CObject –.
Definition: ncbiobj.hpp:180
Class representing index hash table and offset list database.
Definition: dbindex.hpp:958
THashTable hash_table_
The hash table (mapping from Nmer values to the lists of offsets).
Definition: dbindex.hpp:1020
CVectorWrap< TWord > THashTable
The type of the hash table.
Definition: dbindex.hpp:972
COffsetData_Base(TWord **map, unsigned long hkey_width, unsigned long stride, unsigned long ws_hint)
Object constructor.
Definition: dbindex.cpp:550
TWord total_
Auxiliary data member used for importing the offset list data.
Definition: dbindex.hpp:1013
unsigned long min_offset_
Minimum offset value used by the index.
Definition: dbindex.hpp:1018
unsigned long stride_
Stride value used by the index.
Definition: dbindex.hpp:1016
unsigned long getMinOffset() const
Accessor for minimum offset value.
Definition: dbindex.hpp:994
CDbIndex::TWord TWord
Index word type (public to support Solaris).
Definition: dbindex.hpp:964
unsigned long ws_hint_
ws_hint values used by the index.
Definition: dbindex.hpp:1017
unsigned long hkey_width_
Hash key width in bp.
Definition: dbindex.hpp:1015
unsigned long hkey_width() const
Get the width of the hash key in base pairs.
Definition: dbindex.hpp:988
CDbIndex::SOffsetValue TOffsetValue
Definition: dbindex.hpp:966
unsigned long getWSHint() const
Accessor for ws_hint value.
Definition: dbindex.hpp:1006
unsigned long getStride() const
Accessor for stride value.
Definition: dbindex.hpp:1000
Iterator for 0-terminated pre-ordered offset lists.
Definition: dbindex_sp.hpp:97
CRef –.
Definition: ncbiobj.hpp:618
Class used to abstract reading nucleotide sequences from various sources.
Uint4 TStreamPos
Type used to represent positions within a sequence stream.
Type representing subject map data.
Definition: dbindex.hpp:1028
TSeqNum getLId(const TOffsetValue &v) const
Definition: dbindex.hpp:1356
TLengths lengths_
Subject lengths storage.
Definition: dbindex.hpp:1379
TWord getSubjectLength(TSeqNum sid) const
Definition: dbindex.hpp:1246
const Uint1 * GetSeqStoreBase() const
Return the start of the raw storage for compressed subject sequence data.
Definition: dbindex.hpp:1091
TSeqNum GetNumChunks(TSeqNum lid) const
Get number of chunks combined into a given logical sequence.
Definition: dbindex.hpp:1110
unsigned long GetStride() const
Accessor for stride value.
Definition: dbindex.hpp:1140
CVectorWrap< TWord > TChunks
Type for storing the chunk data.
Definition: dbindex.hpp:1045
CDbIndex::TOffsetValue TOffsetValue
Definition: dbindex.hpp:1033
TWord offset_mask_
Mask to extract offsets.
Definition: dbindex.hpp:1382
pair< TSeqNum, TSeqNum > TSCPair
Definition: dbindex.hpp:1261
CVectorWrap< TWord > TLIdMap
Local id -> chunks map storage type.
Definition: dbindex.hpp:1048
unsigned long stride_
Index stride value.
Definition: dbindex.hpp:1376
TSCPair getSRCId(TSeqNum cid) const
Definition: dbindex.hpp:1264
std::pair< TSeqNum, TSeqPos > DecodeOffset(TWord offset) const
Decode offset.
Definition: dbindex.hpp:1149
pair< TSeqNum, TSeqPos > TSOPair
Definition: dbindex.hpp:1260
TSeqNum getCIdByLRCId(TSeqNum lid, TSeqNum rcid) const
Definition: dbindex.hpp:1290
TSeqNum NumSubjects() const
Get the total number of logical sequences in the map.
Definition: dbindex.hpp:1221
unsigned long max_chunk_size_
Definition: dbindex.hpp:1385
TSeqNum MapSubject(TSeqNum subject, TSeqNum chunk) const
Get the logical sequence id from the database oid and the chunk number.
Definition: dbindex.hpp:1122
void Load(TWord **map, TSeqNum start, TSeqNum stop, unsigned long stride)
Loads index by mapping to the memory segment.
Definition: dbindex.cpp:450
unsigned long min_offset_
Minimum offset used by the index.
Definition: dbindex.hpp:1377
const TWord * GetSubjectMap() const
Provides a mapping from real subject ids and chunk numbers to internal logical subject ids.
Definition: dbindex.hpp:1085
Uint1 offset_bits_
Number of bits used to encode offset.
Definition: dbindex.hpp:1381
TWord total_
Size in bytes of the raw sequence storage.
Definition: dbindex.hpp:1371
TLIdMap lid_map_
Local id -> chunk map storage.
Definition: dbindex.hpp:1380
TSeqNum NumChunks() const
Get the total number of sequence chunks in the map.
Definition: dbindex.hpp:1102
std::pair< TSeqNum, TSeqPos > MapSubjOff(TSeqNum lid, TSeqPos soff) const
Map logical sequence id and logical sequence offset to relative chunk number and chunk offset.
Definition: dbindex.hpp:1180
TSubjects subjects_
Mapping from database oids to the chunk info.
Definition: dbindex.hpp:1369
TSeqPos getSOff(TSeqNum sid, TSeqNum rcid, TSeqPos coff) const
Definition: dbindex.hpp:1326
TChunks chunks_
Collection of individual chunk descriptors.
Definition: dbindex.hpp:1374
TWord GetSeqStoreSize() const
Return the size in bytes of the raw sequence storage.
Definition: dbindex.hpp:1097
TSeqStore seq_store_
Storage for the raw subject sequence data.
Definition: dbindex.hpp:1370
CVectorWrap< TWord > TLengths
Subject lengths storage type.
Definition: dbindex.hpp:1047
TSeqPos GetSeqLen(TSeqNum oid) const
Get the length of the subject sequence.
Definition: dbindex.hpp:1230
unsigned long chunk_overlap_
Definition: dbindex.hpp:1386
TSeqNum getNumChunks(TSeqNum sid) const
Definition: dbindex.hpp:1339
void SetSeqDataFromMap(TWord **map)
Set up the sequence store from the memory segment.
Definition: dbindex.cpp:489
const Uint1 * getSeqData(TSeqNum sid) const
Definition: dbindex.hpp:1348
void SetSubjInfo(TSeqNum subj, TWord &start, TWord &end) const
Return the subject information based on the given logical subject id.
Definition: dbindex.hpp:1164
const Uint1 * GetSeqData(TSeqNum oid) const
Get the sequence data of the subject sequence.
Definition: dbindex.hpp:1239
TSeqNum MapLId2Chunk(TSeqNum lid, TSeqNum lchunk) const
Map logical id and relative chunk to absolute chunk id.
Definition: dbindex.hpp:1211
CSubjectMap()
Trivial constructor.
Definition: dbindex.hpp:1053
TSeqPos getLOff(const TOffsetValue &v) const
Definition: dbindex.hpp:1359
TSeqNum getNumChunks() const
Definition: dbindex.hpp:1337
TSOPair getRCIdOffByLIdOff(TSeqNum lid, TSeqPos loff) const
Definition: dbindex.hpp:1299
TSCPairMap c2s_map_
CId -> (SId, RCId) map.
Definition: dbindex.hpp:1383
CDbIndex::TWord TWord
Definition: dbindex.hpp:1032
CVectorWrap< TWord > TSubjects
Type used to map database oids to the chunk info.
Definition: dbindex.hpp:1036
CVectorWrap< Uint1 > TSeqStore
Type used for compressed subject sequence data storage.
Definition: dbindex.hpp:1039
CDbIndex::TSeqNum TSeqNum
Definition: dbindex.hpp:1031
vector< TSCPair > TSCPairMap
Definition: dbindex.hpp:1262
TSeqNum getCId(TSeqNum sid, TSeqNum rcid) const
Definition: dbindex.hpp:1252
TWord getChunkLength(TSeqNum cid) const
Definition: dbindex.hpp:1270
TSeqNum getNumSubjects() const
Definition: dbindex.hpp:1336
A vector or pointer based sequence wrapper.
Definition: dbindex.hpp:318
void resize(size_type n, T v=T())
Change the size of the sequence.
Definition: dbindex.hpp:373
TVector::reference reference
Definition: dbindex.hpp:327
bool vec_
Flag indicating whether it is a wrapper or a holder of external sequence.
Definition: dbindex.hpp:402
std::vector< T > TVector
Sequence type being wrapped.
Definition: dbindex.hpp:319
TVector::size_type size_type
Definition: dbindex.hpp:325
T * base_
Pointer to the first element of the sequence.
Definition: dbindex.hpp:400
const T * const_iterator
Iterator type pointing to const data.
Definition: dbindex.hpp:332
TVector data_
std::vector object wrapped by this object.
Definition: dbindex.hpp:401
void SetPtr(T *base, size_type sz)
Make the object hold an external sequence.
Definition: dbindex.hpp:347
size_type size() const
Get the sequence size.
Definition: dbindex.hpp:384
TVector::const_reference const_reference
Definition: dbindex.hpp:328
TVector::value_type value_type
Definition: dbindex.hpp:326
const_reference operator[](size_type n) const
Indexing operator.
Definition: dbindex.hpp:365
const_iterator begin() const
Get the start of the sequence.
Definition: dbindex.hpp:390
CVectorWrap(size_type sz=0, T v=T())
Object constructor.
Definition: dbindex.hpp:339
reference operator[](size_type n)
Indexing operator.
Definition: dbindex.hpp:358
size_type size_
Size of the external sequence.
Definition: dbindex.hpp:403
const_iterator end() const
Get the end of the sequence.
Definition: dbindex.hpp:395
Definition: map.hpp:338
#define T(s)
Definition: common.h:230
const unsigned long WIDTH_32
32-bit index.
Definition: dbindex.hpp:54
const unsigned long OFFSET_COMBINED
Combination of chunk number and chunk-based offset.
Definition: dbindex.hpp:49
CRef< CIndexSuperHeader_Base > GetIndexSuperHeader(const std::string &fname)
Read superheader structure from the file.
Definition: dbindex.cpp:130
const unsigned long TWO_HIT
Use two-hit search.
Definition: dbindex.hpp:58
const unsigned long REPORT_QUIET
No progress reporting.
Definition: dbindex.hpp:61
const unsigned long REPORT_NORMAL
Normal reporting.
Definition: dbindex.hpp:62
unsigned long GetMinOffset(unsigned long stride)
Compute the minimum offset value needed to encode offsets based on stride.
Definition: dbindex.cpp:446
const unsigned long ONE_HIT
Use one-hit search (normal).
Definition: dbindex.hpp:57
const unsigned long UNCOMPRESSED
No compression.
Definition: dbindex.hpp:46
const unsigned long REPORT_VERBOSE
Verbose reporting.
Definition: dbindex.hpp:63
unsigned long GetCodeBits(unsigned long stride)
Compute the number of bits to encode special offsets based on stride.
Definition: dbindex.cpp:438
size_t GetIdxVolNumOIDs(const std::string &fname)
Read the index header information from the given file.
Definition: dbindex.cpp:255
static const unsigned long CR
CDbIndex::TSeqNum TSeqNum
Forwarding declarations for convenience.
Definition: dbindex_sp.hpp:45
CDbIndex::TWord TWord
Definition: dbindex_sp.hpp:46
@ eSize
Definition: grid_cli.hpp:176
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
string
Definition: cgiapp.hpp:687
#define _DEBUG_ARG(arg)
Definition: ncbidbg.hpp:134
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
virtual const char * GetErrCodeString(void) const
Get error code interpreted as text.
Definition: ncbiexpt.cpp:444
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
#define VERSION
Definition: config.h:13
static int input()
int i
yy_size_t n
static int version
Definition: mdb_load.c:29
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
const struct ncbi::grid::netcache::search::fields::SIZE size
EIPRangeType t
Definition: ncbi_localip.c:101
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
@ eRead
Definition: ns_types.hpp:56
int offset
Definition: replacements.h:160
Structure to hold a sequence.
Definition: blast_def.h:242
Structure to hold all initial HSPs for a given subject sequence.
Definition: blast_extend.h:158
Used to hold a set of positions, mostly used for filtering.
Definition: blast_def.h:204
Simple record type used to specify index creation parameters.
Definition: dbindex.hpp:468
bool legacy
Indicator of the legacy index format.
Definition: dbindex.hpp:470
unsigned long report_level
Verbose index creation.
Definition: dbindex.hpp:477
unsigned long max_index_size
Maximum index size in megabytes.
Definition: dbindex.hpp:478
unsigned long chunk_size
Long sequences are split into chunks of this size.
Definition: dbindex.hpp:474
std::string stat_file_name
File to write index statistics into.
Definition: dbindex.hpp:480
unsigned long ws_hint
Most likely word size to use for searches.
Definition: dbindex.hpp:472
unsigned long chunk_overlap
Amount by which individual chunks overlap.
Definition: dbindex.hpp:476
bool idmap
Indicator of the index map creation.
Definition: dbindex.hpp:469
unsigned long hkey_width
Width of the hash key in bits.
Definition: dbindex.hpp:473
unsigned long stride
Stride to use for stored database locations.
Definition: dbindex.hpp:471
Simple record type used to specify index search parameters.
Definition: dbindex.hpp:639
unsigned long two_hits
Window for two-hit method (see megablast docs).
Definition: dbindex.hpp:641
unsigned long word_size
Target seed length.
Definition: dbindex.hpp:640
Structure into which an index header is loaded.
Definition: dbindex.hpp:290
unsigned long chunk_overlap_
Overlap of neighboring chunks.
Definition: dbindex.hpp:298
CSequenceIStream::TStreamPos stop_chunk_
Number of the last chunk of the last sequence in the index.
Definition: dbindex.hpp:303
CSequenceIStream::TStreamPos start_chunk_
Number of the first chunk of the first sequence in the index.
Definition: dbindex.hpp:301
unsigned long ws_hint_
Word size hint used during index creation.
Definition: dbindex.hpp:295
unsigned long hkey_width_
Size in bp of the Nmer used as a hash key.
Definition: dbindex.hpp:293
bool legacy_
This is a legacy index format.
Definition: dbindex.hpp:291
unsigned long max_chunk_size_
Chunk size used to split subjects.
Definition: dbindex.hpp:297
CSequenceIStream::TStreamPos stop_
OID of the last sequence in the index.
Definition: dbindex.hpp:302
unsigned long stride_
Stride used to index database locations.
Definition: dbindex.hpp:294
CSequenceIStream::TStreamPos start_
OID of the first sequence in the index.
Definition: dbindex.hpp:300
static string subject
static string query
else result
Definition: token2.c:20
#define const
Definition: zconf.h:230
Modified on Sat Dec 02 09:20:40 2023 by modify_doxy.py rev. 669887