NCBI C++ ToolKit
dbindex_factory.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: dbindex_factory.cpp 100101 2023-06-15 14:10:29Z merezhuk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksandr Morgulis
27  *
28  * File Description:
29  * Implementation of index creation functionality.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 
35 #include <iostream>
36 #include <sstream>
37 #include <string>
38 #include <corelib/ncbi_limits.hpp>
39 
41 #include <objmgr/seq_vector.hpp>
42 #include <objmgr/util/sequence.hpp>
44 
47 
48 #ifdef LOCAL_SVN
49 
51 #include "dbindex.hpp"
52 
53 #else
54 
57 
58 #endif
59 
61 BEGIN_SCOPE( blastdbindex )
62 
63 /**@name Useful constants from CDbIndex scope. */
64 /**@{*/
65 static const unsigned long CR = CDbIndex::CR;
66 /**@}*/
67 
68 /** Alias for CDbIndex::TWord type. */
69 typedef CDbIndex::TWord TWord;
70 
71 /** Alias for index creation options. */
73 
74 //-------------------------------------------------------------------------
75 /** Convert an integer to hex string representation.
76  @param word [I] the integer value
77  @return string containing the hexadecimal representation of word.
78 */
79 const std::string to_hex_str( TWord word )
80 {
81  std::ostringstream os;
82  os << hex << word;
83  return os.str();
84 }
85 
86 //-------------------------------------------------------------------------
87 /** Write a word into a binary output stream.
88  This function is endian-dependent and not portable between platforms
89  with different endianness.
90  @param os output stream; must be open in binary mode
91  @param word value to write to the stream
92 */
93 template< typename word_t >
94 void WriteWord( CNcbiOstream & os, word_t word )
95 { os.write( reinterpret_cast< char * >( &word ), sizeof( word_t ) ); }
96 
97 //-------------------------------------------------------------------------
98 /** Conversion from IUPACNA to NCBI2NA (+1).
99  @param r residue value in IUPACNA
100  @return 1 + NCBI2NA value of r, if defined;
101  0 otherwise
102  */
104 {
105  switch( r ) {
106  case 'A': return 1;
107  case 'C': return 2;
108  case 'G': return 3;
109  case 'T': return 4;
110  default : return 0;
111  }
112 }
113 
114 //-------------------------------------------------------------------------
115 /** Part of the CSubjectMap_Factory class that is independent of template
116  parameters.
117 */
119 {
120  public:
121 
122  typedef CSequenceIStream::TSeqData TSeqData; /**< forwarded type */
123  typedef CDbIndex::TSeqNum TSeqNum; /**< forwarded type */
124 
125  /** Type used to store a masked segment internally. */
126  struct SSeqSeg // public to compile under Solaris
127  {
128  TSeqPos start_; /**< Start of the segment. */
129  TSeqPos stop_; /**< One past the end of the segment. */
130 
131  /** Object constructor.
132  @param start start of the new segment
133  @param stop one past the end of the new segment
134  */
135  SSeqSeg( TSeqPos start, TSeqPos stop = 0 )
136  : start_( start ), stop_( stop )
137  {}
138  };
139 
140  protected:
141 
142  /** Sequence data without masking. */
143  typedef objects::CSeqVector TSeq;
144 
145  /** Masking information. */
147 
148  /** The inner most type needed to access mask data in the
149  representation returned by ReadFasta().
150  */
151  typedef objects::CSeq_loc::TPacked_int::Tdata TLocs;
152 
153  /** Container type used to store compressed sequence information. */
154  typedef std::vector< Uint1 > TSeqStore;
155 
156  /** Increment used to increase seqstore capacity. */
157  static const TSeqStore::size_type SS_INCR = 100*1024*1024;
158 
159  /** Threshold for the difference between seqstore size and capacity. */
160  static const TSeqStore::size_type SS_THRESH = 10*1024*1024;
161 
162  /** Type for storing mapping from subject oids to the chunk numbers. */
163  typedef std::vector< TSeqNum > TSubjects;
164 
165  /** A helper class used when creating internal set masked locations
166  in the process of converting the sequence data to NCBI2NA and
167  storing it in seq_store_.
168  */
169  class CMaskHelper : public CObject
170  {
171  private:
172 
173  typedef CSequenceIStream::TMask TMask; /**< forwarded type */
174 
175  /** See documentation for CSubjectMap_Factory_Base::TLocs. */
176  typedef objects::CSeq_loc::TPacked_int::Tdata TLocs;
177 
178  /** Collection of TLocs extracted from
179  CSequenceIStream::TSeqData.
180  */
181  typedef std::vector< const TLocs * > TLocsVec;
182 
183  public:
184 
185  /** Default object constructor. */
187 
188  /** Initialize the iterators after the masked locations
189  are added.
190  */
191  void Init();
192 
193  /** Add a set of masked intervals.
194  The data must be in the form of packed intervals.
195  @param loc set of packed intervals to add
196  */
197  void Add( const TMask::value_type & loc )
198  {
199  if( loc->IsPacked_int() ) {
200  c_locs_.push_back(
201  &( loc->GetPacked_int().Get() ) );
202  }
203  }
204 
205  /** Check if a point falls within the intervals stored
206  in the object.
207  @param pos the coordinate in the sequence
208  @return true, if pos belongs to one of the intervals
209  added to the object; false otherwise
210  */
211  bool In( TSeqPos pos );
212 
213  /** Backtrack to the first interval to the left of pos
214  or to the beginning, if not possible.
215  @param pos [I] the target position
216  */
217  void Adjust( TSeqPos pos );
218 
219  private:
220 
221  /** Check if the end of iteration has been reached.
222  @return true if the end of iteration has not been reached;
223  false otherwise
224  */
225  bool Good() const { return vit_ != c_locs_.end(); }
226 
227  /** Iteration step. */
228  void Advance();
229 
230  /** Iteration step backwards.
231  @return true, if retreat was successful, false if there is
232  nowhere to retreat
233  */
234  bool Retreat();
235 
236  TLocsVec c_locs_; /**< Container with sets of masked intervals. */
237  TLocsVec::const_iterator vit_; /**< State of the iterator over c_locs_ (outer iteration). */
238  TLocs::const_iterator it_; /**< State of the iterator over *vit_ (inner iteration). */
239  TSeqPos start_; /**< Left end of *it_. */
240  TSeqPos stop_; /**< One past the right end of *it_. */
241  };
242 
243  /** Maximum internal sequence size.
244  When the library is integrated with BLAST, this should
245  correspond to the maximum subject chunk size used in BLAST.
246  */
247  unsigned long chunk_size_;
248 
249  /** Length of overlap between consecutive chunks of one sequence.
250  When the library is integrated with BLAST, this should
251  correspond to the subject chunk overlap length used in BLAST.
252  */
253  unsigned long chunk_overlap_;
254 
255  /** Level of reporting requested by the user. */
256  unsigned long report_level_;
257 
258  TSeqNum committed_; /**< Logical number of the last committed sequence. */
259  TSeqNum last_chunk_; /**< Logical number of last processed sequence. */
260  TSeqNum c_chunk_; /**< Current chunk number of the sequence currently being processed. */
261  TSeq c_seq_; /**< Sequence data of the sequence currently being processed. */
262  CRef<objects::CObjectManager> om_; /**< Reference to the ObjectManager instance. */
263  TSeqStore seq_store_; /**< Container for storing the packed sequence data. */
264  TSeqStore::size_type ss_cap_; /**< Current seq_store capacity. */
265  TSubjects subjects_; /**< Mapping from subject oid to chunk information. */
266  CRef< CMaskHelper > mask_helper_; /**< Auxiliary object used to compute unmasked parts of the sequences. */
267  unsigned long stride_; /**< Stride selected in index creation options. */
268  unsigned long min_offset_; /**< Minimum offset value used by the index. */
269 
270  /** Object constructor.
271  @param options index creation options
272  */
274  const TOptions & options )
275  : chunk_size_( options.chunk_size ),
276  chunk_overlap_( options.chunk_overlap ),
277  report_level_( options.report_level ),
278  committed_( 0 ), last_chunk_( 0 ),
279  om_( objects::CObjectManager::GetInstance() ),
280  seq_store_( options.stride, 0 ),
281  ss_cap_( SS_INCR ),
282  mask_helper_( null ),
283  stride_( options.stride ),
284  min_offset_( GetMinOffset( options.stride ) )
285  {}
286 
287  /** Helper function used to extract CSeqVector instance from
288  a TSeqData object.
289  The extracted CSeqVector is stored in c_seq_ data member.
290  @param sd the object containing the input sequence data
291  */
292  string extractSeqVector( TSeqData & sd );
293 
294  public:
295 
296  /** Get the start of the compressed sequence storage space.
297  @return start of seq_store_
298  */
299  const Uint1 * seq_store_start() const { return &seq_store_[0]; }
300 
301  /** Start processing of the new input sequence.
302  @param sd new input sequence data
303  @param start_chunk only store data related to chunks numbered
304  higher than the value of this parameter
305  */
306  string NewSequenceInit( TSeqData & sd, TSeqNum start_chunk );
307 };
308 
309 /** To be merged with CSubjectMap_Factory_Base
310 */
312 {
313  public:
314 
315  /** Object constructor.
316  @param options index creation options
317  */
319  const TOptions & options )
320  : CSubjectMap_Factory_Base( options )
321  {}
322 
323  /** Get the total memory usage by the subject map in bytes.
324  @return memory usage by this instance
325  */
326  TWord total() const { return static_cast<TWord>(seq_store_.size()); }
327 
328  /** Append the next chunk of the input sequence currently being
329  processed to the subject map.
330 
331  This function only computes the valid segments and decides whether
332  iteration over chunks is complete.
333 
334  The return value of false should be used as iteration termination
335  condition.
336 
337  @param seq_off The start of the chunk data.
338  @return true for success; false if no more chunks were available
339  */
340  bool AddSequenceChunk( TSeqStore::size_type seq_off );
341 
342  /** Finalize processing of the current input sequence.
343  */
344  void Commit();
345 
346  /** Get the oid of the last processed sequence.
347  This function is used to get the oid of the last added subject
348  sequence after the index has been grown to the target size.
349  @return oid of the last added (possibly partially) sequence
350  */
351  TSeqNum GetLastSequence() const { return subjects_.size(); }
352 
353  /** Get the oid of the last chunk number of the last processed sequence.
354  @return the number of the last successfully added sequence chunk
355  */
357 
358  /** Get the internal oid of the last valid sequence.
359  This function is used by the offset data management classes
360  to see if some sequences need to be reevaluated.
361  @return internal oid of the last valid sequence
362  */
364 
365  protected:
366 
367  /** Information about the sequence chunk. */
368  struct SSeqInfo
369  {
370  /** Type containing the valid intervals. */
371  typedef std::vector< SSeqSeg > TSegs;
372 
373  /** Object constructor.
374  @param start start of the compressed sequence data
375  @param len length of the sequence
376  @param segs valid intervals
377  */
379  TWord start = 0,
380  TWord len = 0,
381  const TSegs & segs = TSegs() )
382  : seq_start_( start ), len_( len ), segs_( segs )
383  {}
384 
385  TWord seq_start_; /**< Start of the compressed sequence data. */
386  TWord len_; /**< Sequence length. */
387  TSegs segs_; /**< Valid intervals, i.e. everything
388  except masked and ambiguous bases. */
389  };
390 
391  /** Type for the collection of sequence chunks. */
392  typedef std::vector< SSeqInfo > TChunks;
393 
394  /** Collection of sequence chunks (or logical sequences).
395  For raw offsets the logical oid of the sequence is
396  its index in this collection.
397  */
399 
400  public:
401 
402  typedef SSeqInfo TSeqInfo; /**< Type definition for external users. */
403  typedef SSeqSeg TSeqSeg; /**< Type definition for external users. */
404 
405 
406  /** Get the chunk info by internal oid
407  @param snum internal oid of the sequence
408  @return requested sequence information or NULL if no sequence
409  corresponding to snum exists
410  */
411  const TSeqInfo * GetSeqInfo( TSeqNum snum ) const
412  {
413  if( snum > last_chunk_ ) {
414  return 0;
415  }else {
416  return &chunks_[snum - 1];
417  }
418  }
419 
420  /** Save the subject map and sequence info.
421  @param os output stream open in binary mode
422  */
423  void Save( CNcbiOstream & os ) const;
424 
425  /** Revert to the state before the start of processing of the
426  current input sequence.
427  */
428  void RollBack();
429 };
430 
431 /** To be merged with CSubjectMap_Factory_Base.
432  */
434 {
435  public: // This section is for Solaris compilation.
436 
437  /** Base class. */
439 
440  /** @name Aliases to the names from the base class. */
441  /**@{*/
444  /**@}*/
445 
446  private:
447 
448  /** Type of lengths table. */
449  typedef vector< TWord > TLengthTable;
450 
451  /** Element of mapping of local sequence ids to chunks. */
453  {
454  TSeqNum start_; /**< First chunk. */
455  TSeqNum end_; /**< One past the last chunk. */
456  TSeqPos seq_start_; /**< Start of the combined sequence in seq_store. */
457  TSeqPos seq_end_; /**< End of the combined sequence in seq_store. */
458  };
459 
460  /** Type of mapping of local sequence ids to chunks. */
461  typedef vector< SLIdMapElement > TLIdMap;
462 
463  public:
464 
465  /** Object constructor.
466  @param options index creation options
467  */
469  const TOptions & options );
470 
471  /** Start processing of the new input sequence.
472 
473  In addition to base class functionality this function adds
474  an entry to the lengths table.
475 
476  @param sd new input sequence data
477  @param start_chunk only store data related to chunks numbered
478  higher than the value of this parameter
479  */
480  string NewSequenceInit( TSeqData & sd, TSeqNum start_chunk )
481  {
482  string result = TBase::NewSequenceInit( sd, start_chunk );
483  lengths_.push_back( this->c_seq_.size() );
484  return result;
485  }
486 
487  /** Append the next chunk of the input sequence currently being
488  processed to the subject map.
489  The return value of false should be used as iteration termination
490  condition.
491  @param overflow [O] returns true if lid overflow occurred
492  @return true for success; false if no more chunks were available
493  */
494  bool AddSequenceChunk( bool & overflow );
495 
496  /** Check if index information should be produced for this offset.
497 
498  Typically it computes the full offset in way typical for the
499  corresponding version of index and checks if it is a multiple
500  of stride.
501 
502  @param seq Start of the buffer containing the compressed sequence.
503  @param off Offset relative to the start of seq.
504  @return true if information about this offset should be in the index;
505  false otherwise.
506  */
507  bool CheckOffset( const Uint1 * seq, TSeqPos off ) const;
508 
509  /** Encode an offset given a pointer to the compressed sequence
510  data and relative offset.
511  @param seq start of the buffer containing the compressed sequence
512  @param off offset relative to the start of seq
513  @return encoded offset that can be added to an offset list
514  */
515  TWord MakeOffset( const Uint1 * seq, TSeqPos off ) const;
516 
517  /** Encode an offset given an internal oid and relative offset.
518  @param seq internal oid of a sequence
519  @param off offset relative to the start of seq
520  @return encoded offset that can be added to an offset list
521  */
522  TWord MakeOffset( TSeqNum seq, TSeqPos off ) const;
523 
524  /** Save the subject map and sequence info.
525  @param os output stream open in binary mode
526  */
527  void Save( CNcbiOstream & os ) const;
528 
529  private:
530 
531  TLengthTable lengths_; /**< The table of subject sequence lengths. */
532  TLIdMap lid_map_; /**< Maping of local sequence ids to chunks. */
533  TSeqPos cur_lid_len_; /**< Current length of local sequence. */
534  Uint1 offset_bits_; /**< Number of bits used to encode offset. */
535 };
536 
537 //-------------------------------------------------------------------------
539 {
540  vit_ = c_locs_.begin();
541 
542  while( vit_ != c_locs_.end() ) {
543  it_ = (*vit_)->begin();
544 
545  if( it_ != (*vit_)->end() ) {
546  start_ = (*it_)->GetFrom();
547  stop_ = (*it_)->GetTo() + 1;
548  break;
549  }
550 
551  ++vit_;
552  }
553 }
554 
555 //-------------------------------------------------------------------------
557 {
558  while( Good() ) {
559  if( ++it_ != (*vit_)->end() ) {
560  start_ = (*it_)->GetFrom();
561  stop_ = (*it_)->GetTo() + 1;
562  return;
563  }
564 
565  ++vit_;
566  if( Good() ) it_ = (*vit_)->begin();
567  }
568 }
569 
570 //-------------------------------------------------------------------------
572 {
573  bool notdone;
574 
575  do{
576  notdone = Retreat();
577  }while( notdone && pos < stop_ );
578 }
579 
580 //-------------------------------------------------------------------------
582 {
583  if( c_locs_.empty() ) return false;
584 
585  if( !Good() ) {
586  --vit_;
587 
588  while( vit_ != c_locs_.begin() && (*vit_)->empty() ) {
589  --vit_;
590  }
591 
592  if( !(*vit_)->empty() ) {
593  it_ = (*vit_)->end();
594  --it_;
595  start_ = (*it_)->GetFrom();
596  stop_ = (*it_)->GetTo() + 1;
597  return true;
598  }
599 
600  vit_ = c_locs_.end();
601  return false;
602  }
603 
604  if( it_ != (*vit_)->begin() ) {
605  --it_;
606  start_ = (*it_)->GetFrom();
607  stop_ = (*it_)->GetTo() + 1;
608  return true;
609  }
610 
611  if( vit_ == c_locs_.begin() ) {
612  Init();
613  return false;
614  }
615 
616  --vit_;
617 
618  while( vit_ != c_locs_.begin() && (*vit_)->empty() ) {
619  --vit_;
620  }
621 
622  if( !(*vit_)->empty() ) {
623  it_ = (*vit_)->end();
624  --it_;
625  start_ = (*it_)->GetFrom();
626  stop_ = (*it_)->GetTo() + 1;
627  return true;
628  }
629 
630  Init();
631  return false;
632 }
633 
634 //-------------------------------------------------------------------------
636 {
637  while( Good() && pos >= stop_ ) Advance();
638  if( !Good() ) return false;
639  return pos >= start_;
640 }
641 
642 //-------------------------------------------------------------------------
644 {
645  objects::CSeq_entry * entry = sd.seq_entry_.GetPointerOrNull();
646 
647  if( entry == 0 ||
648  entry->Which() != objects::CSeq_entry_Base::e_Seq ) {
649  NCBI_THROW(
650  CDbIndex_Exception, eBadOption,
651  "input seq-entry is NULL or not a sequence" );
652  }
653 
654  objects::CScope scope( *om_ );
655  objects::CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry( *entry );
656  objects::CBioseq_Handle bsh = seh.GetSeq();
657  c_seq_ = bsh.GetSeqVector( objects::CBioseq_Handle::eCoding_Iupac );
658  string idstr = objects::sequence::GetTitle( bsh );
659  Uint4 pos = static_cast<Uint4>(idstr.find_first_of( " \t" ));
660  idstr = idstr.substr( 0, pos );
661  return idstr;
662 }
663 
664 //-------------------------------------------------------------------------
666  TSeqData & sd, TSeqNum start_chunk )
667 {
668  string result = "unknown";
669  subjects_.push_back( 0 );
670  c_chunk_ = start_chunk;
671 
672  if( sd ) {
673  result = extractSeqVector( sd );
674  TMask & mask = sd.mask_locs_;
675  mask_helper_.Reset( new CMaskHelper );
676 
677  for( TMask::const_iterator mask_it = mask.begin();
678  mask_it != mask.end(); ++mask_it ) {
679  mask_helper_->Add( *mask_it );
680  }
681 
682  mask_helper_->Init();
683  }
684 
685  return result;
686 }
687 
688 //-------------------------------------------------------------------------
690 {
691  TWord tmp = static_cast<TWord>(subjects_.size());
692  TWord subject_map_size = static_cast<TWord>(
693  tmp*sizeof( TWord ) +
694  chunks_.size()*sizeof( TWord ));
695  WriteWord( os, subject_map_size );
696 
697  for( TSubjects::const_iterator cit = subjects_.begin();
698  cit != subjects_.end(); ++cit ) {
699  WriteWord( os, (TWord)(*cit) );
700  }
701 
702  for( TChunks::const_iterator cit = chunks_.begin();
703  cit != chunks_.end(); ++cit ) {
704  WriteWord( os, cit->seq_start_ );
705  }
706 
707  WriteWord( os, (TWord)(seq_store_.size()) );
708  WriteWord( os, (TWord)(seq_store_.size()) );
709  os.write( (char *)(&seq_store_[0]), seq_store_.size() );
710  os << std::flush;
711 }
712 
713 //-------------------------------------------------------------------------
715  TSeqStore::size_type seq_off )
716 {
717  TSeqPos chunk_start = static_cast<TSeqPos>((chunk_size_ - chunk_overlap_)*(c_chunk_++));
718 
719  if( chunk_start >= c_seq_.size() ) {
720  --c_chunk_;
721  return false;
722  }
723 
724  TSeqPos chunk_end =
725  std::min( (TSeqPos)(chunk_start + chunk_size_), c_seq_.size() );
726  TSeqPos chunk_len = chunk_end - chunk_start;
727  SSeqInfo::TSegs segs;
728 
729  if( chunk_len > 0 ) {
730  unsigned int lc = 0;
731  bool in = false, in1;
732  mask_helper_->Adjust( chunk_start );
733 
734  for( TSeqPos pos = chunk_start;
735  pos < chunk_end; ++pos, lc = (lc + 1)%CR ) {
736  Uint1 letter = base_value( c_seq_[pos] );
737 
738  if( letter == 0 ) {
739  in1 = true;
740  }else {
741  in1 = false;
742  --letter;
743  }
744 
745  in1 = (in1 || mask_helper_->In( pos ));
746 
747  if( in1 && !in ) {
748  if( segs.empty() ) {
749  segs.push_back( SSeqSeg( 0 ) );
750  }
751 
752  segs.rbegin()->stop_ = pos - chunk_start;
753  in = true;
754  }else if( !in1 && in ) {
755  segs.push_back( SSeqSeg( pos - chunk_start ) );
756  in = false;
757  }
758  }
759 
760  if( !in ) {
761  if( segs.empty() ) {
762  segs.push_back( SSeqSeg( 0 ) );
763  }
764 
765  segs.rbegin()->stop_ = chunk_end - chunk_start;
766  }
767  }
768 
769  chunks_.push_back(
770  TSeqInfo( static_cast<TWord>(seq_off), c_seq_.size(), segs ) );
771 
772  if( *subjects_.rbegin() == 0 ) {
773  *subjects_.rbegin() = static_cast<unsigned int>(chunks_.size());
774  }
775 
776  last_chunk_ = static_cast<TSeqNum>(chunks_.size());
777  return true;
778 }
779 
780 //-------------------------------------------------------------------------
782 {
783  if( !subjects_.empty() ) {
784  last_chunk_ = *subjects_.rbegin() - 1;
785  c_chunk_ = 0;
786  *subjects_.rbegin() = 0;
787  }
788 }
789 
790 //-------------------------------------------------------------------------
792 {
793  if( last_chunk_ < chunks_.size() ) {
794  TSeqStore::size_type newsize =
795  (TSeqStore::size_type)(chunks_[last_chunk_].seq_start_);
796  seq_store_.resize( newsize );
797  chunks_.resize( last_chunk_ );
798  }
799 
801 }
802 
803 //-------------------------------------------------------------------------
805  const TOptions & options )
806  : TBase( options ),
807  cur_lid_len_( 0 ), offset_bits_( 16 )
808 {
809  unsigned long max_len = (1 + options.chunk_size/stride_) + min_offset_;
810  while( (max_len>>offset_bits_) != 0 ) ++offset_bits_;
811 }
812 
813 //-------------------------------------------------------------------------
815 {
816  overflow = false;
// c_chunk_ == 0 means no chunk of the current sequence has been added yet.
817  bool starting = (this->c_chunk_ == 0);
// Start of this chunk in sequence coordinates; computed before the
// base-class call below, which advances c_chunk_.
818  TSeqPos chunk_start =static_cast<TSeqPos>(
819  (this->chunk_size_ - this->chunk_overlap_)*this->c_chunk_);
// Offset of the chunk data in seq_store_: for the first chunk it is the
// current end of the store; otherwise it is the previous chunk's start
// advanced by the chunk step divided by CR.
820  TBase::TSeqStore::size_type seq_off =
821  starting ? this->seq_store_.size() :
822  this->chunks_.rbegin()->seq_start_
823  + (this->chunk_size_ - this->chunk_overlap_)/CR;
// The base-class call records the chunk; false means the sequence has no
// more chunks.
824  if( !TBase::AddSequenceChunk( seq_off ) ) return false;
825  TBase::TSeq::size_type seqlen = this->c_seq_.size();
826 
827  // Combining sequences.
828  //
// A local id accumulates chunks until their combined length would exceed
// 2^(offset_bits_ - 1).
829  TSeqPos length_limit = (1<<(offset_bits_ - 1));
830  TSeqPos chunk_end = std::min(
831  (TSeqPos)(chunk_start + this->chunk_size_), seqlen );
832  TSeqPos chunk_len = chunk_end - chunk_start;
833 
// Open a new local id for the very first chunk, or when this chunk does
// not fit into the current one.
834  if( lid_map_.empty() || cur_lid_len_ + chunk_len > length_limit ) {
// Bits of a TWord not used by the offset are available for the local id.
835  Uint1 lid_bits = 8*sizeof( TWord ) - offset_bits_;
836  TSeqNum lid_limit = static_cast<TSeqNum>((1UL<<lid_bits));
837 
// No local id left. Note that the base-class call above has already
// run at this point; the function still returns true.
838  if( lid_map_.size() >= lid_limit ) {
839  overflow = true;
840  return true;
841  }
842 
// start_ = index of the chunk just added, end_ = 0 (set below),
// seq_start_ = seq_off; seq_end_ is left to aggregate initialization
// and assigned below.
843  SLIdMapElement newlid = { (Uint4) this->chunks_.size() - 1, 0,
844  (TSeqPos) seq_off };
845  lid_map_.push_back( newlid );
846  cur_lid_len_ = 0;
847  }
848 
// Extend the current local id to cover the chunk just added.
849  lid_map_.rbegin()->end_ = static_cast<TSeqNum>(this->chunks_.size());
850  cur_lid_len_ += chunk_len;
851  lid_map_.rbegin()->seq_end_ =
852  lid_map_.rbegin()->seq_start_ + cur_lid_len_;
853 
// The packed data is appended once per sequence, with the first chunk,
// and covers the whole sequence rather than just this chunk.
854  if( starting && seqlen > 0 ) {
// Grow the reserved capacity in SS_INCR steps when fewer than
// SS_THRESH bytes of headroom remain.
855  if( this->ss_cap_ <= this->seq_store_.size() + TBase::SS_THRESH ) {
856  this->ss_cap_ += TBase::SS_INCR;
857  this->seq_store_.reserve( this->ss_cap_ );
858  }
859  Uint1 accum = 0;
860  unsigned int lc = 0;
861 
// Two bits per base are shifted into accum; residues without a code
// (base_value() == 0) are stored as 0, like the residue coded 1.
// NOTE(review): the literal 3 matches CR - 1 only if CR is 4 - confirm.
862  for( TSeqPos pos = 0; pos < seqlen; ++pos, lc = (lc + 1)%CR ) {
863  Uint1 letter = base_value( this->c_seq_[pos] );
864  if( letter != 0 ) --letter;
865  accum = (accum << 2) + letter;
866  if( lc == 3 ) this->seq_store_.push_back( accum );
867  }
868 
// Flush a partially filled last byte, left-aligning the bases in it.
869  if( lc != 0 ) {
870  accum <<= (CR - lc)*2;
871  this->seq_store_.push_back( accum );
872  }
873  }
874 
875  return true;
876 }
877 
878 //-------------------------------------------------------------------------
880  const Uint1 * seq, TSeqPos off ) const
881 {
882  TSeqPos soff = static_cast<TSeqPos>(seq - &(this->seq_store_[0]));
883  TLIdMap::const_reverse_iterator iter = lid_map_.rbegin();
884  while( iter != lid_map_.rend() && iter->seq_start_ > soff ) ++iter;
885  ASSERT( iter->seq_start_ <= soff );
886  off += (soff - iter->seq_start_)*CR;
887  return (off%stride_ == 0);
888 }
889 
890 //-------------------------------------------------------------------------
892  const Uint1 * seq, TSeqPos off ) const
893 {
894  TSeqPos soff = static_cast<TSeqPos>(seq - &(this->seq_store_[0]));
895  TLIdMap::const_reverse_iterator iter = lid_map_.rbegin();
896  while( iter != lid_map_.rend() && iter->seq_start_ > soff ) ++iter;
897  ASSERT( iter->seq_start_ <= soff );
898  off += (soff - iter->seq_start_)*CR;
899  off /= stride_;
900  off += min_offset_;
901  TWord result = static_cast<TWord>( ((lid_map_.rend() - iter - 1)<<offset_bits_) + off);
902  return result;
903 }
904 
905 //-------------------------------------------------------------------------
907  TSeqNum seqnum, TSeqPos off ) const
908 {
909  const Uint1 * seq =
910  &(this->seq_store_)[0] + (this->chunks_)[seqnum].seq_start_;
911  return MakeOffset( seq, off );
912 }
913 
914 //-------------------------------------------------------------------------
916 {
917  TWord sz = static_cast<TWord>(sizeof( TWord )*lengths_.size());
918  WriteWord( os, sz );
919  WriteWord( os, (TWord)offset_bits_ );
920 
921  for( TLengthTable::const_iterator it = lengths_.begin();
922  it != lengths_.end(); ++it ) {
923  WriteWord( os, (TWord)(*it) );
924  }
925 
926  sz = static_cast<TWord>(4*sizeof( TWord )*lid_map_.size());
927  WriteWord( os, sz );
928 
929  for( TLIdMap::const_iterator it = lid_map_.begin();
930  it != lid_map_.end(); ++it ) {
931  WriteWord( os, (TWord)(it->start_) );
932  WriteWord( os, (TWord)(it->end_) );
933  WriteWord( os, (TWord)(it->seq_start_) );
934  WriteWord( os, (TWord)(it->seq_end_) );
935  }
936 
937  TBase::Save( os );
938 }
939 
940 //-------------------------------------------------------------------------
941 /** Type representing an offset list corresponding to an Nmer.
942  See documentation of COffsetData_Factory classes for the description
943  of template parameters.
944 */
946 {
947  public:
948 
949  /** Set the index creation parameters.
950 
951  @param options index creation options
952  */
953  void SetIndexParams( const TOptions & options )
954  {
955  min_offset_ = GetMinOffset( options.stride );
956  mult_ = (options.ws_hint - options.hkey_width + 1)/options.stride;
957  }
958 
959  /** Add an offset to the list. Update the total.
960  @param item [I] offset to be appended to the list
961  @param total [I/O] change in the length of the list will
962  be applied to this argument
963  */
964  void AddData( TWord item, TWord & total );
965 
966  /** Truncate the list to the value of offset. Update the total.
967  The function removes the tail of the list corresponding
968  to elements that are at least as great as offset.
969  @param offset [I] offset value threshold
970  @param total [I/O] change in the length of the list will
971  be applied to this argument
972  */
973  void TruncateList( TWord offset, TWord & total );
974 
975  /** Return the size of the offset list in words.
976  @return size of the list in words
977  */
978  TWord Size() const { return (TWord)(data_.size()); }
979 
980  /** Save the offset list.
981  @param os output stream open in binary mode
982  */
983  void Save( CNcbiOstream & os ) const;
984 
985  public: // for Solaris
986 
987  struct SDataUnit;
988 
989  static const Uint4 DATA_UNIT_SIZE = 1 + 10*sizeof( SDataUnit * )/sizeof( TWord );
990 
991  struct SDataUnit
992  {
995  };
996 
997  class CDataPool
998  {
999  static const Uint4 BLOCK_SIZE = 1024*1024ULL;
1000 
1001  typedef vector< SDataUnit > TBlock;
1002  typedef vector< TBlock > TBlocks;
1003 
1004  public:
1005 
1006  CDataPool() : free_( 0 )
1007  {
1008  new_block();
1009  }
1010 
1012  {
1013  if( free_ != 0 ) {
1014  SDataUnit * result = free_;
1015  free_ = free_->next;
1016  return result;
1017  }
1018 
1019  if( first_unused_ >= BLOCK_SIZE ) new_block();
1020  return &(*pool_.rbegin())[first_unused_++];
1021  }
1022 
1023  void free( SDataUnit * d )
1024  {
1025  if( d == 0 ) return;
1026  SDataUnit * t = free_;
1027  free_ = d;
1028  while( d->next != 0 ) d = d->next;
1029  d->next = t;
1030  }
1031 
1032  void clear()
1033  {
1034  free_ = 0;
1035  pool_.resize( 1 );
1036  first_unused_ = 0;
1037  }
1038 
1039  private:
1040 
1041  void new_block()
1042  {
1043  pool_.push_back( TBlock( BLOCK_SIZE ) );
1044  first_unused_ = 0;
1045  }
1046 
1048 
1050 
1052  };
1053 
1054  void SetDataPool( CDataPool * pool ) { data_.SetDataPool( pool ); }
1055 
1056  private:
1057 
1058  class CData
1059  {
1061  {
1062  public:
1063 
1065  SDataUnit * cunit,
1066  Uint4 cindex,
1067  Uint4 size )
1068  : cunit_( cunit ), cindex_( cindex ),
1069  size_( size ), prev_( 0 )
1070  { ASSERT( cindex_ != 0 ); }
1071 
1073  {
1074  if( size_ != 0 ) {
1075  if( cindex_ >= DATA_UNIT_SIZE ) {
1076  prev_ = &cunit_->data[cindex_ - 1];
1077  cunit_ = cunit_->next;
1078  cindex_ = 1;
1079  }
1080  else ++cindex_;
1081 
1082  --size_;
1083 
1084  if( size_ == 0 ) {
1085  cunit_ = 0;
1086  cindex_ = 1;
1087  prev_ = 0;
1088  }
1089  }
1090 
1091  return *this;
1092  }
1093 
1095  {
1096  if( size_ != 0 ) {
1097  ASSERT( cindex_ != 0 );
1098  --cindex_;
1099  ++size_;
1100  }
1101 
1102  return *this;
1103  }
1104 
1105  TWord operator*() const
1106  {
1107  ASSERT( size_ != 0 );
1108  ASSERT( cindex_ != 0 || prev_ != 0 );
1109  ASSERT( cindex_ == 0 || cunit_ != 0 );
1110  return ( cindex_ != 0 ) ? cunit_->data[cindex_ - 1]
1111  : *prev_;
1112  }
1113 
1114  friend bool operator==(
1115  const CDataIterator & rhs,
1116  const CDataIterator & lhs )
1117  {
1118  return rhs.cunit_ == lhs.cunit_ ?
1119  rhs.cunit_ == 0 ?
1120  true :
1121  rhs.cindex_ == lhs.cindex_ :
1122  false;
1123  }
1124 
1125  friend bool operator!=(
1126  const CDataIterator & rhs,
1127  const CDataIterator & lhs )
1128  { return !(rhs == lhs); }
1129 
1130  private:
1131 
1136  };
1137 
1138  public:
1139 
1141  typedef Uint4 size_type;
1142 
1143  CData() : pool_( 0 ),
1144  start_( 0 ), curr_( 0 ), last_( 0 ), size_( 0 )
1145  {}
1146 
1147  void SetDataPool( CDataPool * pool ) { pool_ = pool; }
1148 
1150  { return const_iterator( start_, 1, size_ ); }
1151 
1153  { return const_iterator( 0, 1, 0 ); }
1154 
1155  Uint4 size() const { return size_; }
1156  bool empty() const { return (size() == 0); }
1157 
1158  void push_back( const TWord & d )
1159  {
1160  if( start_ == 0 ) {
1161  start_ = curr_ = pool_->alloc();
1162  start_->next = 0;
1163  }
1164 
1165  curr_->data[last_++] = d;
1166 
1167  if( last_ >= DATA_UNIT_SIZE ) {
1168  SDataUnit * t = pool_->alloc();
1169  t->next = 0;
1170  curr_->next = t;
1171  curr_ = t;
1172  last_ = 0;
1173  }
1174 
1175  ++size_;
1176  }
1177 
1178  void resize( Uint4 newsize )
1179  {
1180  if( newsize == 0 ) {
1181  pool_->free( start_ );
1182  start_ = curr_ = 0;
1183  size_ = last_ = 0;
1184  return;
1185  }
1186 
1187  while( newsize > size() ) push_back( 0 );
1188  Uint4 t = 0;
1189  SDataUnit * tp = 0, * tn = start_;
1190 
1191  while( t < newsize ) {
1192  t += DATA_UNIT_SIZE;
1193  tp = tn;
1194  tn = tp->next;
1195  }
1196 
1197  pool_->free( tn );
1198  curr_ = tp;
1199  last_ = DATA_UNIT_SIZE - (t - newsize) - 1;
1200  size_ = newsize;
1201  }
1202 
1203  private:
1204 
1206 
1211  };
1212 
1213  /** Type used to store offset list data. */
1214  typedef CData TData;
1215 
1216  TData data_; /**< Offset list data storage. */
1217  unsigned long min_offset_; /**< Minimum offset used by the index. */
1218  unsigned long mult_; /**< Max multiple to use in list pre-ordering. */
1219 };
1220 
1221 //-------------------------------------------------------------------------
1222 inline void COffsetList::Save( CNcbiOstream & os) const
1223 {
1224  for( TData::const_iterator cit = data_.begin();
1225  cit != data_.end(); ++cit )
1226  if( *cit < min_offset_ ) {
1227  WriteWord( os, *cit );
1228  WriteWord( os, *(++cit) );
1229  }
1230  else if( (*cit)%mult_ == 0 ) WriteWord( os, *cit );
1231 
1232  unsigned long m = mult_;
1233 
1234  while( --m > 0 ) {
1235  for( TData::const_iterator cit = data_.begin();
1236  cit != data_.end(); ++cit ) {
1237  if( *cit < min_offset_ ) ++cit;
1238  else {
1239  bool skip = false;
1240 
1241  for( unsigned long n = mult_; n > m; --n )
1242  if( (*cit)%n == 0 ) { skip = true; break; }
1243 
1244  if( !skip && (*cit)%m == 0 ) WriteWord( os, *cit );
1245  }
1246  }
1247  }
1248 
1249  if( !data_.empty() ) {
1250  WriteWord( os, (TWord)0 );
1251  }
1252 }
1253 
1254 //-------------------------------------------------------------------------
1255 inline void COffsetList::AddData( TWord item, TWord & total )
1256 {
1257  data_.push_back( item );
1258  ++total;
1259 }
1260 
1261 //-------------------------------------------------------------------------
1263 {
1264  bool flag = false;
1266 
1267  for( TData::size_type i = 0; i < data_.size(); ++i, ++it ) {
1268  if( *it < min_offset_ ) {
1269  flag = true;
1270  continue;
1271  }
1272 
1273  if( *it >= offset ) {
1274  if( flag ) {
1275  --i; --it;
1276  }
1277 
1278  TData::size_type diff = data_.size() - i;
1279  data_.resize( i );
1280  total -= diff;
1281  return;
1282  }else {
1283  flag = false;
1284  }
1285  }
1286 }
1287 
1288 //-------------------------------------------------------------------------
1289 /** A class responsible for creation and management of Nmer
1290  offset lists.
1291 */
1293 {
1294  public:
1295 
1296  typedef CSubjectMap_Factory TSubjectMap; /**< Rename for consistency. */
1297 
1298  /** Object constructor.
1299  @param subject_map structure to use to map logical oids to the
1300  actual sequence data
1301  @param options index creation options
1302  */
1304  TSubjectMap & subject_map,
1305  const CDbIndex::SOptions & options,
1306  COffsetList::CDataPool * pool )
1307  : subject_map_( subject_map ),
1308  hash_table_( 1<<(2*options.hkey_width) ),
1309  total_( 0 ),
1310  hkey_width_( options.hkey_width ),
1311  last_seq_( 0 ),
1312  options_( options ),
1313  code_bits_( GetCodeBits( options.stride ) )
1314  {
1315  for( THashTable::iterator i = hash_table_.begin();
1316  i != hash_table_.end(); ++i ) {
1317  i->SetIndexParams( options_ );
1318  i->SetDataPool( pool );
1319  }
1320  }
1321 
1322  /** Get the total memory usage by offset lists in bytes.
1323  @return memory usage by this instance
1324  */
1325  const TWord total() const { return total_; }
1326 
1327  /** Bring offset lists up to date with the corresponding
1328  subject map instance.
1329  */
1330  void Update();
1331 
1332  /** Save the offset lists into the binary output stream.
1333  @param os output stream; must be open in binary mode
1334  */
1335  void Save( CNcbiOstream & os );
1336 
1337  private:
1338 
1339  /** Type used for individual offset lists. */
1341 
1342  typedef CDbIndex::TSeqNum TSeqNum; /**< Forwarding from CDbIndex. */
1343  typedef TSubjectMap::TSeqInfo TSeqInfo; /**< Forwarding from TSubjectMap. */
1344 
1345  /** Type used for mapping Nmer values to corresponding
1346  offset lists.
1347  */
1348  typedef std::vector< TOffsetList > THashTable;
1349 
1350  /** Truncate the offset lists according to the information
1351  from the subject map.
1352  Checks if the last oid for which information is added
1353  to the offset lists is more than the last valid oid
1354  in the subject map and erases extraneous information.
1355  */
1356  void Truncate();
1357 
1358  /** Update offset lists with information corresponding to
1359  the given sequence.
1360  @param sinfo new sequence information
1361  */
1362  void AddSeqInfo( const TSeqInfo & sinfo );
1363 
1364  /** Update offset lists with information corresponding to
1365  the given valid segment of a sequence.
1366  @param seq points to the start of the sequence
1367  @param seqlen length of seq
1368  @param start start of the segment
1369  @param stop one past the end of the segment
1370  */
1371  void AddSeqSeg(
1372  const Uint1 * seq, TWord seqlen,
1373  TSeqPos start, TSeqPos stop );
1374 
1375  /** Encode the offset data and add to the offset list
1376  corresponding to the given Nmer value.
1377  @param nmer the Nmer value
1378  @param start start of the current valid segment
1379  @param stop one past the end of the current valid segment
1380  @param curr end of the Nmer within the sequence
1381  @param offset offset encoded with subject map instance
1382  */
1383  void EncodeAndAddOffset(
1384  TWord nmer,
1385  TSeqPos start, TSeqPos stop,
1386  TSeqPos curr, TWord offset );
1387 
1388  TSubjectMap & subject_map_; /**< Instance of subject map structure. */
1389  THashTable hash_table_; /**< Mapping from Nmer values to the corresponding offset lists. */
1390  TWord total_; /**< Current size of the structure in bytes. */
1391  unsigned long hkey_width_; /**< Nmer width in bases. */
1392  TSeqNum last_seq_; /**< Logical oid of last processed sequence. */
1393 
1394  const CDbIndex::SOptions & options_; /**< Index options. */
1395  unsigned long code_bits_; /**< Number of bits to encode special offset prefixes. */
1396 };
1397 
1398 //-------------------------------------------------------------------------
1400 {
1401  ++this->total_;
1402 
1403  for( THashTable::const_iterator cit = hash_table_.begin();
1404  cit != hash_table_.end(); ++cit ) {
1405  if( cit->Size() > 0 ) ++this->total_;
1406  }
1407 
1408  bool stat = !options_.stat_file_name.empty();
1409  std::unique_ptr< CNcbiOfstream > stats;
1410 
1411  if( stat ) {
1412  stats.reset(
1413  new CNcbiOfstream( options_.stat_file_name.c_str() ) );
1414  }
1415 
1416  WriteWord( os, total() );
1417  TWord tot = 0;
1418  unsigned long nmer = 0;
1419 
1420  for( THashTable::const_iterator cit = hash_table_.begin();
1421  cit != hash_table_.end(); ++cit, ++nmer ) {
1422  if( cit->Size() != 0 ) {
1423  ++tot;
1424  }
1425 
1426  if( cit->Size() != 0 )
1427  WriteWord( os, tot );
1428  else WriteWord( os, (TWord)0 );
1429 
1430  tot += cit->Size();
1431 
1432  if( stat && cit->Size() > 0 ) {
1433  *stats << hex << setw( 10 ) << nmer
1434  << " " << dec << cit->Size() << endl;
1435  }
1436  }
1437 
1438  WriteWord( os, total() );
1439  WriteWord( os, (TWord)0 );
1440 
1441  for( THashTable::const_iterator cit = hash_table_.begin();
1442  cit != hash_table_.end(); ++cit ) {
1443  cit->Save( os );
1444  }
1445 
1446  os << std::flush;
1447 }
1448 
1449 //-------------------------------------------------------------------------
1451  TWord nmer, TSeqPos start, TSeqPos stop,
1452  TSeqPos curr, TWord offset )
1453 {
1454  TSeqPos start_diff = curr + 2 - static_cast<TSeqPos>(hkey_width_) - start;
1455  TSeqPos end_diff = stop - curr;
1456 
1457  if( start_diff <= options_.stride || end_diff <= options_.stride ) {
1458  if( start_diff > options_.stride ) start_diff = 0;
1459  if( end_diff > options_.stride ) end_diff = 0;
1460  TWord code = (start_diff<<code_bits_) + end_diff;
1461  hash_table_[(THashTable::size_type)nmer].AddData(
1462  code, total_ );
1463  }
1464 
1465  hash_table_[(THashTable::size_type)nmer].AddData(
1466  offset, total_ );
1467 }
1468 
1469 //-------------------------------------------------------------------------
1471  const Uint1 * seq, TWord , TSeqPos start, TSeqPos stop )
1472 {
1473  const TWord nmer_mask = (((TWord)1)<<(2*hkey_width_)) - 1;
1474  const Uint1 letter_mask = 0x3;
1475  TWord nmer = 0;
1476  unsigned long count = 0;
1477 
1478  for( TSeqPos curr = start; curr < stop; ++curr, ++count ) {
1479  Uint1 unit = seq[curr/CR];
1480  Uint1 letter = ((unit>>(6 - 2*(curr%CR)))&letter_mask);
1481  nmer = ((nmer<<2)&nmer_mask) + letter;
1482 
1483  if( count >= hkey_width_ - 1 ) {
1484  if( subject_map_.CheckOffset( seq, curr ) ) {
1485  TWord offset = subject_map_.MakeOffset( seq, curr );
1486  EncodeAndAddOffset( nmer, start, stop, curr, offset );
1487  }
1488  }
1489  }
1490 }
1491 
1492 //-------------------------------------------------------------------------
1494 {
1495  for( TSeqInfo::TSegs::const_iterator it = sinfo.segs_.begin();
1496  it != sinfo.segs_.end(); ++it ) {
1497  AddSeqSeg(
1499  sinfo.len_, it->start_, it->stop_ );
1500  }
1501 }
1502 
1503 //-------------------------------------------------------------------------
1505 {
1508 
1509  for( THashTable::iterator it = hash_table_.begin();
1510  it != hash_table_.end(); ++it ) {
1511  it->TruncateList( offset, total_ );
1512  }
1513 }
1514 
1515 //-------------------------------------------------------------------------
1517 {
1519  Truncate();
1520  }
1521 
1522  const TSeqInfo * sinfo;
1523 
1524  while( (sinfo = subject_map_.GetSeqInfo( last_seq_ + 1 )) != 0 ) {
1525  AddSeqInfo( *sinfo );
1526  ++last_seq_;
1527  }
1528 }
1529 
1530 //-------------------------------------------------------------------------
1531 /** Index factory implementation.
1532  */
1534 {
1535  private:
1536 
1537  static const Uint8 MEGABYTE = 1024*1024ULL; /**< Obvious... */
1538 
1539  public:
1540 
1541  /** Create an index implementation object.
1542 
1543  @param input [I] stream for reading sequence and mask information
1544  @param oname [I] output file name
1545  @param start [I] number of the first sequence in the index
1546  @param start_chunk [I] number of the first chunk at which the starting
1547  sequence should be processed
1548  @param stop [I/O] number of the last sequence in the index;
1549  returns the number of the actual last sequence
1550  stored
1551  @param stop_chunk [I/O] number of the last chunk of the last sequence
1552  in the index
1553  @param options [I] index creation parameters
1554  */
1555  static void Create(
1557  const std::string & oname,
1558  TSeqNum start, TSeqNum start_chunk,
1559  TSeqNum & stop, TSeqNum & stop_chunk,
1560  const SOptions & options
1561  );
1562 
1563  /** Object destructor. */
1565 
1566  private:
1567 
1568  /** Save the index header.
1569  @param os output stream open in binary mode
1570  @param options index creation options
1571  @param start oid of the first sequence in the index
1572  @param start_chunk chunk number of the first chunk of the first sequence
1573  @param stop oid of the last sequence in the index
1574  @param stop_chunk chunk number of the last chunk of the last sequence
1575  */
1576  static void SaveHeader(
1577  CNcbiOstream & os,
1578  const SOptions & options,
1579  TSeqNum start,
1580  TSeqNum start_chunk,
1581  TSeqNum stop,
1582  TSeqNum stop_chunk );
1583 
1584  /** Called by CDbIndex::Create() (should be merged?).
1585  */
1586  static void do_create(
1587  CSequenceIStream & input, const std::string & oname,
1588  TSeqNum start, TSeqNum start_chunk,
1589  TSeqNum & stop, TSeqNum & stop_chunk,
1590  const SOptions & options
1591  );
1592 
1593  /** Another forward from do_create() (should be merged?).
1594  */
1595  static void do_create_1_2(
1596  CSequenceIStream & input, const std::string & oname,
1597  TSeqNum start, TSeqNum start_chunk,
1598  TSeqNum & stop, TSeqNum & stop_chunk,
1599  const SOptions & options
1600  );
1601 };
1602 
1603 //-------------------------------------------------------------------------
1605  CNcbiOstream & os,
1606  const SOptions & options,
1607  TSeqNum start,
1608  TSeqNum start_chunk,
1609  TSeqNum stop,
1610  TSeqNum stop_chunk )
1611 {
1612  if( options.legacy ) {
1613  WriteWord( os, (unsigned char)VERSION );
1614  for( int i = 0; i < 7; ++i ) WriteWord( os, (unsigned char)0 );
1615  WriteWord( os, (Uint8)WIDTH_32 );
1616  WriteWord( os, (TWord)options.hkey_width );
1618  WriteWord( os, (TWord)UNCOMPRESSED );
1619  }
1620  else {
1621  WriteWord( os, (unsigned char)(VERSION + 1) );
1622  for( int i = 0; i < 7; ++i ) WriteWord( os, (unsigned char)0 );
1623  WriteWord( os, (Uint8)WIDTH_32 );
1624  WriteWord( os, (TWord)options.hkey_width );
1625  WriteWord( os, (TWord)options.stride );
1626  WriteWord( os, (TWord)options.ws_hint );
1627  }
1628 
1629  WriteWord( os, (TWord)start );
1630  WriteWord( os, (TWord)start_chunk );
1631  WriteWord( os, (TWord)stop );
1632  WriteWord( os, (TWord)stop_chunk );
1633  os << std::flush;
1634 }
1635 
1636 //-------------------------------------------------------------------------
1638  CSequenceIStream & input, const std::string & oname,
1639  TSeqNum start, TSeqNum start_chunk,
1640  TSeqNum & stop, TSeqNum & stop_chunk, const SOptions & options )
1641 {
1642  do_create(
1643  input, oname, start, start_chunk, stop, stop_chunk, options );
1644 }
1645 
1646 //-------------------------------------------------------------------------
1648  CSequenceIStream & input, const std::string & oname,
1649  TSeqNum start, TSeqNum start_chunk,
1650  TSeqNum & stop, TSeqNum & stop_chunk, const SOptions & options )
1651 {
1652  do_create_1_2(
1653  input, oname, start, start_chunk, stop, stop_chunk, options );
1654 }
1655 
1656 //-------------------------------------------------------------------------
1658  CSequenceIStream & input, const std::string & oname,
1659  TSeqNum start, TSeqNum start_chunk,
1660  TSeqNum & stop, TSeqNum & stop_chunk, const SOptions & options )
1661 {
1663  typedef COffsetData_Factory TOffsetData;
1664 
1665  std::unique_ptr< COffsetList::CDataPool > pool(
1666  new COffsetList::CDataPool );
1667 
1668  TSubjectMap subject_map( options );
1669  TOffsetData offset_data( subject_map, options, pool.get() );
1670 
1671  TSeqNum i = start;
1672 
1673  if( i >= stop ) {
1674  stop = start;
1675  return;
1676  }
1677 
1678  vector< string > idmap;
1679 
1680  while( i < stop ) {
1682 
1683  CRef< TSeqData > seq_data( input.next() );
1684  TSeqData * sd = seq_data.GetNonNullPointer();
1685  string idstr = subject_map.NewSequenceInit( *sd, start_chunk );
1686  idmap.push_back( idstr );
1687 
1688  if( !*sd ) {
1689  if( i == start ) {
1690  stop = start;
1691  return;
1692  }
1693 
1694  stop = i;
1695  stop_chunk = 0;
1696  break;
1697  }
1698 
1699  bool overflow;
1700 
1701  while( subject_map.AddSequenceChunk( overflow ) ) {
1702  if( !overflow ) {
1703  offset_data.Update();
1704  }
1705  else {
1706  std::cerr << "WARNING: logical sequence id overflow. "
1707  << "Starting new volume." << std::endl;
1708  }
1709 
1710  Uint8 total = (Uint8)subject_map.total() +
1711  ((Uint8)sizeof( TWord ))*offset_data.total();
1712 
1713  if( total > MEGABYTE*options.max_index_size || overflow ) {
1714  input.putback();
1715  subject_map.RollBack();
1716  offset_data.Update();
1717  subject_map.Commit();
1718  stop = start + subject_map.GetLastSequence() - 1;
1719  stop_chunk = subject_map.GetLastSequenceChunk();
1720  break;
1721  }
1722  }
1723 
1724  subject_map.Commit();
1725  start_chunk = 0;
1726  ++i;
1727  }
1728 
1729  {
1730  std::ostringstream os;
1731  os << "Last processed: sequence "
1732  << start + subject_map.GetLastSequence() - 1
1733  << " ; chunk " << subject_map.GetLastSequenceChunk()
1734  << std::endl;
1735  }
1736 
1737  {
1738  std::ostringstream os;
1739  os << "Index size: "
1740  << subject_map.total() + sizeof( TWord )*offset_data.total()
1741  << " bytes (not counting the hash table)." << std::endl;
1742  }
1743 
1744  CNcbiOfstream os( oname.c_str(), IOS_BASE::binary );
1745  SaveHeader( os, options, start, start_chunk, stop, stop_chunk );
1746  offset_data.Save( os );
1747  subject_map.Save( os );
1748 
1749  if( options.idmap ) {
1750  string mapname = oname + ".map";
1751  CNcbiOfstream maps( mapname.c_str() );
1752 
1753  for( vector< string >::const_iterator i = idmap.begin();
1754  i != idmap.end(); ++i ) {
1755  maps << *i << "\n";
1756  }
1757 
1758  maps << flush;
1759  }
1760 }
1761 
1762 //-------------------------------------------------------------------------
1764  const std::string & fname, const std::string & oname,
1765  TSeqNum start, TSeqNum start_chunk,
1766  TSeqNum & stop, TSeqNum & stop_chunk, const SOptions & options )
1767 {
1768  // Make an CSequenceIStream out of fname and forward to
1769  // MakeIndex( CSequenceIStream &, ... ).
1770  CSequenceIStreamFasta input( fname );
1771  MakeIndex(
1772  input, oname, start, start_chunk,
1773  stop, stop_chunk, options );
1774 }
1775 
1776 //-------------------------------------------------------------------------
1778  const std::string & fname, const std::string & oname,
1779  TSeqNum start, TSeqNum & stop, const SOptions & options )
1780 {
1781  TSeqNum t; // unused
1782  MakeIndex( fname, oname, start, stop, t, options );
1783 }
1784 
1785 //-------------------------------------------------------------------------
1787  CSequenceIStream & input, const std::string & oname,
1788  TSeqNum start, TSeqNum start_chunk,
1789  TSeqNum & stop, TSeqNum & stop_chunk, const SOptions & options )
1790 {
1791  typedef CDbIndex_Factory TIndex_Impl;
1792  TIndex_Impl::Create(
1793  input, oname, start, start_chunk, stop, stop_chunk, options );
1794 }
1795 
1796 //-------------------------------------------------------------------------
1798  CSequenceIStream & input, const std::string & oname,
1799  TSeqNum start, TSeqNum & stop, const SOptions & options )
1800 {
1801  TSeqNum t; // unused
1802  MakeIndex( input, oname, start, stop, t, options );
1803 }
1804 
1805 END_SCOPE( blastdbindex )
1807 
#define static
Structures and functions prototypes used for BLAST gapped extension.
Structures and API used for saving BLAST hits.
ncbi::TMaskedQueryRegions mask
#define true
Definition: bool.h:35
Types of exception the indexing library can throw.
Definition: dbindex.hpp:409
Index factory implementation.
static void Create(CSequenceIStream &input, const std::string &oname, TSeqNum start, TSeqNum start_chunk, TSeqNum &stop, TSeqNum &stop_chunk, const SOptions &options)
Create an index implementation object.
virtual ~CDbIndex_Factory()
Object destructor.
static const Uint8 MEGABYTE
Obvious...
static void SaveHeader(CNcbiOstream &os, const SOptions &options, TSeqNum start, TSeqNum start_chunk, TSeqNum stop, TSeqNum stop_chunk)
Save the index header.
static void do_create(CSequenceIStream &input, const std::string &oname, TSeqNum start, TSeqNum start_chunk, TSeqNum &stop, TSeqNum &stop_chunk, const SOptions &options)
Called by CDbIndex::Create() (should be merged?).
static void do_create_1_2(CSequenceIStream &input, const std::string &oname, TSeqNum start, TSeqNum start_chunk, TSeqNum &stop, TSeqNum &stop_chunk, const SOptions &options)
Another forward from do_create() (should be merged?).
Base class providing high level interface to index objects.
Definition: dbindex.hpp:435
CSubjectMap TSubjectMap
Definition: dbindex.hpp:867
Uint4 TWord
Type representing main memory unit of the index structure.
Definition: dbindex.hpp:487
static const unsigned char VERSION
Index version that this library handles.
Definition: dbindex.hpp:463
static void MakeIndex(const std::string &fname, const std::string &oname, TSeqNum start, TSeqNum start_chunk, TSeqNum &stop, TSeqNum &stop_chunk, const SOptions &options)
Create an index object.
CSequenceIStream::TStreamPos TSeqNum
Type used to enumerate sequences in the index.
Definition: dbindex.hpp:484
CObjectManager –.
CObject –.
Definition: ncbiobj.hpp:180
A class responsible for creation and management of Nmer offset lists.
CSubjectMap_Factory TSubjectMap
Rename for consistency.
std::vector< TOffsetList > THashTable
Type used for mapping Nmer values to corresponding offset lists.
TWord total_
Current size of the structure in bytes.
TSubjectMap::TSeqInfo TSeqInfo
Forwarding from TSubjectMap.
void AddSeqInfo(const TSeqInfo &sinfo)
Update offset lists with information corresponding to the given sequence.
void Save(CNcbiOstream &os)
Save the offset lists into the binary output stream.
void AddSeqSeg(const Uint1 *seq, TWord seqlen, TSeqPos start, TSeqPos stop)
Update offset lists with information corresponding to the given valid segment of a sequence.
CDbIndex::TSeqNum TSeqNum
Forwarding from CDbIndex.
TSubjectMap & subject_map_
Instance of subject map structure.
void EncodeAndAddOffset(TWord nmer, TSeqPos start, TSeqPos stop, TSeqPos curr, TWord offset)
Encode the offset data and add to the offset list corresponding to the given Nmer value.
unsigned long code_bits_
Number of bits to encode special offset prefixes.
THashTable hash_table_
Mapping from Nmer values to the corresponding offset lists.
TSeqNum last_seq_
Logical oid of last processed sequence.
const CDbIndex::SOptions & options_
Index options.
void Truncate()
Truncate the offset lists according to the information from the subject map.
unsigned long hkey_width_
Nmer width in bases.
COffsetList TOffsetList
Type used for individual offset lists.
COffsetData_Factory(TSubjectMap &subject_map, const CDbIndex::SOptions &options, COffsetList::CDataPool *pool)
Object constructor.
const TWord total() const
Get the total memory usage by offset lists in bytes.
void Update()
Bring offset lists up to date with the corresponding subject map instance.
static const Uint4 BLOCK_SIZE
vector< SDataUnit > TBlock
void free(SDataUnit *d)
friend bool operator!=(const CDataIterator &rhs, const CDataIterator &lhs)
friend bool operator==(const CDataIterator &rhs, const CDataIterator &lhs)
CDataIterator(SDataUnit *cunit, Uint4 cindex, Uint4 size)
CDataIterator const_iterator
void SetDataPool(CDataPool *pool)
const_iterator end() const
void resize(Uint4 newsize)
const_iterator begin() const
void push_back(const TWord &d)
Type representing an offset list corresponding to an Nmer.
CData TData
Type used to store offset list data.
TWord Size() const
Return the size of the offset list in words.
void SetIndexParams(const TOptions &options)
Set the index creation parameters.
unsigned long min_offset_
Minimum offset used by the index.
void SetDataPool(CDataPool *pool)
void AddData(TWord item, TWord &total)
Add an offset to the list.
void Save(CNcbiOstream &os) const
Save the offset list.
TData data_
Offset list data storage.
unsigned long mult_
Max multiple to use in list pre-ordering.
static const Uint4 DATA_UNIT_SIZE
void TruncateList(TWord offset, TWord &total)
Truncate the list to the value of offset.
Sequence stream for reading FASTA formatted files.
Class used to abstract reading nucleotide sequences from various sources.
TSeqData::TMask TMask
Public alias for type containing masking info.
A helper class used when creating internal set masked locations in the process of converting the sequ...
TLocs::const_iterator it_
State of the iterator over *vit_ (inner iteration).
bool In(TSeqPos pos)
Check if a point falls within the intervals stored in the object.
objects::CSeq_loc::TPacked_int::Tdata TLocs
See documentation for CSubjectMap_Factory_Base::TLocs.
std::vector< const TLocs * > TLocsVec
Collection of TLocs extracted from CSequenceIStream::TSeqData.
void Init()
Initialize the iterators after the masked locations are added.
bool Good() const
Check if the end of iteration has been reached.
void Adjust(TSeqPos pos)
Backtrack to the first interval to the left of pos or to the beginning, if not possible.
TLocsVec c_locs_
Container with sets of masked intervals.
bool Retreat()
Iteration step backwards.
TSeqPos stop_
One past the right end of *it_.
CMaskHelper()
Default object constructor.
void Add(const TMask::value_type &loc)
Add a set of masked intervals.
TLocsVec::const_iterator vit_
State of the iterator over c_locs_ (outer iteration).
CSequenceIStream::TMask TMask
forwarded type
Part of the CSubjectMap_Factory class that is independent of template parameters.
CSequenceIStream::TMask TMask
Masking information.
TSeqStore seq_store_
Container for storing the packed sequence data.
TSeqNum committed_
Logical number of the last committed sequence.
unsigned long report_level_
Level of reporting requested by the user.
TSeqStore::size_type ss_cap_
Current seq_store capacity.
CRef< CMaskHelper > mask_helper_
Auxiliary object used to compute unmasked parts of the sequences.
CDbIndex::TSeqNum TSeqNum
forwarded type
TSeqNum last_chunk_
Logical number of last processed sequence.
unsigned long chunk_size_
Maximum internal sequence size.
string extractSeqVector(TSeqData &sd)
Helper function used to extract CSeqVector instance from a TSeqData object.
unsigned long chunk_overlap_
Length of overlap between consecutive chunks of one sequence.
objects::CSeq_loc::TPacked_int::Tdata TLocs
The inner most type needed to access mask data in the representation returned by ReadFasta().
static const TSeqStore::size_type SS_THRESH
Threshold for the difference between seqstore size and capacity.
unsigned long stride_
Stride selected in index creation options.
objects::CSeqVector TSeq
Sequence data without masking.
unsigned long min_offset_
Minimum offset value used by the index.
const Uint1 * seq_store_start() const
Get the start of the compressed sequence storage space.
std::vector< Uint1 > TSeqStore
Container type used to store compressed sequence information.
static const TSeqStore::size_type SS_INCR
Increment used to increase seqstore capacity.
string NewSequenceInit(TSeqData &sd, TSeqNum start_chunk)
Start processing of the new input sequence.
CSequenceIStream::TSeqData TSeqData
forwarded type
TSubjects subjects_
Mapping from subject oid to chunk information.
std::vector< TSeqNum > TSubjects
Type for storing mapping from subject oids to the chunk numbers.
CRef< objects::CObjectManager > om_
Reference to the ObjectManager instance.
TSeq c_seq_
Sequence data of the sequence currently being processed.
TSeqNum c_chunk_
Current chunk number of the sequence currently being processed.
CSubjectMap_Factory_Base(const TOptions &options)
Object constructor.
To be merged with CSubjectMap_Factory_Base.
bool AddSequenceChunk(TSeqStore::size_type seq_off)
Append the next chunk of the input sequence currently being processed to the subject map.
TSeqNum GetLastSequenceChunk() const
Get the oid of the last chunk number of the last processed sequence.
CSubjectMap_Factory_TBase(const TOptions &options)
Object constructor.
void Commit()
Finalize processing of the current input sequence.
void Save(CNcbiOstream &os) const
Save the subject map and sequence info.
TSeqNum LastGoodSequence() const
Get the internal oid of the last valid sequence.
void RollBack()
Revert to the state before the start of processing of the current input sequence.
TChunks chunks_
Collection of sequence chunks (or logical sequences).
TSeqNum GetLastSequence() const
Get the oid of the last processed sequence.
std::vector< SSeqInfo > TChunks
Type for the collection of sequence chunks.
const TSeqInfo * GetSeqInfo(TSeqNum snum) const
Get the chunk info by internal oid.
SSeqSeg TSeqSeg
Type definition for external users.
SSeqInfo TSeqInfo
Type definition for external users.
TWord total() const
Get the total memory usage by the subject map in bytes.
To be merged with CSubjectMap_Factory_Base.
Uint1 offset_bits_
Number of bits used to encode offset.
TBase::TSeqData TSeqData
string NewSequenceInit(TSeqData &sd, TSeqNum start_chunk)
Start processing of the new input sequence.
vector< TWord > TLengthTable
Type of lengths table.
TBase::TSeqNum TSeqNum
TSeqPos cur_lid_len_
Current length of local sequence.
TLengthTable lengths_
The table of subject sequence lengths.
void Save(CNcbiOstream &os) const
Save the subject map and sequence info.
TLIdMap lid_map_
Mapping of local sequence ids to chunks.
vector< SLIdMapElement > TLIdMap
Type of mapping of local sequence ids to chunks.
TWord MakeOffset(const Uint1 *seq, TSeqPos off) const
Encode an offset given a pointer to the compressed sequence data and relative offset.
CSubjectMap_Factory_TBase TBase
Base class.
bool CheckOffset(const Uint1 *seq, TSeqPos off) const
Check if index information should be produced for this offset.
CSubjectMap_Factory(const TOptions &options)
Object constructor.
bool AddSequenceChunk(bool &overflow)
Append the next chunk of the input sequence currently being processed to the subject map.
Type representing subject map data.
Definition: dbindex.hpp:1028
static const int chunk_size
static void Init(void)
Definition: cursor6.c:76
const unsigned long WIDTH_32
32-bit index.
Definition: dbindex.hpp:54
const unsigned long OFFSET_COMBINED
Combination of chunk number and chunk-based offset.
Definition: dbindex.hpp:49
unsigned long GetMinOffset(unsigned long stride)
Compute the minimum offset value needed encode offsets based on stride.
Definition: dbindex.cpp:446
const unsigned long UNCOMPRESSED
No compression.
Definition: dbindex.hpp:46
unsigned long GetCodeBits(unsigned long stride)
Compute the number of bits to encode special offsets based on stride.
Definition: dbindex.cpp:438
Uint1 base_value(objects::CSeqVectorTypes::TResidue r)
Conversion from IUPACNA to NCBI2NA (+1).
void WriteWord(CNcbiOstream &os, word_t word)
Write a word into a binary output stream.
static const unsigned long CR
CDbIndex::TWord TWord
Alias for CDbIndex::TWord type.
const std::string to_hex_str(TWord word)
Convert an integer to hex string representation.
CDbIndex::TSeqNum TSeqNum
Forwarding declarations for convenience.
Definition: dbindex_sp.hpp:45
CDbIndex::TWord TWord
Definition: dbindex_sp.hpp:46
static int lc
Definition: getdata.c:30
objects::CSeqVectorTypes::TResidue TResidue
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
@ null
Definition: ncbimisc.hpp:646
string
Definition: cgiapp.hpp:687
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
NCBI_XOBJUTIL_EXPORT string GetTitle(const CBioseq_Handle &hnd, TGetTitleFlags flags=0)
Definition: seqtitle.cpp:106
TObjectType * GetNonNullPointer(void)
Get pointer value and throw a null pointer exception if pointer is null.
Definition: ncbiobj.hpp:968
TObjectType * GetPointerOrNull(void) THROWS_NONE
Get pointer value.
Definition: ncbiobj.hpp:986
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:500
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
static int input()
int i
yy_size_t n
int len
static void hex(unsigned char c)
Definition: mdb_dump.c:56
unsigned int word_t
Definition: bmconst.h:39
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
EIPRangeType t
Definition: ncbi_localip.c:101
#define ASSERT
Macro for assert.
Definition: ncbi_std.h:107
T min(T x_, T y_)
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
The Object manager core.
static char tmp[2048]
Definition: utf8.c:42
int offset
Definition: replacements.h:160
CSequenceIStream::TSeqData TSeqData
Simple record type used to specify index creation parameters.
Definition: dbindex.hpp:468
bool legacy
Indicator of the legacy index format.
Definition: dbindex.hpp:470
unsigned long max_index_size
Maximum index size in megabytes.
Definition: dbindex.hpp:478
unsigned long chunk_size
Long sequences are split into chunks of this size.
Definition: dbindex.hpp:474
std::string stat_file_name
File to write index statistics into.
Definition: dbindex.hpp:480
unsigned long ws_hint
Most likely word size to use for searches.
Definition: dbindex.hpp:472
bool idmap
Indicator of the index map creation.
Definition: dbindex.hpp:469
unsigned long hkey_width
Width of the hash key in bits.
Definition: dbindex.hpp:473
unsigned long stride
Stride to use for stored database locations.
Definition: dbindex.hpp:471
TWord data[DATA_UNIT_SIZE]
Type containing the sequence itself along with the masking information.
CRef< objects::CSeq_entry > seq_entry_
Sequence data.
TMask mask_locs_
Masked portion of the sequence.
Element of mapping of local sequence ids to chunks.
TSeqPos seq_start_
Start of the combined sequence in seq_store.
TSeqNum end_
One past the last chunk.
TSeqPos seq_end_
End of the combined sequence in seq_store.
Type used to store a masked segment internally.
TSeqPos stop_
One past the end of the segment.
SSeqSeg(TSeqPos start, TSeqPos stop=0)
Object constructor.
TSeqPos start_
Start of the segment.
Information about the sequence chunk.
SSeqInfo(TWord start=0, TWord len=0, const TSegs &segs=TSegs())
Object constructor.
TSegs segs_
Valid intervals.
TWord seq_start_
Start of the compressed sequence data.
std::vector< SSeqSeg > TSegs
Type containing the valid intervals.
Definition: inftrees.h:24
else result
Definition: token2.c:20
static Uint4 letter(char c)
#define const
Definition: zconf.h:230
Modified on Tue Dec 05 02:23:07 2023 by modify_doxy.py rev. 669887