81 std::ostringstream os;
93 template<
typename word_t >
95 { os.write(
reinterpret_cast< char *
>( &word ),
sizeof(
word_t ) ); }
143 typedef objects::CSeqVector
TSeq;
151 typedef objects::CSeq_loc::TPacked_int::Tdata
TLocs;
157 static const TSeqStore::size_type
SS_INCR = 100*1024*1024;
160 static const TSeqStore::size_type
SS_THRESH = 10*1024*1024;
176 typedef objects::CSeq_loc::TPacked_int::Tdata
TLocs;
199 if( loc->IsPacked_int() ) {
201 &( loc->GetPacked_int().Get() ) );
238 TLocs::const_iterator
it_;
371 typedef std::vector< SSeqSeg >
TSegs;
543 it_ = (*vit_)->begin();
545 if(
it_ != (*vit_)->end() ) {
546 start_ = (*it_)->GetFrom();
547 stop_ = (*it_)->GetTo() + 1;
559 if( ++it_ != (*vit_)->end() ) {
560 start_ = (*it_)->GetFrom();
561 stop_ = (*it_)->GetTo() + 1;
566 if( Good() ) it_ = (*vit_)->begin();
577 }
while( notdone && pos < stop_ );
583 if( c_locs_.empty() )
return false;
588 while( vit_ != c_locs_.begin() && (*vit_)->empty() ) {
592 if( !(*vit_)->empty() ) {
593 it_ = (*vit_)->end();
595 start_ = (*it_)->GetFrom();
596 stop_ = (*it_)->GetTo() + 1;
600 vit_ = c_locs_.end();
604 if( it_ != (*vit_)->begin() ) {
606 start_ = (*it_)->GetFrom();
607 stop_ = (*it_)->GetTo() + 1;
611 if( vit_ == c_locs_.begin() ) {
618 while( vit_ != c_locs_.begin() && (*vit_)->empty() ) {
622 if( !(*vit_)->empty() ) {
623 it_ = (*vit_)->end();
625 start_ = (*it_)->GetFrom();
626 stop_ = (*it_)->GetTo() + 1;
637 while( Good() && pos >= stop_ ) Advance();
638 if( !Good() )
return false;
639 return pos >= start_;
648 entry->Which() != objects::CSeq_entry_Base::e_Seq ) {
651 "input seq-entry is NULL or not a sequence" );
654 objects::CScope scope( *
om_ );
655 objects::CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry( *entry );
656 objects::CBioseq_Handle bsh = seh.GetSeq();
657 c_seq_ = bsh.GetSeqVector( objects::CBioseq_Handle::eCoding_Iupac );
659 Uint4 pos =
static_cast<Uint4>(idstr.find_first_of(
" \t" ));
660 idstr = idstr.substr( 0, pos );
668 string result =
"unknown";
677 for( TMask::const_iterator mask_it =
mask.begin();
678 mask_it !=
mask.end(); ++mask_it ) {
694 chunks_.size()*
sizeof(
TWord ));
697 for( TSubjects::const_iterator cit =
subjects_.begin();
702 for( TChunks::const_iterator cit = chunks_.begin();
703 cit != chunks_.end(); ++cit ) {
715 TSeqStore::size_type seq_off )
719 if( chunk_start >=
c_seq_.size() ) {
726 TSeqPos chunk_len = chunk_end - chunk_start;
729 if( chunk_len > 0 ) {
731 bool in =
false, in1;
734 for(
TSeqPos pos = chunk_start;
735 pos < chunk_end; ++pos,
lc = (
lc + 1)%
CR ) {
749 segs.push_back(
SSeqSeg( 0 ) );
752 segs.rbegin()->stop_ = pos - chunk_start;
754 }
else if( !in1 &&
in ) {
755 segs.push_back(
SSeqSeg( pos - chunk_start ) );
762 segs.push_back(
SSeqSeg( 0 ) );
765 segs.rbegin()->stop_ = chunk_end - chunk_start;
773 *
subjects_.rbegin() =
static_cast<unsigned int>(chunks_.size());
794 TSeqStore::size_type newsize =
795 (TSeqStore::size_type)(chunks_[
last_chunk_].seq_start_);
807 cur_lid_len_( 0 ), offset_bits_( 16 )
817 bool starting = (this->
c_chunk_ == 0);
820 TBase::TSeqStore::size_type seq_off =
822 this->
chunks_.rbegin()->seq_start_
825 TBase::TSeq::size_type seqlen = this->
c_seq_.size();
832 TSeqPos chunk_len = chunk_end - chunk_start;
838 if(
lid_map_.size() >= lid_limit ) {
854 if( starting && seqlen > 0 ) {
862 for(
TSeqPos pos = 0; pos < seqlen; ++pos,
lc = (
lc + 1)%
CR ) {
865 accum = (accum << 2) +
letter;
870 accum <<= (
CR -
lc)*2;
883 TLIdMap::const_reverse_iterator iter =
lid_map_.rbegin();
884 while( iter !=
lid_map_.rend() && iter->seq_start_ > soff ) ++iter;
885 ASSERT( iter->seq_start_ <= soff );
886 off += (soff - iter->seq_start_)*
CR;
895 TLIdMap::const_reverse_iterator iter =
lid_map_.rbegin();
896 while( iter !=
lid_map_.rend() && iter->seq_start_ > soff ) ++iter;
897 ASSERT( iter->seq_start_ <= soff );
898 off += (soff - iter->seq_start_)*
CR;
921 for( TLengthTable::const_iterator it =
lengths_.begin();
929 for( TLIdMap::const_iterator it =
lid_map_.begin();
1025 if( d == 0 )
return;
1128 {
return !(rhs == lhs); }
1180 if( newsize == 0 ) {
1191 while(
t < newsize ) {
1232 unsigned long m =
mult_;
1241 for(
unsigned long n =
mult_;
n > m; --
n )
1242 if( (*cit)%
n == 0 ) { skip =
true;
break; }
1244 if( !skip && (*cit)%m == 0 )
WriteWord( os, *cit );
1318 i->SetDataPool( pool );
1403 for( THashTable::const_iterator cit =
hash_table_.begin();
1405 if( cit->Size() > 0 ) ++this->
total_;
1409 std::unique_ptr< CNcbiOfstream >
stats;
1418 unsigned long nmer = 0;
1420 for( THashTable::const_iterator cit =
hash_table_.begin();
1422 if( cit->Size() != 0 ) {
1426 if( cit->Size() != 0 )
1432 if( stat && cit->Size() > 0 ) {
1433 *
stats <<
hex << setw( 10 ) << nmer
1434 <<
" " << dec << cit->Size() << endl;
1441 for( THashTable::const_iterator cit =
hash_table_.begin();
1455 TSeqPos end_diff = stop - curr;
1474 const Uint1 letter_mask = 0x3;
1476 unsigned long count = 0;
1478 for(
TSeqPos curr = start; curr < stop; ++curr, ++count ) {
1481 nmer = ((nmer<<2)&nmer_mask) +
letter;
1495 for( TSeqInfo::TSegs::const_iterator it = sinfo.
segs_.begin();
1496 it != sinfo.
segs_.end(); ++it ) {
1499 sinfo.
len_, it->start_, it->stop_ );
1509 for( THashTable::iterator it =
hash_table_.begin();
1614 for(
int i = 0;
i < 7; ++
i )
WriteWord( os, (
unsigned char)0 );
1622 for(
int i = 0;
i < 7; ++
i )
WriteWord( os, (
unsigned char)0 );
1643 input, oname, start, start_chunk, stop, stop_chunk, options );
1653 input, oname, start, start_chunk, stop, stop_chunk, options );
1665 std::unique_ptr< COffsetList::CDataPool > pool(
1669 TOffsetData offset_data( subject_map, options, pool.get() );
1678 vector< string > idmap;
1685 string idstr = subject_map.NewSequenceInit( *sd, start_chunk );
1686 idmap.push_back( idstr );
1701 while( subject_map.AddSequenceChunk( overflow ) ) {
1703 offset_data.Update();
1706 std::cerr <<
"WARNING: logical sequence id overflow. "
1707 <<
"Starting new volume." << std::endl;
1711 ((
Uint8)
sizeof(
TWord ))*offset_data.total();
1715 subject_map.RollBack();
1716 offset_data.Update();
1717 subject_map.Commit();
1718 stop = start + subject_map.GetLastSequence() - 1;
1719 stop_chunk = subject_map.GetLastSequenceChunk();
1724 subject_map.Commit();
1730 std::ostringstream os;
1731 os <<
"Last processed: sequence "
1732 << start + subject_map.GetLastSequence() - 1
1733 <<
" ; chunk " << subject_map.GetLastSequenceChunk()
1738 std::ostringstream os;
1739 os <<
"Index size: "
1740 << subject_map.total() +
sizeof(
TWord )*offset_data.total()
1741 <<
" bytes (not counting the hash table)." << std::endl;
1745 SaveHeader( os, options, start, start_chunk, stop, stop_chunk );
1746 offset_data.Save( os );
1747 subject_map.Save( os );
1749 if( options.
idmap ) {
1750 string mapname = oname +
".map";
1754 i != idmap.end(); ++
i ) {
1772 input, oname, start, start_chunk,
1773 stop, stop_chunk, options );
1782 MakeIndex( fname, oname, start, stop,
t, options );
1792 TIndex_Impl::Create(
1793 input, oname, start, start_chunk, stop, stop_chunk, options );
Structures and functions prototypes used for BLAST gapped extension.
Structures and API used for saving BLAST hits.
Types of exception the indexing library can throw.
Index factory implementation.
static void Create(CSequenceIStream &input, const std::string &oname, TSeqNum start, TSeqNum start_chunk, TSeqNum &stop, TSeqNum &stop_chunk, const SOptions &options)
Create an index implementation object.
virtual ~CDbIndex_Factory()
Object destructor.
static const Uint8 MEGABYTE
Obvious...
static void SaveHeader(CNcbiOstream &os, const SOptions &options, TSeqNum start, TSeqNum start_chunk, TSeqNum stop, TSeqNum stop_chunk)
Save the index header.
static void do_create(CSequenceIStream &input, const std::string &oname, TSeqNum start, TSeqNum start_chunk, TSeqNum &stop, TSeqNum &stop_chunk, const SOptions &options)
Called by CDbIndex::Create() (should be merged?).
static void do_create_1_2(CSequenceIStream &input, const std::string &oname, TSeqNum start, TSeqNum start_chunk, TSeqNum &stop, TSeqNum &stop_chunk, const SOptions &options)
Another forward from do_create() (should be merged?).
Base class providing high level interface to index objects.
Uint4 TWord
Type representing main memory unit of the index structure.
static const unsigned char VERSION
Index version that this library handles.
static void MakeIndex(const std::string &fname, const std::string &oname, TSeqNum start, TSeqNum start_chunk, TSeqNum &stop, TSeqNum &stop_chunk, const SOptions &options)
Create an index object.
CSequenceIStream::TStreamPos TSeqNum
Type used to enumerate sequences in the index.
A class responsible for creation and management of Nmer offset lists.
CSubjectMap_Factory TSubjectMap
Rename for consistency.
std::vector< TOffsetList > THashTable
Type used for mapping Nmer values to corresponding offset lists.
TWord total_
Current size of the structure in bytes.
TSubjectMap::TSeqInfo TSeqInfo
Forwarding from TSubjectMap.
void AddSeqInfo(const TSeqInfo &sinfo)
Update offset lists with information corresponding to the given sequence.
void Save(CNcbiOstream &os)
Save the offset lists into the binary output stream.
void AddSeqSeg(const Uint1 *seq, TWord seqlen, TSeqPos start, TSeqPos stop)
Update offset lists with information corresponding to the given valid segment of a sequence.
CDbIndex::TSeqNum TSeqNum
Forwarding from CDbIndex.
TSubjectMap & subject_map_
Instance of subject map structure.
void EncodeAndAddOffset(TWord nmer, TSeqPos start, TSeqPos stop, TSeqPos curr, TWord offset)
Encode the offset data and add to the offset list corresponding to the given Nmer value.
unsigned long code_bits_
Number of bits to encode special offset prefixes.
THashTable hash_table_
Mapping from Nmer values to the corresponding offset lists.
TSeqNum last_seq_
Logical oid of last processed sequence.
const CDbIndex::SOptions & options_
Index options.
void Truncate()
Truncate the offset lists according to the information from the subject map.
unsigned long hkey_width_
Nmer width in bases.
COffsetList TOffsetList
Type used for individual offset lists.
COffsetData_Factory(TSubjectMap &subject_map, const CDbIndex::SOptions &options, COffsetList::CDataPool *pool)
Object constructor.
const TWord total() const
Get the total memory usage by offset lists in bytes.
void Update()
Bring offset lists up to date with the corresponding subject map instance.
static const Uint4 BLOCK_SIZE
vector< SDataUnit > TBlock
CDataIterator & operator--()
CDataIterator & operator++()
friend bool operator!=(const CDataIterator &rhs, const CDataIterator &lhs)
friend bool operator==(const CDataIterator &rhs, const CDataIterator &lhs)
CDataIterator(SDataUnit *cunit, Uint4 cindex, Uint4 size)
CDataIterator const_iterator
void SetDataPool(CDataPool *pool)
const_iterator end() const
void resize(Uint4 newsize)
const_iterator begin() const
void push_back(const TWord &d)
Type representing an offset list corresponding to an Nmer.
CData TData
Type used to store offset list data.
TWord Size() const
Return the size of the offset list in words.
void SetIndexParams(const TOptions &options)
Set the index creation parameters.
unsigned long min_offset_
Minimum offset used by the index.
void SetDataPool(CDataPool *pool)
void AddData(TWord item, TWord &total)
Add an offset to the list.
void Save(CNcbiOstream &os) const
Save the offset list.
TData data_
Offset list data storage.
unsigned long mult_
Max multiple to use in list pre-ordering.
static const Uint4 DATA_UNIT_SIZE
void TruncateList(TWord offset, TWord &total)
Truncate the list to the value of offset.
Sequence stream for reading FASTA formatted files.
Class used to abstract reading nucleotide sequences from various sources.
TSeqData::TMask TMask
Public alias for type containing masking info.
A helper class used when creating internal set masked locations in the process of converting the sequ...
TLocs::const_iterator it_
State of the iterator over *vit_ (inner iteration).
bool In(TSeqPos pos)
Check if a point falls within the intervals stored in the object.
TSeqPos start_
Left end of *it_.
objects::CSeq_loc::TPacked_int::Tdata TLocs
See documentation for CSubjectMap_Factory_Base::TLocs.
std::vector< const TLocs * > TLocsVec
Collection of TLocs extracted from CSequenceIStream::TSeqData.
void Init()
Initialize the iterators after the masked locations are added.
bool Good() const
Check if the end of iteration has been reached.
void Advance()
Iteration step.
void Adjust(TSeqPos pos)
Backtrack to the first interval to the left of pos or to the beginning, if not possible.
TLocsVec c_locs_
Container with sets of masked intervals.
bool Retreat()
Iteration step backwards.
TSeqPos stop_
One past the right end of *it_.
CMaskHelper()
Default object constructor.
void Add(const TMask::value_type &loc)
Add a set of masked intervals.
TLocsVec::const_iterator vit_
State of the iterator over c_locs_ (outer iteration).
CSequenceIStream::TMask TMask
forwarded type
Part of the CSubjectMap_Factory class that is independent of template parameters.
CSequenceIStream::TMask TMask
Masking information.
TSeqStore seq_store_
Container for storing the packed sequence data.
TSeqNum committed_
Logical number of the last committed sequence.
unsigned long report_level_
Level of reporting requested by the user.
TSeqStore::size_type ss_cap_
Current seq_store capacity.
CRef< CMaskHelper > mask_helper_
Auxiliary object used to compute unmasked parts of the sequences.
CDbIndex::TSeqNum TSeqNum
forwarded type
TSeqNum last_chunk_
Logical number of last processed sequence.
unsigned long chunk_size_
Maximum internal sequence size.
string extractSeqVector(TSeqData &sd)
Helper function used to extract CSeqVector instance from a TSeqData object.
unsigned long chunk_overlap_
Length of overlap between consecutive chunks of one sequence.
objects::CSeq_loc::TPacked_int::Tdata TLocs
The inner most type needed to access mask data in the representation returned by ReadFasta().
static const TSeqStore::size_type SS_THRESH
Threshold for the difference between seqstore size and capacity.
unsigned long stride_
Stride selected in index creation options.
objects::CSeqVector TSeq
Sequence data without masking.
unsigned long min_offset_
Minimum offset value used by the index.
const Uint1 * seq_store_start() const
Get the start of the compressed sequence storage space.
std::vector< Uint1 > TSeqStore
Container type used to store compressed sequence information.
static const TSeqStore::size_type SS_INCR
Increment used to increase seqstore capacity.
string NewSequenceInit(TSeqData &sd, TSeqNum start_chunk)
Start processing of the new input sequence.
CSequenceIStream::TSeqData TSeqData
forwarded type
TSubjects subjects_
Mapping from subject oid to chunk information.
std::vector< TSeqNum > TSubjects
Type for storing mapping from subject oids to the chunk numbers.
CRef< objects::CObjectManager > om_
Reference to the ObjectManager instance.
TSeq c_seq_
Sequence data of the sequence currently being processed.
TSeqNum c_chunk_
Current chunk number of the sequence currently being processed.
CSubjectMap_Factory_Base(const TOptions &options)
Object constructor.
To be merged with CSubjectMap_Factory_Base.
bool AddSequenceChunk(TSeqStore::size_type seq_off)
Append the next chunk of the input sequence currently being processed to the subject map.
TSeqNum GetLastSequenceChunk() const
Get the oid of the last chunk number of the last processed sequence.
CSubjectMap_Factory_TBase(const TOptions &options)
Object constructor.
void Commit()
Finalize processing of the current input sequence.
void Save(CNcbiOstream &os) const
Save the subject map and sequence info.
TSeqNum LastGoodSequence() const
Get the internal oid of the last valid sequence.
void RollBack()
Revert to the state before the start of processing of the current input sequence.
TChunks chunks_
Collection of sequence chunks (or logical sequences).
TSeqNum GetLastSequence() const
Get the oid of the last processed sequence.
std::vector< SSeqInfo > TChunks
Type for the collection of sequence chunks.
const TSeqInfo * GetSeqInfo(TSeqNum snum) const
Get the chunk info by internal oid.
SSeqSeg TSeqSeg
Type definition for external users.
SSeqInfo TSeqInfo
Type definition for external users.
TWord total() const
Get the total memory usage by the subject map in bytes.
To be merged with CSubjectMap_Factory_Base.
Uint1 offset_bits_
Number of bits used to encode offset.
string NewSequenceInit(TSeqData &sd, TSeqNum start_chunk)
Start processing of the new input sequence.
vector< TWord > TLengthTable
Type of lengths table.
TSeqPos cur_lid_len_
Current length of local sequence.
TLengthTable lengths_
The table of subject sequence lengths.
void Save(CNcbiOstream &os) const
Save the subject map and sequence info.
TLIdMap lid_map_
Mapping of local sequence ids to chunks.
vector< SLIdMapElement > TLIdMap
Type of mapping of local sequence ids to chunks.
TWord MakeOffset(const Uint1 *seq, TSeqPos off) const
Encode an offset given a pointer to the compressed sequence data and relative offset.
CSubjectMap_Factory_TBase TBase
Base class.
bool CheckOffset(const Uint1 *seq, TSeqPos off) const
Check if index information should be produced for this offset.
CSubjectMap_Factory(const TOptions &options)
Object constructor.
bool AddSequenceChunk(bool &overflow)
Append the next chunk of the input sequence currently being processed to the subject map.
Type representing subject map data.
static const int chunk_size
const unsigned long WIDTH_32
32-bit index.
const unsigned long OFFSET_COMBINED
Combination of chunk number and chunk-based offset.
unsigned long GetMinOffset(unsigned long stride)
Compute the minimum offset value needed to encode offsets based on stride.
const unsigned long UNCOMPRESSED
No compression.
unsigned long GetCodeBits(unsigned long stride)
Compute the number of bits to encode special offsets based on stride.
Uint1 base_value(objects::CSeqVectorTypes::TResidue r)
Conversion from IUPACNA to NCBI2NA (+1).
void WriteWord(CNcbiOstream &os, word_t word)
Write a word into a binary output stream.
static const unsigned long CR
CDbIndex::TWord TWord
Alias for CDbIndex::TWord type.
const std::string to_hex_str(TWord word)
Convert an integer to hex string representation.
CDbIndex::TSeqNum TSeqNum
Forwarding declarations for convenience.
objects::CSeqVectorTypes::TResidue TResidue
unsigned int TSeqPos
Type for sequence locations and lengths.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
NCBI_XOBJUTIL_EXPORT string GetTitle(const CBioseq_Handle &hnd, TGetTitleFlags flags=0)
TObjectType * GetNonNullPointer(void)
Get pointer value and throw a null pointer exception if pointer is null.
TObjectType * GetPointerOrNull(void) THROWS_NONE
Get pointer value.
uint8_t Uint1
1-byte (8-bit) unsigned integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
static void hex(unsigned char c)
double value_type
The numeric datatype used by the parser.
#define ASSERT
macro for assert.
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
CSequenceIStream::TSeqData TSeqData
Simple record type used to specify index creation parameters.
bool legacy
Indicator of the legacy index format.
unsigned long max_index_size
Maximum index size in megabytes.
unsigned long chunk_size
Long sequences are split into chunks of this size.
std::string stat_file_name
File to write index statistics into.
unsigned long ws_hint
Most likely word size to use for searches.
bool idmap
Indicator of the index map creation.
unsigned long hkey_width
Width of the hash key in bits.
unsigned long stride
Stride to use for stored database locations.
TWord data[DATA_UNIT_SIZE]
Type containing the sequence itself along with the masking information.
CRef< objects::CSeq_entry > seq_entry_
Sequence data.
TMask mask_locs_
Masked portion of the sequence.
Element of mapping of local sequence ids to chunks.
TSeqNum start_
First chunk.
TSeqPos seq_start_
Start of the combined sequence in seq_store.
TSeqNum end_
One past the last chunk.
TSeqPos seq_end_
End of the combined sequence in seq_store.
Type used to store a masked segment internally.
TSeqPos stop_
One past the end of the segment.
SSeqSeg(TSeqPos start, TSeqPos stop=0)
Object constructor.
TSeqPos start_
Start of the segment.
Information about the sequence chunk.
SSeqInfo(TWord start=0, TWord len=0, const TSegs &segs=TSegs())
Object constructor.
TSegs segs_
Valid intervals, i.e.
TWord seq_start_
Start of the compressed sequence data.
TWord len_
Sequence length.
std::vector< SSeqSeg > TSegs
Type containing the valid intervals.
static Uint4 letter(char c)