NCBI C++ ToolKit
|
Implementation for the CSeqDBVol class, which provides an interface for all functionality of one database volume. More...
#include <ncbi_pch.hpp>
#include <objtools/blast/seqdb_reader/impl/seqdbvol.hpp>
#include "seqdboidlist.hpp"
#include <objects/general/general__.hpp>
#include <objects/seqfeat/seqfeat__.hpp>
#include <serial/objistr.hpp>
#include <serial/objostr.hpp>
#include <serial/objistrasnb.hpp>
#include <serial/objostrasnb.hpp>
#include <serial/serial.hpp>
#include <corelib/ncbimtx.hpp>
#include <sstream>
Go to the source code of this file.
Go to the SVN repository for this file.
Typedefs | |
typedef set< pair< int, int > > | TRangeVector |
List of offset ranges as begin/end pairs. More... | |
Functions | |
static vector< Uint1 > | s_SeqDBMapNA2ToNA4Setup () |
Build NA2 to NcbiNA4 translation table. More... | |
static void | s_SeqDBMapNA2ToNA4 (const char *buf2bit, vector< char > &buf4bit, int base_length) |
Convert sequence data from NA2 to NA4 format. More... | |
static vector< Uint1 > | s_SeqDBMapNA2ToNA8Setup () |
Build NA2 to Ncbi-NA8 translation table. More... | |
static void | s_SeqDBMapNA2ToNA8 (const char *buf2bit, char *buf8bit, const SSeqDBSlice &range) |
Convert sequence data from NA2 to NA8 format. More... | |
static void | s_SeqDBMapNcbiNA8ToBlastNA8 (char *buf, const SSeqDBSlice &range) |
Convert sequence data from Ncbi-NA8 to Blast-NA8 format. More... | |
Uint4 | s_ResLenNew (const vector< Int4 > &ambchars, Uint4 i) |
Get length of ambiguous region (new version) More... | |
Uint4 | s_ResPosNew (const vector< Int4 > &ambchars, Uint4 i) |
Get position of ambiguous region (new version) More... | |
Uint4 | s_ResVal (const vector< Int4 > &ambchars, Uint4 i) |
Get ambiguous residue value (old version) More... | |
Uint4 | s_ResLenOld (const vector< Int4 > &ambchars, Uint4 i) |
Get ambiguous region length (old version) More... | |
Uint4 | s_ResPosOld (const vector< Int4 > &ambchars, Uint4 i) |
Get position of ambiguous region (old version) More... | |
static void | s_SeqDBRebuildDNA_NA4 (vector< char > &buf4bit, const vector< Int4 > &amb_chars) |
Rebuild an ambiguous region from sequence and ambiguity data. More... | |
static void | s_SeqDBRebuildDNA_NA8 (char *seq, const vector< Int4 > &amb_chars, const SSeqDBSlice &region) |
Rebuild an ambiguous region from sequence and ambiguity data. More... | |
static void | s_SeqDBWriteSeqDataProt (CSeq_inst &seqinst, const char *seq_buffer, int length) |
Store protein sequence data in a Seq-inst. More... | |
static void | s_SeqDBWriteSeqDataNucl (CSeq_inst &seqinst, const char *seq_buffer, int length) |
Store non-ambiguous nucleotide sequence data in a Seq-inst. More... | |
static void | s_SeqDBWriteSeqDataNucl (CSeq_inst &seqinst, const char *seq_buffer, int length, vector< Int4 > &amb_chars) |
Store ambiguous nucleotide sequence data in a Seq-inst. More... | |
static void | s_GetBioseqTitle (CRef< CBlast_def_line_set > deflines, string &title) |
Get the title string for a CBioseq. More... | |
static bool | s_SeqDB_SeqIdIn (const list< CRef< CSeq_id > > &seqids, const CSeq_id &target) |
Search for a Seq-id in a list of Seq-ids. More... | |
static CRef< CBlast_def_line_set > | s_OssToDefline (const CUser_field::TData::TOss &oss) |
Efficiently decode a Blast-def-line-set from binary ASN.1. More... | |
template<class T > | |
CRef< CBlast_def_line_set > | s_ExtractBlastDefline (const T &bioseq) |
static void | s_SeqDBMaskSequence (char *seq, CSeqDB::TSequenceRanges *masks, char mask_letter, const SSeqDBSlice &range) |
void | SeqDB_UnpackAmbiguities (const CTempString &sequence, const CTempString &ambiguities, string &result) |
Unpack an ambiguous nucleotide sequence. More... | |
bool | s_IncludeDefline_Taxid (const CBlast_def_line &def, const set< TTaxId > &user_tax_ids) |
bool | s_IncludeDefline_MaskFilter (const CBlast_def_line &def, Uint8 mask) |
bool | s_IncludeDefline_NegativeTaxid (const CBlast_def_line &def, const set< TTaxId > &user_tax_ids) |
template<class T > | |
static void | s_SeqDBFitsInFour (T id) |
Variables | |
unsigned | SeqDB_ncbina8_to_blastna8 [] |
Implementation for the CSeqDBVol class, which provides an interface for all functionality of one database volume.
Definition in file seqdbvol.cpp.
typedef set< pair<int, int> > TRangeVector |
List of offset ranges as begin/end pairs.
Definition at line 1587 of file seqdbvol.cpp.
CRef<CBlast_def_line_set> s_ExtractBlastDefline | ( | const T & | bioseq | ) |
Definition at line 1215 of file seqdbvol.cpp.
References _ASSERT, failure, CUser_object_Base::GetData(), CObject_id_Base::GetStr(), CUser_object_Base::GetType(), CObject_id_Base::IsStr(), ITERATE, kAsnDeflineObjLabel, and s_OssToDefline().
Referenced by CSeqDB::ExtractBlastDefline().
|
static |
Get the title string for a CBioseq.
GetBioseq will use this function to get a title field when constructing the CBioseq object.
deflines | The set of deflines for this sequence. [in] |
title | The returned title string. [out] |
Definition at line 938 of file seqdbvol.cpp.
References CBlast_def_line_Base::CanGetSeqid(), CBlast_def_line_Base::CanGetTitle(), CBlast_def_line_set_Base::Get(), CBlast_def_line_Base::GetSeqid(), and CBlast_def_line_Base::GetTitle().
Referenced by CSeqDBVol::GetBioseq().
bool s_IncludeDefline_MaskFilter | ( | const CBlast_def_line & | def, |
Uint8 | mask | ||
) |
Definition at line 1925 of file seqdbvol.cpp.
References CSeq_id::fAcc_predicted, fExcludeModel, CBlast_def_line_Base::GetSeqid(), ITERATE, and mask.
Referenced by CSeqDBVol::x_GetFilteredHeader().
bool s_IncludeDefline_NegativeTaxid | ( | const CBlast_def_line & | def, |
const set< TTaxId > & | user_tax_ids | ||
) |
Definition at line 1938 of file seqdbvol.cpp.
References set< Key, Compare >::end(), set< Key, Compare >::find(), CBlast_def_line::GetTaxIds(), ITERATE, and set< Key, Compare >::size().
Referenced by CSeqDBVol::x_GetFilteredHeader().
Definition at line 1892 of file seqdbvol.cpp.
References set< Key, Compare >::end(), set< Key, Compare >::find(), CBlast_def_line_Base::GetLinks(), CBlast_def_line_Base::GetTaxid(), set< Key, Compare >::insert(), CBlast_def_line_Base::IsSetLinks(), CBlast_def_line_Base::IsSetTaxid(), ITERATE, set< Key, Compare >::size(), and TAX_ID_FROM.
Referenced by CSeqDBVol::x_GetFilteredHeader().
|
static |
Efficiently decode a Blast-def-line-set from binary ASN.1.
oss | Octet string sequence of binary ASN.1 data. |
bdls | Blast def line set decoded from oss. |
Definition at line 1176 of file seqdbvol.cpp.
References data, ITERATE, NULL, and ncbi::grid::netcache::search::fields::size.
Referenced by s_ExtractBlastDefline().
Get length of ambiguous region (new version)
Given an ambiguity element in the new format, this returns the length of the ambiguous region.
ambchars | The packed ambiguity data. [in] |
i | The index into the ambiguity data. [in] |
Definition at line 623 of file seqdbvol.cpp.
References i.
Referenced by s_SeqDBRebuildDNA_NA4(), and s_SeqDBRebuildDNA_NA8().
Get ambiguous region length (old version)
Given an ambiguity element in the old format, this returns the length of the ambiguous region.
ambchars | The packed ambiguity data. [in] |
i | The index into the ambiguity data. [in] |
Definition at line 675 of file seqdbvol.cpp.
References i.
Referenced by s_SeqDBRebuildDNA_NA4(), and s_SeqDBRebuildDNA_NA8().
Get position of ambiguous region (new version)
Given an ambiguity element in the new format, this returns the position of the ambiguous region.
ambchars | The packed ambiguity data. [in] |
i | The index into the ambiguity data. [in] |
Definition at line 639 of file seqdbvol.cpp.
References i.
Referenced by s_SeqDBRebuildDNA_NA4(), and s_SeqDBRebuildDNA_NA8().
Get position of ambiguous region (old version)
Given an ambiguity element in the old format, this returns the position of the ambiguous region.
ambchars | The packed ambiguity data. [in] |
i | The index into the ambiguity data. [in] |
Definition at line 691 of file seqdbvol.cpp.
References i.
Referenced by s_SeqDBRebuildDNA_NA4(), and s_SeqDBRebuildDNA_NA8().
Get ambiguous residue value (old version)
Given an ambiguity element in the old format, this returns the residue value to use for all bases in the ambiguous region.
ambchars | The packed ambiguity data. [in] |
i | The index into the ambiguity data. [in] |
Definition at line 659 of file seqdbvol.cpp.
References i.
Referenced by s_SeqDBRebuildDNA_NA4(), and s_SeqDBRebuildDNA_NA8().
|
static |
Search for a Seq-id in a list of Seq-ids.
This iterates over a list of Seq-ids, and returns true if a specific Seq-id is equivalent to one found in the list.
seqids | A list of Seq-ids to search. [in] |
target | The Seq-id to search for. [in] |
Definition at line 1013 of file seqdbvol.cpp.
References CSeq_id::e_NO, CSeq_id::e_YES, and ITERATE.
Referenced by CSeqDBVol::GetBioseq(), and CSeqDBVol::x_GetTaxDefline().
Definition at line 3059 of file seqdbvol.cpp.
References NCBI_THROW, and T.
Referenced by CSeqDBVol::GetGiBounds(), and CSeqDBVol::GetPigBounds().
|
static |
Convert sequence data from NA2 to NA4 format.
This uses a translation table to convert nucleotide data. The input data is in NA2 format, the output data will be in NcbiNA4 format.
buf2bit | The NA2 input data. [in] |
buf4bit | The NcbiNA4 output data. [out] |
base_length | The length (in bases) of the input data. [in] |
Definition at line 389 of file seqdbvol.cpp.
References _ASSERT, base_length, i, and s_SeqDBMapNA2ToNA4Setup().
Referenced by s_SeqDBWriteSeqDataNucl().
Build NA2 to NcbiNA4 translation table.
This builds a translation table for nucleotide data. The table will be used by s_SeqDBMapNA2ToNA4(). The table is indexed by the packed nucleotide representation, or "NA2" format, which encodes four bases per byte. The elements of the table are the unpacked "Ncbi-NA4" representation, which encodes two bases per byte.
Definition at line 351 of file seqdbvol.cpp.
References convert().
Referenced by s_SeqDBMapNA2ToNA4().
|
static |
Convert sequence data from NA2 to NA8 format.
This uses a translation table to convert nucleotide data. The input data is in NA2 format, the output data will be in Ncbi-NA8 format. This function also optionally adds sentinel bytes to the start and end of the data (needed by some applications).
buf2bit | The NA2 input data. [in] |
buf8bit | The start of the Ncbi-NA8 output data. [out] |
buf8bit_end | The end of the Ncbi-NA8 output data. [out] |
sentinel_bytes | Specify true if sentinel bytes should be included. [in] |
range | The subregion of the sequence to work on. [in] |
Definition at line 481 of file seqdbvol.cpp.
References _ASSERT, compile_time_bits::range(), and s_SeqDBMapNA2ToNA8Setup().
Referenced by CSeqDBVol::GetAmbigPartialSeq(), SeqDB_UnpackAmbiguities(), and CSeqDBVol::x_GetAmbigSeq().
Build NA2 to Ncbi-NA8 translation table.
This builds a translation table for nucleotide data. The table will be used by s_SeqDBMapNA2ToNA8(). The table is indexed by the packed nucleotide representation, or "NA2" format, which encodes four bases per byte. The elements of the table are the unpacked "Ncbi-NA8" representation, which encodes one base per byte.
Definition at line 440 of file seqdbvol.cpp.
References i.
Referenced by s_SeqDBMapNA2ToNA8().
|
static |
Convert sequence data from Ncbi-NA8 to Blast-NA8 format.
This uses a translation table to convert nucleotide data. The input data is in Ncbi-NA8 format, the output data will be in Blast-NA8 format. The data is converted in-place.
buf | The array of nucleotides to convert. [in|out] |
range | The range of operation. [in] |
Definition at line 601 of file seqdbvol.cpp.
References buf, i, compile_time_bits::range(), and SeqDB_ncbina8_to_blastna8.
Referenced by CSeqDBVol::GetAmbigPartialSeq(), and CSeqDBVol::x_GetAmbigSeq().
|
static |
Definition at line 1499 of file seqdbvol.cpp.
References CSeqDB::TSequenceRanges::empty(), first(), i, max(), min(), compile_time_bits::range(), and ncbi::grid::netcache::search::fields::size.
Referenced by CSeqDBVol::GetAmbigPartialSeq(), and CSeqDBVol::x_GetAmbigSeq().
|
static |
Rebuild an ambiguous region from sequence and ambiguity data.
When sequence data for a blast database is built, ambiguous regions are replaced with random strings of the four standard nucleotides. The ambiguity data is separately encoded as a sequence of integer values. This function unpacks the ambiguity data and replaces the randomized bases with correct (ambiguous) encodings. This version works with 4 bit representations.
buf4bit | Sequence data for a sequence. [in|out] |
amb_chars | Corresponding ambiguous data. [in] |
Definition at line 710 of file seqdbvol.cpp.
References i, s_ResLenNew(), s_ResLenOld(), s_ResPosNew(), s_ResPosOld(), and s_ResVal().
Referenced by s_SeqDBWriteSeqDataNucl().
|
static |
Rebuild an ambiguous region from sequence and ambiguity data.
When sequence data for a blast database is built, ambiguous regions are replaced with random strings of the four standard nucleotides. The ambiguity data is separately encoded as a sequence of integer values. This function unpacks the ambiguity data and replaces the randomized bases with correct (ambiguous) encodings. This version works with 8 bit representations.
seq | Sequence data for a sequence. [in|out] |
amb_chars | Corresponding ambiguous data. [in] |
region | If non-null, the part of the sequence to get. [in] |
Definition at line 785 of file seqdbvol.cpp.
References SSeqDBSlice::begin, SSeqDBSlice::end, i, s_ResLenNew(), s_ResLenOld(), s_ResPosNew(), s_ResPosOld(), and s_ResVal().
Referenced by CSeqDBVol::GetAmbigPartialSeq(), SeqDB_UnpackAmbiguities(), and CSeqDBVol::x_GetAmbigSeq().
|
static |
Store non-ambiguous nucleotide sequence data in a Seq-inst.
This function reads length elements from seq_buffer and stores them in a Seq-inst object. It also sets appropriate encoding information in that object. No ambiguity information is used. The input array is assumed to be in 2 bit representation.
seqinst | The Seq-inst to return the data in. [out] |
seq_buffer | The input sequence data. [in] |
length | The length (in bases) of the input data. [in] |
Definition at line 877 of file seqdbvol.cpp.
References CSeq_inst_Base::eMol_na, i, CSeq_inst_Base::SetMol(), and CSeq_inst_Base::SetSeq_data().
Referenced by CSeqDBVol::GetBioseq().
|
static |
Store ambiguous nucleotide sequence data in a Seq-inst.
This function reads length elements from seq_buffer and stores them in a Seq-inst object. It also sets appropriate encoding information in that object. The ambiguity information in amb_chars is used to rebuild the ambiguous regions of the sequence (see s_SeqDBRebuildDNA_NA4()). The input array is assumed to be in Ncbi-NA4 representation.
seqinst | The Seq-inst to return the data in. [out] |
seq_buffer | The input sequence data in Ncbi-NA4 format. [in] |
length | The length (in bases) of the input data. [in] |
amb_chars | The ambiguity data for this sequence. [in] |
Definition at line 915 of file seqdbvol.cpp.
References CSeq_inst_Base::eMol_na, s_SeqDBMapNA2ToNA4(), s_SeqDBRebuildDNA_NA4(), CSeq_inst_Base::SetMol(), and CSeq_inst_Base::SetSeq_data().
|
static |
Store protein sequence data in a Seq-inst.
This function reads length elements from seq_buffer and stores them in a Seq-inst object. It also sets appropriate encoding information in that object.
seqinst | The Seq-inst to return the data in. [out] |
seq_buffer | The input sequence data. [in] |
length | The length (in bases) of the input data. [in] |
Definition at line 840 of file seqdbvol.cpp.
References CSeq_inst_Base::eMol_aa, i, CSeq_inst_Base::SetMol(), and CSeq_inst_Base::SetSeq_data().
Referenced by CSeqDBVol::GetBioseq().
void SeqDB_UnpackAmbiguities | ( | const CTempString & | sequence, |
const CTempString & | ambiguities, | ||
string & | result | ||
) |
Unpack an ambiguous nucleotide sequence.
This method provides a way to unpack nucleotide sequence data that has been packed in blast database format. One source of such data is the GetRawSeqAndAmbig() method in the CSeqDBExpert class. The output format is ncbi8na.
sequence | Sequence data in NA2 format with encoded length. [in] |
ambiguities | Sequence ambiguities packed in blastdb format. [in] |
result | Unpacked sequence in Ncbi NA8 format. [out] |
Definition at line 1698 of file seqdbvol.cpp.
References A, base_length, CTempString::data(), free(), i, CTempString::length(), malloc(), NCBI_THROW, compile_time_bits::range(), result, s_SeqDBMapNA2ToNA8(), s_SeqDBRebuildDNA_NA8(), and SeqDB_GetStdOrd().
Referenced by CWriteDB_Impl::x_ComputeHash().
unsigned SeqDB_ncbina8_to_blastna8[] |
Definition at line 571 of file seqdbvol.cpp.
Referenced by s_SeqDBMapNcbiNA8ToBlastNA8().