NCBI C++ ToolKit
Typedefs | Functions | Variables
seqdbvol.cpp File Reference

Implementation for the CSeqDBVol class, which provides an interface for all functionality of one database volume. More...

#include <ncbi_pch.hpp>
#include <objtools/blast/seqdb_reader/impl/seqdbvol.hpp>
#include "seqdboidlist.hpp"
#include <objects/general/general__.hpp>
#include <objects/seqfeat/seqfeat__.hpp>
#include <serial/objistr.hpp>
#include <serial/objostr.hpp>
#include <serial/objistrasnb.hpp>
#include <serial/objostrasnb.hpp>
#include <serial/serial.hpp>
#include <corelib/ncbimtx.hpp>
#include <sstream>
+ Include dependency graph for seqdbvol.cpp:

Go to the source code of this file.

Go to the SVN repository for this file.

Typedefs

typedef set< pair< int, int > > TRangeVector
 List of offset ranges as begin/end pairs. More...
 

Functions

static vector< Uint1 > s_SeqDBMapNA2ToNA4Setup ()
 Build NA2 to NcbiNA4 translation table. More...
 
static void s_SeqDBMapNA2ToNA4 (const char *buf2bit, vector< char > &buf4bit, int base_length)
 Convert sequence data from NA2 to NA4 format. More...
 
static vector< Uint1 > s_SeqDBMapNA2ToNA8Setup ()
 Build NA2 to Ncbi-NA8 translation table. More...
 
static void s_SeqDBMapNA2ToNA8 (const char *buf2bit, char *buf8bit, const SSeqDBSlice &range)
 Convert sequence data from NA2 to NA8 format. More...
 
static void s_SeqDBMapNcbiNA8ToBlastNA8 (char *buf, const SSeqDBSlice &range)
 Convert sequence data from Ncbi-NA8 to Blast-NA8 format. More...
 
Uint4 s_ResLenNew (const vector< Int4 > &ambchars, Uint4 i)
 Get length of ambiguous region (new version) More...
 
Uint4 s_ResPosNew (const vector< Int4 > &ambchars, Uint4 i)
 Get position of ambiguous region (new version) More...
 
Uint4 s_ResVal (const vector< Int4 > &ambchars, Uint4 i)
 Get ambiguous residue value (old version) More...
 
Uint4 s_ResLenOld (const vector< Int4 > &ambchars, Uint4 i)
 Get ambiguous region length (old version) More...
 
Uint4 s_ResPosOld (const vector< Int4 > &ambchars, Uint4 i)
 Get ambiguous region position (old version) More...
 
static void s_SeqDBRebuildDNA_NA4 (vector< char > &buf4bit, const vector< Int4 > &amb_chars)
 Rebuild an ambiguous region from sequence and ambiguity data. More...
 
static void s_SeqDBRebuildDNA_NA8 (char *seq, const vector< Int4 > &amb_chars, const SSeqDBSlice &region)
 Rebuild an ambiguous region from sequence and ambiguity data. More...
 
static void s_SeqDBWriteSeqDataProt (CSeq_inst &seqinst, const char *seq_buffer, int length)
 Store protein sequence data in a Seq-inst. More...
 
static void s_SeqDBWriteSeqDataNucl (CSeq_inst &seqinst, const char *seq_buffer, int length)
 Store non-ambiguous nucleotide sequence data in a Seq-inst. More...
 
static void s_SeqDBWriteSeqDataNucl (CSeq_inst &seqinst, const char *seq_buffer, int length, vector< Int4 > &amb_chars)
 Store ambiguous nucleotide sequence data in a Seq-inst. More...
 
static void s_GetBioseqTitle (CRef< CBlast_def_line_set > deflines, string &title)
 Get the title string for a CBioseq. More...
 
static bool s_SeqDB_SeqIdIn (const list< CRef< CSeq_id > > &seqids, const CSeq_id &target)
 Search for a Seq-id in a list of Seq-ids. More...
 
static CRef< CBlast_def_line_set > s_OssToDefline (const CUser_field::TData::TOss &oss)
 Efficiently decode a Blast-def-line-set from binary ASN.1. More...
 
template<class T >
CRef< CBlast_def_line_set > s_ExtractBlastDefline (const T &bioseq)
 
static void s_SeqDBMaskSequence (char *seq, CSeqDB::TSequenceRanges *masks, char mask_letter, const SSeqDBSlice &range)
 
void SeqDB_UnpackAmbiguities (const CTempString &sequence, const CTempString &ambiguities, string &result)
 Unpack an ambiguous nucleotide sequence. More...
 
bool s_IncludeDefline_Taxid (const CBlast_def_line &def, const set< TTaxId > &user_tax_ids)
 
bool s_IncludeDefline_MaskFilter (const CBlast_def_line &def, Uint8 mask)
 
bool s_IncludeDefline_NegativeTaxid (const CBlast_def_line &def, const set< TTaxId > &user_tax_ids)
 
template<class T >
static void s_SeqDBFitsInFour (T id)
 

Variables

unsigned SeqDB_ncbina8_to_blastna8 []
 

Detailed Description

Implementation for the CSeqDBVol class, which provides an interface for all functionality of one database volume.

Definition in file seqdbvol.cpp.

Typedef Documentation

◆ TRangeVector

typedef set< pair<int, int> > TRangeVector

List of offset ranges as begin/end pairs.

Definition at line 1587 of file seqdbvol.cpp.

Function Documentation

◆ s_ExtractBlastDefline()

template<class T >
CRef<CBlast_def_line_set> s_ExtractBlastDefline ( const T &  bioseq)

◆ s_GetBioseqTitle()

static void s_GetBioseqTitle ( CRef< CBlast_def_line_set >  deflines,
string &  title 
)
static

Get the title string for a CBioseq.

GetBioseq will use this function to get a title field when constructing the CBioseq object.

Parameters
deflines: The set of deflines for this sequence. [in]
title: The returned title string. [out]

Definition at line 938 of file seqdbvol.cpp.

References CBlast_def_line_Base::CanGetSeqid(), CBlast_def_line_Base::CanGetTitle(), CBlast_def_line_set_Base::Get(), CBlast_def_line_Base::GetSeqid(), and CBlast_def_line_Base::GetTitle().

Referenced by CSeqDBVol::GetBioseq().

◆ s_IncludeDefline_MaskFilter()

bool s_IncludeDefline_MaskFilter ( const CBlast_def_line &  def,
Uint8  mask 
)

◆ s_IncludeDefline_NegativeTaxid()

bool s_IncludeDefline_NegativeTaxid ( const CBlast_def_line &  def,
const set< TTaxId > &  user_tax_ids 
)

◆ s_IncludeDefline_Taxid()

bool s_IncludeDefline_Taxid ( const CBlast_def_line &  def,
const set< TTaxId > &  user_tax_ids 
)

◆ s_OssToDefline()

static CRef<CBlast_def_line_set> s_OssToDefline ( const CUser_field::TData::TOss &  oss)
static

Efficiently decode a Blast-def-line-set from binary ASN.1.

Parameters
oss: Octet string sequence of binary ASN.1 data.
bdls: Blast def line set decoded from oss.

Definition at line 1176 of file seqdbvol.cpp.

References data, ITERATE, NULL, and ncbi::grid::netcache::search::fields::size.

Referenced by s_ExtractBlastDefline().

◆ s_ResLenNew()

Uint4 s_ResLenNew ( const vector< Int4 > &  ambchars,
Uint4  i 
)
inline

Get length of ambiguous region (new version)

Given an ambiguity element in the new format, this returns the length of the ambiguous region.

Parameters
ambchars: The packed ambiguity data. [in]
i: The index into the ambiguity data. [in]
Returns
The region length.

Definition at line 623 of file seqdbvol.cpp.

References i.

Referenced by s_SeqDBRebuildDNA_NA4(), and s_SeqDBRebuildDNA_NA8().

◆ s_ResLenOld()

Uint4 s_ResLenOld ( const vector< Int4 > &  ambchars,
Uint4  i 
)
inline

Get ambiguous region length (old version)

Given an ambiguity element in the old format, this returns the length of the ambiguous region.

Parameters
ambchars: The packed ambiguity data. [in]
i: The index into the ambiguity data. [in]
Returns
The region length.

Definition at line 675 of file seqdbvol.cpp.

References i.

Referenced by s_SeqDBRebuildDNA_NA4(), and s_SeqDBRebuildDNA_NA8().

◆ s_ResPosNew()

Uint4 s_ResPosNew ( const vector< Int4 > &  ambchars,
Uint4  i 
)
inline

Get position of ambiguous region (new version)

Given an ambiguity element in the new format, this returns the position of the ambiguous region.

Parameters
ambchars: The packed ambiguity data. [in]
i: The index into the ambiguity data. [in]
Returns
The region position.

Definition at line 639 of file seqdbvol.cpp.

References i.

Referenced by s_SeqDBRebuildDNA_NA4(), and s_SeqDBRebuildDNA_NA8().

◆ s_ResPosOld()

Uint4 s_ResPosOld ( const vector< Int4 > &  ambchars,
Uint4  i 
)
inline

Get ambiguous region position (old version)

Given an ambiguity element in the old format, this returns the position of the ambiguous region.

Parameters
ambchars: The packed ambiguity data. [in]
i: The index into the ambiguity data. [in]
Returns
The region position.

Definition at line 691 of file seqdbvol.cpp.

References i.

Referenced by s_SeqDBRebuildDNA_NA4(), and s_SeqDBRebuildDNA_NA8().

◆ s_ResVal()

Uint4 s_ResVal ( const vector< Int4 > &  ambchars,
Uint4  i 
)
inline

Get ambiguous residue value (old version)

Given an ambiguity element in the old format, this returns the residue value to use for all bases in the ambiguous region.

Parameters
ambchars: The packed ambiguity data. [in]
i: The index into the ambiguity data. [in]
Returns
The residue value.

Definition at line 659 of file seqdbvol.cpp.

References i.

Referenced by s_SeqDBRebuildDNA_NA4(), and s_SeqDBRebuildDNA_NA8().

◆ s_SeqDB_SeqIdIn()

static bool s_SeqDB_SeqIdIn ( const list< CRef< CSeq_id > > &  seqids,
const CSeq_id &  target 
)
static

Search for a Seq-id in a list of Seq-ids.

This iterates over a list of Seq-ids, and returns true if a specific Seq-id is equivalent to one found in the list.

Parameters
seqids: A list of Seq-ids to search. [in]
target: The Seq-id to search for. [in]
Returns
True if the Seq-id was found.

Definition at line 1013 of file seqdbvol.cpp.

References CSeq_id::e_NO, CSeq_id::e_YES, and ITERATE.

Referenced by CSeqDBVol::GetBioseq(), and CSeqDBVol::x_GetTaxDefline().

◆ s_SeqDBFitsInFour()

template<class T >
static void s_SeqDBFitsInFour ( T  id)
static

Definition at line 3059 of file seqdbvol.cpp.

References NCBI_THROW, and T.

Referenced by CSeqDBVol::GetGiBounds(), and CSeqDBVol::GetPigBounds().

◆ s_SeqDBMapNA2ToNA4()

static void s_SeqDBMapNA2ToNA4 ( const char *  buf2bit,
vector< char > &  buf4bit,
int  base_length 
)
static

Convert sequence data from NA2 to NA4 format.

This uses a translation table to convert nucleotide data. The input data is in NA2 format, the output data will be in NcbiNA4 format.

Parameters
buf2bit: The NA2 input data. [in]
buf4bit: The NcbiNA4 output data. [out]
base_length: The length (in bases) of the input data. [in]

Definition at line 389 of file seqdbvol.cpp.

References _ASSERT, base_length, i, and s_SeqDBMapNA2ToNA4Setup().

Referenced by s_SeqDBWriteSeqDataNucl().

◆ s_SeqDBMapNA2ToNA4Setup()

static vector<Uint1> s_SeqDBMapNA2ToNA4Setup ( )
static

Build NA2 to NcbiNA4 translation table.

This builds a translation table for nucleotide data. The table will be used by s_SeqDBMapNA2ToNA4(). The table is indexed by the packed nucleotide representation, or "NA2" format, which encodes four bases per byte. The elements of the table are the unpacked "Ncbi-NA4" representation, which encodes two bases per byte.

Returns
The NA2 to NA4 translation table

Definition at line 351 of file seqdbvol.cpp.

References convert().

Referenced by s_SeqDBMapNA2ToNA4().

◆ s_SeqDBMapNA2ToNA8()

static void s_SeqDBMapNA2ToNA8 ( const char *  buf2bit,
char *  buf8bit,
const SSeqDBSlice &  range 
)
static

Convert sequence data from NA2 to NA8 format.

This uses a translation table to convert nucleotide data. The input data is in NA2 format, the output data will be in Ncbi-NA8 format. This function also optionally adds sentinel bytes to the start and end of the data (needed by some applications).

Parameters
buf2bit: The NA2 input data. [in]
buf8bit: The start of the Ncbi-NA8 output data. [out]
buf8bit_end: The end of the Ncbi-NA8 output data. [out]
sentinel_bytes: Specify true if sentinel bytes should be included. [in]
range: The subregion of the sequence to work on. [in]

Definition at line 481 of file seqdbvol.cpp.

References _ASSERT, compile_time_bits::range(), and s_SeqDBMapNA2ToNA8Setup().

Referenced by CSeqDBVol::GetAmbigPartialSeq(), SeqDB_UnpackAmbiguities(), and CSeqDBVol::x_GetAmbigSeq().

◆ s_SeqDBMapNA2ToNA8Setup()

static vector<Uint1> s_SeqDBMapNA2ToNA8Setup ( )
static

Build NA2 to Ncbi-NA8 translation table.

This builds a translation table for nucleotide data. The table will be used by s_SeqDBMapNA2ToNA8(). The table is indexed by the packed nucleotide representation, or "NA2" format, which encodes four bases per byte. The elements of the table are the unpacked "Ncbi-NA8" representation, which encodes one base per byte.

Returns
The NA2 to NA8 translation table

Definition at line 440 of file seqdbvol.cpp.

References i.

Referenced by s_SeqDBMapNA2ToNA8().

◆ s_SeqDBMapNcbiNA8ToBlastNA8()

static void s_SeqDBMapNcbiNA8ToBlastNA8 ( char *  buf,
const SSeqDBSlice &  range 
)
static

Convert sequence data from Ncbi-NA8 to Blast-NA8 format.

This uses a translation table to convert nucleotide data. The input data is in Ncbi-NA8 format, the output data will be in Blast-NA8 format. The data is converted in-place.

Parameters
buf: The array of nucleotides to convert. [in|out]
range: The range of operation. [in]

Definition at line 601 of file seqdbvol.cpp.

References buf, i, compile_time_bits::range(), and SeqDB_ncbina8_to_blastna8.

Referenced by CSeqDBVol::GetAmbigPartialSeq(), and CSeqDBVol::x_GetAmbigSeq().

◆ s_SeqDBMaskSequence()

static void s_SeqDBMaskSequence ( char *  seq,
CSeqDB::TSequenceRanges *  masks,
char  mask_letter,
const SSeqDBSlice &  range 
)
static

◆ s_SeqDBRebuildDNA_NA4()

static void s_SeqDBRebuildDNA_NA4 ( vector< char > &  buf4bit,
const vector< Int4 > &  amb_chars 
)
static

Rebuild an ambiguous region from sequence and ambiguity data.

When sequence data for a blast database is built, ambiguous regions are replaced with random strings of the four standard nucleotides. The ambiguity data is separately encoded as a sequence of integer values. This function unpacks the ambiguity data and replaces the randomized bases with correct (ambiguous) encodings. This version works with 4 bit representations.

Parameters
buf4bit: Sequence data for a sequence. [in|out]
amb_chars: Corresponding ambiguous data. [in]

Definition at line 710 of file seqdbvol.cpp.

References i, s_ResLenNew(), s_ResLenOld(), s_ResPosNew(), s_ResPosOld(), and s_ResVal().

Referenced by s_SeqDBWriteSeqDataNucl().

◆ s_SeqDBRebuildDNA_NA8()

static void s_SeqDBRebuildDNA_NA8 ( char *  seq,
const vector< Int4 > &  amb_chars,
const SSeqDBSlice &  region 
)
static

Rebuild an ambiguous region from sequence and ambiguity data.

When sequence data for a blast database is built, ambiguous regions are replaced with random strings of the four standard nucleotides. The ambiguity data is separately encoded as a sequence of integer values. This function unpacks the ambiguity data and replaces the randomized bases with correct (ambiguous) encodings. This version works with 8 bit representations.

Parameters
seq: Sequence data for a sequence. [in|out]
amb_chars: Corresponding ambiguous data. [in]
region: If non-null, the part of the sequence to get. [in]

Definition at line 785 of file seqdbvol.cpp.

References SSeqDBSlice::begin, SSeqDBSlice::end, i, s_ResLenNew(), s_ResLenOld(), s_ResPosNew(), s_ResPosOld(), and s_ResVal().

Referenced by CSeqDBVol::GetAmbigPartialSeq(), SeqDB_UnpackAmbiguities(), and CSeqDBVol::x_GetAmbigSeq().

◆ s_SeqDBWriteSeqDataNucl() [1/2]

static void s_SeqDBWriteSeqDataNucl ( CSeq_inst &  seqinst,
const char *  seq_buffer,
int  length 
)
static

Store non-ambiguous nucleotide sequence data in a Seq-inst.

This function reads length elements from seq_buffer and stores them in a Seq-inst object. It also sets appropriate encoding information in that object. No ambiguity information is used. The input array is assumed to be in 2 bit representation.

Parameters
seqinst: The Seq-inst to return the data in. [out]
seq_buffer: The input sequence data. [in]
length: The length (in bases) of the input data. [in]

Definition at line 877 of file seqdbvol.cpp.

References CSeq_inst_Base::eMol_na, i, CSeq_inst_Base::SetMol(), and CSeq_inst_Base::SetSeq_data().

Referenced by CSeqDBVol::GetBioseq().

◆ s_SeqDBWriteSeqDataNucl() [2/2]

static void s_SeqDBWriteSeqDataNucl ( CSeq_inst &  seqinst,
const char *  seq_buffer,
int  length,
vector< Int4 > &  amb_chars 
)
static

Store ambiguous nucleotide sequence data in a Seq-inst.

This function reads length elements from seq_buffer and stores them in a Seq-inst object. It also sets appropriate encoding information in that object. The ambiguity data in amb_chars is applied to the stored sequence. The input array is assumed to be in Ncbi-NA4 representation.

Parameters
seqinst: The Seq-inst to return the data in. [out]
seq_buffer: The input sequence data in Ncbi-NA4 format. [in]
length: The length (in bases) of the input data. [in]
amb_chars: The ambiguity data for this sequence. [in]

Definition at line 915 of file seqdbvol.cpp.

References CSeq_inst_Base::eMol_na, s_SeqDBMapNA2ToNA4(), s_SeqDBRebuildDNA_NA4(), CSeq_inst_Base::SetMol(), and CSeq_inst_Base::SetSeq_data().

◆ s_SeqDBWriteSeqDataProt()

static void s_SeqDBWriteSeqDataProt ( CSeq_inst &  seqinst,
const char *  seq_buffer,
int  length 
)
static

Store protein sequence data in a Seq-inst.

This function reads length elements from seq_buffer and stores them in a Seq-inst object. It also sets appropriate encoding information in that object.

Parameters
seqinst: The Seq-inst to return the data in. [out]
seq_buffer: The input sequence data. [in]
length: The length (in bases) of the input data. [in]

Definition at line 840 of file seqdbvol.cpp.

References CSeq_inst_Base::eMol_aa, i, CSeq_inst_Base::SetMol(), and CSeq_inst_Base::SetSeq_data().

Referenced by CSeqDBVol::GetBioseq().

◆ SeqDB_UnpackAmbiguities()

void SeqDB_UnpackAmbiguities ( const CTempString &  sequence,
const CTempString &  ambiguities,
string &  result 
)

Unpack an ambiguous nucleotide sequence.

This method provides a way to unpack nucleotide sequence data that has been packed in blast database format. One source of such data is the GetRawSeqAndAmbig() method in the CSeqDBExpert class. The output format is ncbi8na.

Parameters
sequence: Sequence data in NA2 format with encoded length. [in]
ambiguities: Sequence ambiguities packed in blastdb format. [in]
result: Unpacked sequence in Ncbi NA8 format. [out]

Definition at line 1698 of file seqdbvol.cpp.

References base_length, CTempString::data(), free(), i, CTempString::length(), malloc(), NCBI_THROW, compile_time_bits::range(), result, s_SeqDBMapNA2ToNA8(), s_SeqDBRebuildDNA_NA8(), and SeqDB_GetStdOrd().

Referenced by CWriteDB_Impl::x_ComputeHash().

Variable Documentation

◆ SeqDB_ncbina8_to_blastna8

unsigned SeqDB_ncbina8_to_blastna8[]
Initial value:
= {
15,
0,
1,
6,
2,
4,
9,
13,
3,
8,
5,
12,
7,
11,
10,
14
}

Definition at line 571 of file seqdbvol.cpp.

Referenced by s_SeqDBMapNcbiNA8ToBlastNA8().

Modified on Mon Jun 24 05:19:08 2024 by modify_doxy.py rev. 669887