102 "Failed to initialize blast score block");
120 "Failed to initialize score matrix");
149 "Failed to initialize Karlin blocks");
156 m_EffectiveSearchSpace(0)
162 : m_EffectiveSearchSpace(0)
171 : m_EffectiveSearchSpace(0)
187 "CScoreBuilder::GetBlastScore(): "
188 "only two-row alignments are supported");
200 "CScoreBuilder::GetBlastScore(): " +
202 +
" is not supported");
206 static const unsigned char reverse_4na[16] = {0, 8, 4, 0, 2, 0, 0, 0, 1};
213 "CScoreBuilder::GetBlastScore(): "
214 "only dense-seg alignments are supported");
219 "Blast scoring parameters have not been specified");
222 int computed_score = 0;
241 "Protein scoring parameters required");
251 if (start1 == -1 || start2 == -1) {
252 computed_score -= gap_open + gap_extend * seg_len;
258 for (
TSeqPos pos = 0; pos < seg_len; ++pos) {
259 unsigned char c1 = vec1[start1 + pos];
260 unsigned char c2 = vec2[start2 + pos];
261 computed_score += matrix[c1][c2];
272 "Nucleotide scoring parameters required");
275 bool scaled_up =
false;
276 if (gap_open == 0 && gap_extend == 0) {
279 gap_extend =
match / 2 - mismatch;
293 if (start1 == -1 || start2 == -1) {
294 computed_score -= gap_open + gap_extend * seg_len;
301 if (strand1 > strand2) {
302 for (
TSeqPos pos = 0; pos < seg_len; ++pos) {
303 unsigned char c1 = vec1[start1 + pos];
304 unsigned char c2 = vec2[start2 + seg_len - 1 - pos];
309 else if (strand1 < strand2) {
310 for (
TSeqPos pos = 0; pos < seg_len; ++pos) {
311 unsigned char c1 = vec1[start1 + seg_len - 1 - pos];
312 unsigned char c2 = vec2[start2 + pos];
318 for (
TSeqPos pos = 0; pos < seg_len; ++pos) {
319 unsigned char c1 = vec1[start1 + pos];
320 unsigned char c2 = vec2[start2 + pos];
321 computed_score += (c1 == c2) ?
match : mismatch;
330 computed_score =
max(0, computed_score);
331 return computed_score;
354 unique_ptr<CSeq_align> swapped_align_ptr;
357 swapped_align_ptr->
Assign(align);
358 swapped_align_ptr->SwapRows(0,1);
359 align_ptr = swapped_align_ptr.get();
362 list<CRef<CPairwiseAln> > pairs;
364 pairs.push_back(aln);
377 "CScore_TblastnScore: "
378 "valid only for protein spliced-seg alignments");
381 list<CRef<CPairwiseAln> > pairs;
388 sub_align.
SetSegs().SetSpliced().SetExons().clear();
389 sub_align.
SetSegs().SetSpliced().SetExons().push_back(exon);
393 if (exon->IsSetAcceptor_before_exon() || pairs.empty()) {
394 pairs.push_back(aln);
397 pairs.back()->push_back(*
r);
445 int this_pair_score = -1;
447 CPairwiseAln::const_iterator
prev = aln->end();
451 if (
prev != aln->end()) {
452 int q_gap = range_it->GetFirstFrom() -
prev->GetFirstTo() - 1;
455 prev->GetSecondFrom() - range_it->GetSecondTo() - 1 :
456 range_it->GetSecondFrom() -
prev->GetSecondTo() - 1);
460 int gap =
abs(q_gap - s_gap);
468 int s_start = s_range.
GetFrom();
469 int s_end = s_range.
GetTo();
472 int new_offs = q_pos % 3;
473 for ( ; offs != new_offs; offs = (offs + 1) % 3) {
484 for ( ; s_start <= s_end; ++s_start, ++q_pos, ++vec_it) {
496 int this_score = matrix[
prot][xlate];
502 this_pair_score += this_score;
505 offs = (offs + 1) % 3;
515 ITERATE(list<int>, gap_bases, gaps) {
518 if (new_score > 0 ) {
519 this_pair_score = new_score;
521 this_pair_score -= 1;
525 score += this_pair_score;
549 "E-value calculation requires search space "
576 d == numeric_limits<double>::quiet_NaN()) {
579 if (d > 1e35 || d < -1e35) {
589 d == numeric_limits<double>::quiet_NaN()) {
592 if (d > 1e35 || d < -1e35) {
684 flipped_BTOP.insert(0, match_str);
693 "Traceback strings can only be calculated for pairwise "
694 "Dense-seg alignments");
699 string BTOP, flipped_BTOP;
705 for (
unsigned idx = 0; idx <
subject.size(); ++idx) {
711 flipped_BTOP.insert(flipped_BTOP.begin(), complement[0]);
712 flipped_BTOP.insert(flipped_BTOP.begin(),
'-');
715 for (
unsigned idx = 0; idx <
query.size(); ++idx) {
721 flipped_BTOP.insert(flipped_BTOP.begin(),
'-');
722 flipped_BTOP.insert(flipped_BTOP.begin(), complement[0]);
726 for (
unsigned idx = 0; idx <
query.size(); ++idx) {
728 "inconsistent aligned segment length");
736 string query_complement;
738 idx, 1, query_complement);
739 string subject_complement;
741 idx, 1, subject_complement);
742 flipped_BTOP.insert(flipped_BTOP.begin(),
743 subject_complement[0]);
744 flipped_BTOP.insert(flipped_BTOP.begin(),
745 query_complement[0]);
751 return pair<string,string>(
753 ? BTOP : flipped_BTOP);
760 if ((*ext_it)->GetType().IsStr() &&
761 (*ext_it)->GetType().GetStr() ==
"Tracebacks")
764 tracebacks = *ext_it;
771 tracebacks->
SetType().SetStr(
"Tracebacks");
772 align.
SetExt().push_back(tracebacks);
773 }
else if (tracebacks->
HasField(
"Query") && tracebacks->
HasField(
"Subject"))
778 tracebacks->
SetField(
"Query").
SetData().SetStr(traceback_strings.first);
779 tracebacks->
SetField(
"Subject").
SetData().SetStr(traceback_strings.second);
795 if ((*ext_it)->GetType().IsStr() &&
796 (*ext_it)->GetType().GetStr() ==
"Tracebacks")
798 string field = row == 0 ?
"Query" :
"Subject";
799 if ((*ext_it)->HasField(field)) {
800 return (*ext_it)->GetField(field).GetData().GetStr();
813 if (!stored_traceback.empty()) {
814 return stored_traceback;
819 return row == 0 ? traceback_strings.first : traceback_strings.second;
834 align.
SetSegs().SetDenseg().SetScores().clear();
839 align.
SetSegs().SetDisc().Set()) {
845 align.
SetSegs().SetSpliced().SetExons()) {
846 (*exon_iter)->SetScores().Set().clear();
850 align.
SetSegs().SetSparse().SetRow_scores().clear();
856 (*std_iter)->SetScores().clear();
863 align.
SetId().clear();
876 stringstream cleanStr;
879 checksum.
AddLine(cleanStr.str());
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CRef< CPairwiseAln > CreatePairwiseAlnFromSeqAlign(const objects::CSeq_align &seq_align)
A simple API that assumes that the seq_align has exactly two rows and you want to create a pairwise w...
CRef< CSeq_align > ConvertSeq_align(const CSeq_align &src, CSeq_align::TSegs::E_Choice dst_choice, CSeq_align::TDim anchor_row=-1, CScope *scope=NULL)
Convert source alignment to a new type.
Contains C++ wrapper classes to structures in algo/blast/core as well as some auxiliary functions to ...
Declarations of static arrays used to define some NCBI encodings to be used in a toolkit independent ...
The structures and functions in blast_options.
Int2 BLAST_FillScoringOptions(BlastScoringOptions *options, EBlastProgramType program, Boolean greedy_extension, Int4 penalty, Int4 reward, const char *matrix, Int4 gap_open, Int4 gap_extend)
Fill non-default values in the BlastScoringOptions structure.
Int2 BlastScoringOptionsNew(EBlastProgramType program, BlastScoringOptions **options)
Allocate memory for BlastScoringOptions and fill with default values.
BlastScoringOptions * BlastScoringOptionsFree(BlastScoringOptions *options)
Deallocate memory for BlastScoringOptions.
Declares the CBlastOptionsHandle and CBlastOptionsFactory classes.
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Utilities initialize/setup BLAST.
Int2 Blast_ScoreBlkMatrixInit(EBlastProgramType program_number, const BlastScoringOptions *scoring_options, BlastScoreBlk *sbp, GET_MATRIX_PATH get_path)
Initializes the substitution matrix in the BlastScoreBlk according to the scoring options specified.
Definitions and prototypes used by blast_stat.c to calculate BLAST statistics.
BlastScoreBlk * BlastScoreBlkFree(BlastScoreBlk *sbp)
Deallocates BlastScoreBlk as well as all associated structures.
Int2 Blast_KarlinBlkGappedCalc(Blast_KarlinBlk *kbp, Int4 gap_open, Int4 gap_extend, const char *matrix_name, Blast_Message **error_return)
Fills in lambda, H, and K values, as calculated by Stephen Altschul in Methods in Enzymology.
Blast_KarlinBlk * Blast_KarlinBlkNew(void)
Callocs a Blast_KarlinBlk.
Int2 Blast_KarlinBlkNuclGappedCalc(Blast_KarlinBlk *kbp, Int4 gap_open, Int4 gap_extend, Int4 reward, Int4 penalty, Blast_KarlinBlk *kbp_ungap, Boolean *round_down, Blast_Message **error_return)
Retrieves Karlin-Altschul parameters from precomputed tables, given the substitution and gap scores.
double BLAST_KarlinStoE_simple(Int4 S, Blast_KarlinBlk *kbp, Int8 searchsp)
Calculates the Expect value based upon the search space and some Karlin-Altschul parameters.
Int2 Blast_ScoreBlkKbpIdealCalc(BlastScoreBlk *sbp)
Calculates the Karlin-Altschul parameters assuming standard residue compositions for the query and su...
BlastScoreBlk * BlastScoreBlkNew(Uint1 alphabet, Int4 number_of_contexts)
Allocates and initializes BlastScoreBlk.
EProgram
This enumeration is to evolve into a task/program specific list that specifies sets of default parame...
@ eVecScreen
Vector screening.
@ eBlastn
Nucl-Nucl (traditional blastn)
@ ePHIBlastn
Nucleotide PHI BLAST.
@ eBlastp
Protein-Protein.
@ eTblastn
Protein-Translated nucl.
@ eMegablast
Nucl-Nucl (traditional megablast)
@ eDiscMegablast
Nucl-Nucl using discontiguous megablast.
Checksum and hash calculation classes.
TSignedSeqPos GetStart(TNumrow row, TNumseg seg, int offset=0) const
TSeqPos GetLen(TNumseg seg, int offset=0) const
CDense_seg::TNumseg TNumseg
int StrandSign(TNumrow row) const
TNumseg GetNumSegs(void) const
const CBioseq_Handle & GetBioseqHandle(TNumrow row) const
string & GetSegSeqString(string &buffer, TNumrow row, TNumseg seg, TNumseg offset=0) const
Handle to the options to the BLAST algorithm.
Encapsulates ALL the BLAST algorithm's options.
CChecksum – Checksum calculator.
static const CTrans_table & GetTransTable(int id)
A pairwise aln is a collection of ranges for a pair of rows.
double ComputeScore(CScope &scope, const CSeq_align &align, CSeq_align::EScoreType score)
int GetBlastScoreProtToNucl(CScope &scope, const CSeq_align &align, list< CRef< CPairwiseAln > > &pairs)
int ComputeTieBreaker(const CSeq_align &align)
void AddTracebacks(CScope &scope, CSeq_align &align)
enum blast::EProgram m_BlastType
~CScoreBuilder()
Destructor.
void AddScore(CScope &scope, CSeq_align &align, EScoreType score)
deprecated: use CSeq_align::EScoreType directly
double GetBlastBitScore(CScope &scope, const CSeq_align &align)
Compute the BLAST bit score.
struct BlastScoreBlk * m_ScoreBlk
int GetBlastScore(CScope &scope, const CSeq_align &align)
Compute the BLAST score of the alignment.
void AddTieBreaker(CSeq_align &align)
int GetBlastScoreDenseg(CScope &scope, const CSeq_align &align)
void x_Initialize(blast::CBlastOptionsHandle &options)
double ComputeScore(CScope &scope, const CSeq_align &align, const CRangeCollection< TSeqPos > &ranges, CSeq_align::EScoreType score)
int GetBlastScoreSpliced(CScope &scope, const CSeq_align &align)
Int8 m_EffectiveSearchSpace
int GetBlastScoreStd(CScope &scope, const CSeq_align &align)
string GetTraceback(const CSeq_align &align, CSeq_align::TDim row)
double GetBlastEValue(CScope &scope, const CSeq_align &align)
Compute the BLAST e-value.
CScoreBuilder()
Default constructor.
static SIZE_TYPE Complement(const string &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst)
EScoreType
enum controlling known named scores
void SetNamedScore(const string &id, int score)
TDim CheckNumRows(void) const
Validators.
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
ENa_strand GetSeqStrand(TDim row) const
Get strand (the first one if segments have different strands).
char GetStartResidue(int state) const
char GetCodonResidue(int state) const
static int NextCodonState(int state, unsigned char ch)
bool HasField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Verify that a named field exists.
CUser_field & SetField(const string &str, const string &delim=".", const string &obj_subtype=kEmptyStr, NStr::ECase use_case=NStr::eCase)
Access a named field in this user object.
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
#define BLASTNA_SEQ_CODE
Identifies the blastna alphabet, for use in blast only.
static CBlastOptionsHandle * Create(EProgram program, EAPILocality locality=CBlastOptions::eLocal)
Creates an options handle object configured with default options for the requested program,...
EBlastProgramType EProgramToEBlastProgramType(EProgram p)
Convert EProgram to EBlastProgramType.
int GetGapExtensionCost() const
#define BLASTAA_SEQ_CODE
== Seq_code_ncbistdaa
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
EProgram GetProgram() const
Accessors/Mutators for individual options.
const CBlastOptions & GetOptions() const
Return the object which this object is a handle for.
int GetMismatchPenalty() const
int GetMatchReward() const
int GetGapOpeningCost() const
const char * GetMatrixName() const
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
int TSignedSeqPos
Type for signed sequence position.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
void AddLine(const char *line, size_t len)
Uint4 GetChecksum(void) const
Return calculated checksum.
#define NCBI_ASSERT(expr, mess)
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
#define MSerial_AsnText
I/O stream manipulators.
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
const COrg_ref & GetOrg_ref(const CBioseq_Handle &handle)
Return the org-ref associated with a given sequence.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
TMol GetSequenceType(void) const
CRef< CSeq_loc > GetRangeSeq_loc(TSeqPos start, TSeqPos stop, ENa_strand strand=eNa_strand_unknown) const
Return CSeq_loc referencing the given range and strand on the bioseq. If start == 0,...
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
TMol GetSequenceType(void) const
void Reset(void)
Reset reference object.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int16_t Int2
2-byte (16-bit) signed integer
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
TTo GetTo(void) const
Get the To member data.
TFrom GetFrom(void) const
Get the From member data.
void SetType(TType &value)
Assign a value to Type data member.
void SetData(TData &value)
Assign a value to Data data member.
const TDenseg & GetDenseg(void) const
Get the variant data.
TId & SetId(void)
Assign a value to Id data member.
TScore & SetScore(void)
Assign a value to Score data member.
E_Choice Which(void) const
Which variant is currently selected.
list< CRef< CStd_seg > > TStd
bool IsSetExt(void) const
Extra info. Check if a value has been assigned to the Ext data member.
static string SelectionName(E_Choice index)
Retrieve selection name (for diagnostic purposes).
void SetSegs(TSegs &value)
Assign a value to Segs data member.
TProduct_type GetProduct_type(void) const
Get the Product_type member data.
list< CRef< CUser_object > > TExt
bool IsSparse(void) const
Check if variant Sparse is selected.
const TSpliced & GetSpliced(void) const
Get the variant data.
bool CanGetSegs(void) const
Check if it is safe to call GetSegs method.
TExt & SetExt(void)
Assign a value to Ext data member.
list< CRef< CSpliced_exon > > TExons
const TExons & GetExons(void) const
Get the Exons member data.
bool IsStd(void) const
Check if variant Std is selected.
bool IsDisc(void) const
Check if variant Disc is selected.
const TExt & GetExt(void) const
Get the Ext member data.
bool IsSpliced(void) const
Check if variant Spliced is selected.
TNumseg GetNumseg(void) const
Get the Numseg member data.
list< CRef< CSeq_align > > Tdata
TBounds & SetBounds(void)
Assign a value to Bounds data member.
const TSegs & GetSegs(void) const
Get the Segs member data.
bool IsDenseg(void) const
Check if variant Denseg is selected.
ENa_strand
strand of nucleic acid
EMol
molecule class in living organism
unsigned int
A callback function used to compare two keys in a database.
#define NCBIMATH_LN2
Natural log(2)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
static void s_RecordMatch(size_t match, string &BTOP, string &flipped_BTOP)
void s_CleanSeqAlign(CSeq_align &align)
static pair< string, string > s_ComputeTraceback(CScope &scope, const CSeq_align &align)
static const unsigned char reverse_4na[16]
Boolean round_down
Score must be rounded down to nearest even score if odd.
char * name
name of scoring matrix.
Int4 penalty
penalty for mismatch in blastn.
SBlastScoreMatrix * matrix
scoring matrix data
Blast_KarlinBlk * kbp_ideal
Ideal values (for query with average database composition).
Blast_KarlinBlk ** kbp_gap_std
K-A parameters for std (not position-based) alignments.
Int4 reward
reward for match in blastn.
Scoring options block Used to produce the BlastScoreBlk structure This structure may be needed for lo...
Structure to hold the Karlin-Altschul parameters.
double Lambda
Lambda value used in statistics.
double logK
natural log of K value used in statistics
int ** data
actual scoring matrix data, stored in row-major form