35 #ifndef ALGO_BLAST_CORE__BLAST_STAT__H
36 #define ALGO_BLAST_CORE__BLAST_STAT__H
53 #define BLAST_MATRIX_NOMINAL 0
54 #define BLAST_MATRIX_PREFERRED 1
55 #define BLAST_MATRIX_BEST 2
58 #define BLASTMAT_DIR "/usr/ncbi/blast/matrix"
61 typedef char* (*GET_MATRIX_PATH) (
const char*,
Boolean);
121 #define BLAST_SCORE_MIN INT2_MIN
122 #define BLAST_SCORE_MAX INT2_MAX
259 Int4 compressed_alphabet_size,
260 double scale_factor);
453 Int4 gap_extend,
const char* matrix_name);
578 Int8 searchsp,
Boolean dodecay,
double gap_decay_rate);
597 Int4 query_length,
Int4 subject_length,
598 Int8 searchsp_eff,
double weight_divisor);
625 Int2 num,
double xsum,
626 Int4 query_length,
Int4 subject_length,
627 Int8 searchsp_eff,
double weight_divisor);
644 Int4 query_length,
Int4 subject_length,
645 Int8 searchsp_eff,
double weight_divisor );
657 Int4* gap_extension);
671 Int4* gap_extension);
715 double *alpha,
double *beta);
735 const Uint1 * rps_query_seq,
Int4 db_seq_length,
782 double alpha_d_lambda,
787 Int4 * length_adjustment);
Defines to provide correct exporting from BLAST DLL in Windows.
#define NCBI_XBLAST_EXPORT
NULL operations for other cases.
Structures for BLAST messages.
Definitions for various programs supported by core BLAST.
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definitions and functions associated with the BlastQueryInfo structure.
char *(* GET_MATRIX_PATH)(const char *, Boolean)
callback to resolve the path to blast score matrices
BlastScoreBlk * BlastScoreBlkFree(BlastScoreBlk *sbp)
Deallocates BlastScoreBlk as well as all associated structures.
Int2 BLAST_GetProteinGapExistenceExtendParams(const char *matrixName, Int4 *gap_existence, Int4 *gap_extension)
Extract the recommended gap existence and extension values.
double BLAST_GapDecayDivisor(double decayrate, unsigned nsegs)
Compute a divisor used to weight the evalue of a collection of "nsegs" distinct alignments.
char * BLAST_PrintAllowedValues(const char *matrix, Int4 gap_open, Int4 gap_extend)
Prints a messages about the allowed open etc values for the given matrix, BlastKarlinBlkGappedFill sh...
Int2 Blast_GetStdAlphabet(Uint1 alphabet_code, Uint1 *residues, Uint4 residue_size)
Fills a buffer with the 'standard' alphabet (given by STD_AMINO_ACID_FREQS[index]....
double BLAST_LargeGapSumE(Int2 num, double xsum, Int4 query_length, Int4 subject_length, Int8 searchsp_eff, double weight_divisor)
Calculates the e-value if a collection of distinct alignments with arbitrarily large gaps between the...
struct Blast_ResFreq Blast_ResFreq
Stores the letter frequency of a sequence or database.
struct Blast_KarlinBlk Blast_KarlinBlk
Structure to hold the Karlin-Altschul parameters.
Int2 Blast_GetNuclAlphaBeta(Int4 reward, Int4 penalty, Int4 gap_open, Int4 gap_extend, Blast_KarlinBlk *kbp, Boolean gapped_calculation, double *alpha, double *beta)
Extract the alpha and beta settings for these substitution and gap scores.
void Blast_FillResidueProbability(const Uint1 *sequence, Int4 length, double *resProb)
Given a sequence of 'length' amino acid residues, compute the probability of each residue and put tha...
double Blast_KarlinLambdaNR(Blast_ScoreFreq *sfp, double initialLambdaGuess)
Calculates the parameter Lambda given an initial guess for its value.
Int4 ** RPSRescalePssm(double scalingFactor, Int4 rps_query_length, const Uint1 *rps_query_seq, Int4 db_seq_length, Int4 **posMatrix, BlastScoreBlk *sbp)
Rescale the PSSM, using composition-based statistics, for use with RPS BLAST.
struct SPsiBlastScoreMatrix SPsiBlastScoreMatrix
Scoring matrix data used in PSI-BLAST.
Blast_ResFreq * Blast_ResFreqFree(Blast_ResFreq *rfp)
Deallocates Blast_ResFreq and prob0 element.
SCompressedAlphabet * SCompressedAlphabetFree(SCompressedAlphabet *alphabet)
Free a compressed alphabet and score matrix.
double BLAST_KarlinEtoP(double x)
Convert an E-value to a P-value.
struct Blast_GumbelBlk Blast_GumbelBlk
Structure to hold the Gumbel parameters (for FSC).
Int2 BlastScoreBlkNuclMatrixCreate(BlastScoreBlk *sbp)
Fill in the matrix for blastn using the penaly and rewards The query sequence alphabet is blastna,...
Int2 Blast_KarlinBlkGappedCalc(Blast_KarlinBlk *kbp, Int4 gap_open, Int4 gap_extend, const char *matrix_name, Blast_Message **error_return)
Fills in lambda, H, and K values, as calculated by Stephen Altschul in Methods in Enzy.
SBlastScoreMatrix * SBlastScoreMatrixFree(SBlastScoreMatrix *matrix)
Deallocates SBlastScoreMatrix structure.
Blast_KarlinBlk * Blast_KarlinBlkNew(void)
Callocs a Blast_KarlinBlk.
struct BlastScoreBlk BlastScoreBlk
Structure used for scoring calculations.
Blast_KarlinBlk * Blast_KarlinBlkFree(Blast_KarlinBlk *kbp)
Deallocates the KarlinBlk.
Int2 BLAST_ScoreSetAmbigRes(BlastScoreBlk *sbp, char ambiguous_res)
Set the ambiguous residue (e.g, 'N', 'X') in the BlastScoreBlk*.
Int2 Blast_KarlinBlkUngappedCalc(Blast_KarlinBlk *kbp, Blast_ScoreFreq *sfp)
Computes the parameters lambda, H K for use in calculating the statistical significance of high-scori...
double BLAST_KarlinPtoE(double p)
Convert a P-value to an E-value.
SNCBIPackedScoreMatrix * BlastScoreBlkGetCompiledInMatrix(const char *name)
Returns a pointer to the static compiled in version of the matrix.
SCompressedAlphabet * SCompressedAlphabetNew(BlastScoreBlk *sbp, Int4 compressed_alphabet_size, double scale_factor)
Allocate a new compressed alphabet and score matrix.
Int2 Blast_ScoreBlkKbpUngappedCalc(EBlastProgramType program, BlastScoreBlk *sbp, Uint1 *query, const BlastQueryInfo *query_info, Blast_Message **blast_message)
Calculate and fill the ungapped Karlin-Altschul parameters in the BlastScoreBlk structure (fields kbp...
Int2 Blast_ResFreqStdComp(const BlastScoreBlk *sbp, Blast_ResFreq *rfp)
Calculates residues frequencies given a standard distribution.
double BLAST_SpougeStoE(Int4 S, Blast_KarlinBlk *kbp, Blast_GumbelBlk *gbp, Int4 qlen, Int4 slen)
Calculates the Expect value based upon the Spouge's FSC method.
Int2 Blast_KarlinBlkGappedLoadFromTables(Blast_KarlinBlk *kbp, Int4 gap_open, Int4 gap_extend, const char *matrix_name, Boolean standard_only)
Attempts to fill KarlinBlk for given gap opening, extensions etc.
char * BLAST_PrintMatrixMessage(const char *matrix, Boolean standard_only)
Prints a messages about the allowed matrices, BlastKarlinBlkGappedFill should return 1 before this is...
Blast_ScoreFreq * Blast_ScoreFreqFree(Blast_ScoreFreq *sfp)
Deallocates the score frequencies structure.
Int2 Blast_GumbelBlkCalc(Blast_GumbelBlk *gbp, Int4 gap_open, Int4 gap_extend, const char *matrix_name, Blast_Message **error_return)
Fills in gumbel parameters to estimate p-value using FSC.
SBlastScoreMatrix * SBlastScoreMatrixNew(size_t ncols, size_t nrows)
Allocates a new SBlastScoreMatrix structure of the specified dimensions.
Int2 Blast_KarlinBlkNuclGappedCalc(Blast_KarlinBlk *kbp, Int4 gap_open, Int4 gap_extend, Int4 reward, Int4 penalty, Blast_KarlinBlk *kbp_ungap, Boolean *round_down, Blast_Message **error_return)
Retrieves Karlin-Altschul parameters from precomputed tables, given the substitution and gap scores.
Blast_ResFreq * Blast_ResFreqNew(const BlastScoreBlk *sbp)
Allocates a new Blast_ResFreq structure and fills in the prob element based upon the contents of sbp.
double BLAST_KarlinStoE_simple(Int4 S, Blast_KarlinBlk *kbp, Int8 searchsp)
Calculates the Expect value based upon the search space and some Karlin-Altschul parameters.
Int4 BLAST_SpougeEtoS(double E, Blast_KarlinBlk *kbp, Blast_GumbelBlk *gbp, Int4 qlen, Int4 slen)
Estimate the score for a specified expect value.
struct SCompressedAlphabet SCompressedAlphabet
Scoring matrix data used for compressed protein alphabets.
int BlastScoreBlkCheck(BlastScoreBlk *sbp)
Check that score blk is valid, returns zero if it is.
struct erfc_table erfc_table
Tabulated results for faster erfc(x) lookup.
Int2 Blast_ScoreBlkKbpIdealCalc(BlastScoreBlk *sbp)
Calculates the Karlin-Altschul parameters assuming standard residue compositions for the query and su...
Boolean BLAST_CheckRewardPenaltyScores(Int4 reward, Int4 penalty)
Check the validity of the reward and penalty scores.
double BLAST_UnevenGapSumE(Int4 query_start_points, Int4 subject_start_points, Int2 num, double xsum, Int4 query_length, Int4 subject_length, Int8 searchsp_eff, double weight_divisor)
Calculates the e-value of a collection multiple distinct alignments with asymmetric gaps between the ...
void BLAST_GetAlphaBeta(const char *matrixName, double *alpha, double *beta, Boolean gapped, Int4 gap_open, Int4 gap_extend, const Blast_KarlinBlk *kbp_ungapped)
Extract the alpha and beta settings for this matrixName, and these gap open and gap extension costs.
Blast_ScoreFreq * Blast_ScoreFreqNew(Int4 score_min, Int4 score_max)
Creates a new structure to keep track of score frequencies for a scoring system.
struct SBlastScoreMatrix SBlastScoreMatrix
Scoring matrix used in BLAST.
Int2 BLAST_Cutoffs(Int4 *S, double *E, Blast_KarlinBlk *kbp, Int8 searchsp, Boolean dodecay, double gap_decay_rate)
Calculate the cutoff score from the expected number of HSPs or vice versa.
SPsiBlastScoreMatrix * SPsiBlastScoreMatrixNew(size_t ncols)
Allocates a new SPsiBlastScoreMatrix structure of dimensions ncols by BLASTAA_SIZE.
Int2 Blast_KarlinBlkCopy(Blast_KarlinBlk *kbp_to, Blast_KarlinBlk *kbp_from)
Copies contents of one Karlin block to another.
Int2 BLAST_GetNucleotideGapExistenceExtendParams(Int4 reward, Int4 penalty, Int4 *gap_existence, Int4 *gap_extension)
Extract the recommended gap existence and extension values.
Int2 Blast_GumbelBlkLoadFromTables(Blast_GumbelBlk *gbp, Int4 gap_open, Int4 gap_extend, const char *matrix_name)
Attempts to fill GumbelBlk for given gap opening, extensions etc.
Int4 BLAST_ComputeLengthAdjustment(double K, double logK, double alpha_d_lambda, double beta, Int4 query_length, Int8 db_length, Int4 db_num_seqs, Int4 *length_adjustment)
Computes the adjustment to the lengths of the query and database sequences that is used to compensate...
BlastScoreBlk * BlastScoreBlkNew(Uint1 alphabet, Int4 number_of_contexts)
Allocates and initializes BlastScoreBlk.
struct Blast_ScoreFreq Blast_ScoreFreq
Holds score frequencies used in calculation of Karlin-Altschul parameters for an ungapped search.
SPsiBlastScoreMatrix * SPsiBlastScoreMatrixFree(SPsiBlastScoreMatrix *matrix)
Deallocates a SPsiBlastScoreMatrix structure.
Int2 Blast_ScoreBlkMatrixFill(BlastScoreBlk *sbp, GET_MATRIX_PATH get_path)
This function fills in the BlastScoreBlk structure.
double BLAST_SmallGapSumE(Int4 start_points, Int2 num, double xsum, Int4 query_length, Int4 subject_length, Int8 searchsp_eff, double weight_divisor)
Calculates the e-value for alignments with "small" gaps (typically under fifty residues/basepairs) fo...
uint8_t Uint1
1-byte (8-bit) unsigned integer
int16_t Int2
2-byte (16-bit) signed integer
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
int64_t Int8
8-byte (64-bit) signed integer
Type and macro definitions from C toolkit that are not defined in C++ toolkit.
Uint1 Boolean
bool replacment for C
The query related information.
Structure used for scoring calculations.
Boolean protein_alphabet
TRUE if alphabet_code is for a protein alphabet (e.g., ncbistdaa etc.), FALSE for nt.
Blast_KarlinBlk ** kbp
Karlin-Altschul parameters.
Blast_KarlinBlk ** kbp_psi
K-A parameters for position-based alignments.
Blast_ScoreFreq ** sfp
score frequencies for scoring matrix.
Boolean matrix_only_scoring
Score ungapped/gapped alignment only using the matrix parameters and with raw scores.
double scale_factor
multiplier for all cutoff and dropoff scores
Int2 ambig_occupy
How many occupied?
Blast_KarlinBlk ** kbp_gap
K-A parameters for gapped alignments.
Boolean round_down
Score must be rounded down to nearest even score if odd.
Int2 ambig_size
size of array above.
char * name
name of scoring matrix.
SPsiBlastScoreMatrix * psi_matrix
PSSM and associated data.
Int2 alphabet_start
numerical value of 1st letter.
Int2 alphabet_size
size of alphabet.
Uint1 alphabet_code
NCBI alphabet code.
Int4 penalty
penalty for mismatch in blastn.
Uint1 * ambiguous_res
Array of ambiguous res.
Int4 number_of_contexts
Used by sfp and kbp, how large are these.
Boolean read_in_matrix
If TRUE, matrix is read in, otherwise produce one from penalty and reward above.
SBlastScoreMatrix * matrix
scoring matrix data
ListNode * comments
Comments about scoring matrix.
Boolean complexity_adjusted_scoring
Use cross_match-like complexity adjustment on raw scores.
Blast_KarlinBlk * kbp_ideal
Ideal values (for query with average database composition).
Blast_KarlinBlk ** kbp_gap_std
K-A parameters for std (not position-based) alignments.
Blast_KarlinBlk ** kbp_std
K-A parameters for ungapped alignments.
Int4 reward
reward for match in blastn.
Blast_KarlinBlk ** kbp_gap_psi
K-A parameters for psi alignments.
Blast_GumbelBlk * gbp
Gumbel parameters for FSC.
Structure to hold the Gumbel parameters (for FSC).
Boolean filled
flag indicate the values of gbp are prepared
double Tau
2*G*(alpha_un - Sigma)
double Alpha_un
Ungapped alpha.
double Sigma
cov(L) = sigma y + tau
double G
G is the total penalty for extension.
double Beta
2*G*(alpha_un - alpha)
Int8 db_length
total length of database
double Lambda
the unscaled Lambda value
double Alpha
var(L) = alpha y + beta
Structure to hold the Karlin-Altschul parameters.
double paramC
for use in seed.
double K
K value used in statistics.
double Lambda
Lambda value used in statistics.
double H
H value used in statistics.
double logK
natural log of K value used in statistics
Structure to hold the a message from the core of the BLAST engine.
Stores the letter frequency of a sequence or database.
Uint1 alphabet_code
indicates alphabet.
double * prob0
probs, zero offset.
double * prob
letter probs, (possible) non-zero offset.
Holds score frequencies used in calculation of Karlin-Altschul parameters for an ungapped search.
double * sprob0
arrays for frequency of given score
double score_avg
average score, must be negative for local alignment.
Int4 score_max
highest allowed scores
Int4 obs_min
lowest observed (actual) scores
double * sprob
arrays for frequency of given score, shifted down by score_min.
Int4 score_min
lowest allowed scores
Int4 obs_max
highest observed (actual) scores
A generic linked list node structure.
Scoring matrix used in BLAST.
size_t nrows
number of rows
double lambda
derived value of the matrix lambda -RMH-
double * freqs
array of assumed matrix background frequencies -RMH-
size_t ncols
number of columns
int ** data
actual scoring matrix data, stored in row-major form
Scoring matrix data used for compressed protein alphabets.
Uint1 * compress_table
translation table (AA->compressed)
Int4 compressed_alphabet_size
letters in the compressed alphabet
SBlastScoreMatrix * matrix
score matrix
Scoring matrix data used in PSI-BLAST.
SBlastScoreMatrix * pssm
position-specific score matrix
double ** freq_ratios
PSSM's frequency ratios, dimensions are specified in pssm data above.
Blast_KarlinBlk * kbp
Karlin-Altschul block associated with this PSSM.
Tabulated results for faster erfc(x) lookup.