NCBI C++ ToolKit
blast_stat.h
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blast_stat.h 71621 2016-03-18 14:26:46Z rackerst $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Tom Madden
27  *
28  */
29 
30 /** @file blast_stat.h
31  * Definitions and prototypes used by blast_stat.c to calculate BLAST
32  * statistics. @todo FIXME: needs doxygen comments
33  */
34 
35 #ifndef ALGO_BLAST_CORE__BLAST_STAT__H
36 #define ALGO_BLAST_CORE__BLAST_STAT__H
37 
44 
45 #ifdef __cplusplus
46 extern "C" {
47 #endif
48 
49 /**
50  * Defines for the matrix 'preferences' (as specified by S. Altschul).
51  * Used in arrays such as blosum45_prefs in blast_stat.c
52  */
53 #define BLAST_MATRIX_NOMINAL 0 /**< acceptable values, not recommended. */
54 #define BLAST_MATRIX_PREFERRED 1 /**< These value are preferred over others. */
55 #define BLAST_MATRIX_BEST 2 /**< This is the best value, only one per matrix. */
56 
57 
58 #define BLASTMAT_DIR "/usr/ncbi/blast/matrix" /**< Default location for blast databases. */
59 
60 /** callback to resolve the path to blast score matrices */
61 typedef char* (*GET_MATRIX_PATH) (const char*, Boolean);
62 
63 /**
64  Structure to hold the Karlin-Altschul parameters.
65 */
66 typedef struct Blast_KarlinBlk {
67  double Lambda; /**< Lambda value used in statistics */
68  double K; /**< K value used in statistics */
69  double logK; /**< natural log of K value used in statistics */
70  double H; /**< H value used in statistics */
71  double paramC; /**< for use in seed. */
73 
74 
75 /**
76  Tabulated results for faster erfc(x) lookup.
77  erfc(x) = 1/2 + 1/sqrt(2*pi) * \int_0^x { exp(-y^2/2)} dy
78  erfc(-\inf) = 0.0
79  erfc(+\inf) = 1.0
80  The erfc(x) is tabulated in p with step h, starting from a to b
81 */
82 typedef struct erfc_table {
83  double eps;
84  double a;
85  double b;
87  double h;
88  double *p;
90 
91 /**
92  Structure to hold the Gumbel parameters (for FSC).
93 */
94 typedef struct Blast_GumbelBlk {
95  double Lambda; /**< the unscaled Lambda value */
96  double C;
97  double G; /**< G is the total penalty for extension */
98  double a; /**< avg(L) = a y + b */
99  double Alpha; /**< var(L) = alpha y + beta */
100  double Sigma; /**< cov(L) = sigma y + tau */
101  double a_un; /**< Ungapped a */
102  double Alpha_un; /**< Ungapped alpha */
103 
104  double b; /**< 2*G*(a_un - a) */
105  double Beta; /**< 2*G*(alpha_un - alpha) */
106  double Tau; /**< 2*G*(alpha_un - Sigma) */
107 
108  Int8 db_length; /**< total length of database */
109 
110  Boolean filled; /**< flag indicate the values of gbp are prepared */
111 
113 
114 
115 /********************************************************************
116 
117  Structures relating to scoring or the BlastScoreBlk
118 
119 ********************************************************************/
120 
121 #define BLAST_SCORE_MIN INT2_MIN /**< minimum allowed score (for one letter comparison). */
122 #define BLAST_SCORE_MAX INT2_MAX /**< maximum allowed score (for one letter comparison). */
123 
124 
125 /** Holds score frequencies used in calculation
126 of Karlin-Altschul parameters for an ungapped search.
127 */
128 typedef struct Blast_ScoreFreq {
129  Int4 score_min; /**< lowest allowed scores */
130  Int4 score_max; /**< highest allowed scores */
131  Int4 obs_min; /**< lowest observed (actual) scores */
132  Int4 obs_max; /**< highest observed (actual) scores */
133  double score_avg; /**< average score, must be negative for local alignment. */
134  double* sprob0; /**< arrays for frequency of given score */
135  double* sprob; /**< arrays for frequency of given score, shifted down by score_min. */
137 
138 /** Scoring matrix used in BLAST */
139 typedef struct SBlastScoreMatrix {
140  int** data; /**< actual scoring matrix data, stored in row-major
141  form */
142  size_t ncols; /**< number of columns */
143  size_t nrows; /**< number of rows */
144  double* freqs; /**< array of assumed matrix background frequencies -RMH-*/
145  double lambda; /**< derived value of the matrix lambda -RMH- */
147 
148 /** Scoring matrix data used in PSI-BLAST */
149 typedef struct SPsiBlastScoreMatrix {
150  SBlastScoreMatrix* pssm; /**< position-specific score matrix */
151  double** freq_ratios; /**< PSSM's frequency ratios, dimensions are
152  specified in pssm data above */
153  Blast_KarlinBlk* kbp; /**< Karlin-Altschul block associated with this
154  PSSM */
156 
157 /** Allocates a new SPsiBlastScoreMatrix structure of dimensions ncols by
158  * BLASTAA_SIZE.
159  * @param ncols number of columns (i.e.: query length) [in]
160  * @return NULL in case of memory allocation failure, else new
161  * SPsiBlastScoreMatrix structure
162  */
165 SPsiBlastScoreMatrixNew(size_t ncols);
166 
167 /** Deallocates a SPsiBlastScoreMatrix structure.
168  * @param matrix structure to deallocate [in]
169  * @return NULL
170  */
174 
175 /** Structure used for scoring calculations.
176 */
177 typedef struct BlastScoreBlk {
178  Boolean protein_alphabet; /**< TRUE if alphabet_code is for a
179 protein alphabet (e.g., ncbistdaa etc.), FALSE for nt. alphabets. */
180  Uint1 alphabet_code; /**< NCBI alphabet code. */
181  Int2 alphabet_size; /**< size of alphabet. */
182  Int2 alphabet_start; /**< numerical value of 1st letter. */
183  char* name; /**< name of scoring matrix. */
184  ListNode* comments; /**< Comments about scoring matrix. */
185  SBlastScoreMatrix* matrix; /**< scoring matrix data */
186  SPsiBlastScoreMatrix* psi_matrix; /**< PSSM and associated data. If this
187  is not NULL, then the BLAST search is
188  position specific (i.e.: PSI-BLAST) */
189  Boolean matrix_only_scoring; /**< Score ungapped/gapped alignment only
190  using the matrix parameters and
191  with raw scores. Ignore
192  penalty/reward and do not report
193  Karlin-Altschul stats. This is used
194  by the rmblastn program. -RMH- */
195  Boolean complexity_adjusted_scoring; /**< Use cross_match-like complexity
196  adjustment on raw scores. -RMH- */
197  Int4 loscore; /**< Min. substitution scores */
198  Int4 hiscore; /**< Max. substitution scores */
199  Int4 penalty; /**< penalty for mismatch in blastn. */
200  Int4 reward; /**< reward for match in blastn. */
201  double scale_factor; /**< multiplier for all cutoff and dropoff scores */
202  Boolean read_in_matrix; /**< If TRUE, matrix is read in, otherwise
203  produce one from penalty and reward above. @todo should this be
204  an allowed way of specifying the matrix to use? */
205  Blast_ScoreFreq** sfp; /**< score frequencies for scoring matrix. */
206  /* kbp & kbp_gap are ptrs that should be set to kbp_std, kbp_psi, etc. */
207  Blast_KarlinBlk** kbp; /**< Karlin-Altschul parameters. Actually just a placeholder. */
208  Blast_KarlinBlk** kbp_gap; /**< K-A parameters for gapped alignments. Actually just a placeholder. */
209  Blast_GumbelBlk* gbp; /**< Gumbel parameters for FSC. */
210  /* Below are the Karlin-Altschul parameters for non-position based ('std')
211  and position based ('psi') searches. */
212  Blast_KarlinBlk **kbp_std, /**< K-A parameters for ungapped alignments */
213  **kbp_psi, /**< K-A parameters for position-based alignments. */
214  **kbp_gap_std, /**< K-A parameters for std (not position-based) alignments */
215  **kbp_gap_psi; /**< K-A parameters for psi alignments. */
216  Blast_KarlinBlk* kbp_ideal; /**< Ideal values (for query with average database composition). */
217  Int4 number_of_contexts; /**< Used by sfp and kbp, how large are these*/
218  Uint1* ambiguous_res; /**< Array of ambiguous res. (e.g, 'X', 'N')*/
219  Int2 ambig_size, /**< size of array above. FIXME: not needed here? */
220  ambig_occupy; /**< How many occupied? */
221  Boolean round_down; /**< Score must be rounded down to nearest even score if odd. */
223 
224 /** Scoring matrix data used for compressed protein alphabets */
225 typedef struct SCompressedAlphabet {
226  Int4 compressed_alphabet_size; /**< letters in the compressed alphabet */
227  SBlastScoreMatrix* matrix; /**< score matrix */
228  Uint1* compress_table; /**< translation table (AA->compressed)*/
230 
231 /** Deallocates SBlastScoreMatrix structure
232  * @param matrix structure to deallocate [in]
233  * @return NULL
234  */
238 
239 /** Allocates a new SBlastScoreMatrix structure of the specified dimensions.
240  * @param ncols number of columns [in]
241  * @param nrows number of rows [in]
242  * @return NULL in case of memory allocation failure, else new
243  * SBlastScoreMatrix structure
244  */
247 SBlastScoreMatrixNew(size_t ncols, size_t nrows);
248 
249 /** Allocate a new compressed alphabet and score matrix
250  * @param sbp Current score matrix information [in]
251  * @param compressed_alphabet_size Desired size of compressed
252  * alphabet (current choices are limited to 10 or 15) [in]
253  * @param scale_factor Score matrix entries are scaled by this value [in]
254  * @return the new alphabet, or NULL on failure
255  */
259  Int4 compressed_alphabet_size,
260  double scale_factor);
261 
262 /** Free a compressed alphabet and score matrix
263  * @param alphabet The compressed alphabet structure
264  * @return Always NULL
265  */
269 
270 /**
271 Stores the letter frequency of a sequence or database.
272 */
273 typedef struct Blast_ResFreq {
274  Uint1 alphabet_code; /**< indicates alphabet. */
275  double* prob; /**< letter probs, (possible) non-zero offset. */
276  double* prob0; /**< probs, zero offset. */
278 
279 /**
280  * Check that score blk is valid, returns zero if it is.
281  * @param sbp ScoreBlk to check [in]
282  * @return zero if valid
283 */
286 
287 /**
288  * Allocates and initializes BlastScoreBlk
289  * @param alphabet either BLASTAA_SEQ_CODE or BLASTNA_SEQ_CODE [in]
290  * @param number_of_contexts how many strands or sequences [in]
291  * @return BlastScoreBlk*
292 */
294 BlastScoreBlk* BlastScoreBlkNew (Uint1 alphabet, Int4 number_of_contexts);
295 
296 /** Deallocates BlastScoreBlk as well as all associated structures.
297  * @param sbp BlastScoreBlk to be deallocated [in]
298  * @return NULL pointer.
299  */
302 
303 /** Set the ambiguous residue (e.g, 'N', 'X') in the BlastScoreBlk*.
304  * Convert from ncbieaa to sbp->alphabet_code (i.e., ncbistdaa) first.
305  *
306  * @param sbp the object to be modified [in|out]
307  * @param ambiguous_res the residue to be set on the BlastScoreBlk
308  * @return zero on success, others on error
309  */
311 Int2 BLAST_ScoreSetAmbigRes (BlastScoreBlk* sbp, char ambiguous_res);
312 
313 
314 /** Calculate and fill the ungapped Karlin-Altschul parameters in the
315  * BlastScoreBlk structure (fields kbp_std, kbp_psi, and kbp of that structure).
316  * @param program BLAST program type, needed to decide whether to substitute
317  * ideal values. [in]
318  * @param sbp Scoring block to work with [in] [out]
319  * @param query Buffer containing (concatenated) query sequence [in]
320  * @param query_info Information about offsets of concatenated queries [in]
321  * @param blast_message returns queries that could not be processed [out]
322  * @return 0 if ungapped Karlin-Altschul parameters could be calculated for
323  * all of the query sequence's contexts; 1 if any of the contexts
324  * failed (but all others will be populated).
325  */
327 Int2
329  BlastScoreBlk* sbp, Uint1* query,
330  const BlastQueryInfo* query_info,
331  Blast_Message* *blast_message);
332 
333 /** This function fills in the BlastScoreBlk structure.
334  * Tasks are:
335  * -read in the matrix
336  * -set maxscore
337  * @param sbp Scoring block [in] [out]
338  * @param get_path pointer to function that will return path to matrix. Only called
339  * if built-in matrix not found [in]
340 */
343 
344 /** Callocs a Blast_KarlinBlk
345  * @return pointer to the Blast_KarlinBlk
346 */
349 
350 /** Copies contents of one Karlin block to another. Both must be allocated
351  * before this function is called.
352  * @param kbp_to Karlin block to copy values to [in] [out]
353  * @param kbp_from Karlin block to copy values from [in]
354  * @return 0 on success; -1 if either argument is NULL on input.
355  */
358 
359 
360 /** Deallocates the KarlinBlk
361  * @param kbp KarlinBlk to be deallocated [in]
362  * @return NULL
363 */
366 
367 /** Fills in lambda, H, and K values, as calculated by Stephen Altschul
368  * in Methods in Enzy. (vol 266, page 474).
369  * @param kbp object to be filled in [in|out]
370  * @param gap_open cost of gap existence [in]
371  * @param gap_extend cost to extend a gap one letter [in]
372  * @param matrix_name name of the matrix to be used [in]
373  * @param error_return filled in with error message if needed [out]
374  * @return zero on success
375  */
378  Int4 gap_extend, const char* matrix_name, Blast_Message** error_return);
379 
380 /** Retrieves Karlin-Altschul parameters from precomputed tables, given the
381  * substitution and gap scores. Gap cost values greater than any of those
382  * listed in the tables ("greater" meaning that both values are greater than or
383  * equal, and at least one is strictly greater), are treated as infinite, and
384  * parameters values are copied from the ungapped Karlin block.
385  * @param kbp Allocated Karlin block to fill [in] [out]
386  * @param gap_open Gap openening (existence) cost [in]
387  * @param gap_extend Gap extension cost [in]
388  * @param reward Match reward score [in]
389  * @param penalty Mismatch penalty score [in]
390  * @param kbp_ungap Karlin block with ungapped Karlin-Altschul parameters [in]
391  * @param round_down specifies that the score should be rounded down to nearest even
392  * score in some cases [in|out]
393  * @param error_return Pointer to error message. [in] [out]
394  */
396 Int2
398  Int4 gap_extend, Int4 reward, Int4 penalty,
399  Blast_KarlinBlk* kbp_ungap,
400  Boolean* round_down,
401  Blast_Message** error_return);
402 
403 
404 /** Calculates the Karlin-Altschul parameters assuming standard residue
405  * compositions for the query and subject sequences. It populates the kbp_ideal
406  * field of its sbp argument. This is used if the query is translated and the
407  * calculated (real) Karlin parameters are bad, as they're calculated for
408  * non-coding regions.
409  * @param sbp ScoreBlk used to calculate "ideal" values. [in|out]
410  * @return 0 on success, 1 on failure
411 */
414 
415 /** Attempts to fill KarlinBlk for given gap opening, extensions etc.
416  *
417  * @param kbp object to be filled in [in|out]
418  * @param gap_open gap existence cost [in]
419  * @param gap_extend gap extension cost [in]
420  * @param matrix_name name of the matrix used [in]
421  * @param standard_only If true, include only the standard blosum and pam matrices [in]
422  * @return -1 if matrix_name is NULL;
423  * 1 if matrix not found
424  * 2 if matrix found, but open, extend etc. values not supported.
425 */
427 Int2 Blast_KarlinBlkGappedLoadFromTables(Blast_KarlinBlk* kbp, Int4 gap_open, Int4 gap_extend, const char* matrix_name, Boolean standard_only);
428 
429 /** Fills in gumbel parameters to estimate p-value using FSC
430  * @param gbp object to be filled in [in|out]
431  * @param gap_open cost of gap existence [in]
432  * @param gap_extend cost to extend a gap one letter [in]
433  * @param matrix_name name of the matrix to be used [in]
434  * @param error_return filled in with error message if needed [out]
435  * @return zero on success
436  */
439  Int4 gap_extend, const char* matrix_name, Blast_Message** error_return);
440 
441 /** Attempts to fill GumbelBlk for given gap opening, extensions etc.
442  *
443  * @param gbp object to be filled in [in|out]
444  * @param gap_open gap existence cost [in]
445  * @param gap_extend gap extension cost [in]
446  * @param matrix_name name of the matrix used [in]
447  * @return -1 if matrix_name is NULL;
448  * 1 if matrix not found
449  * 2 if matrix found, but open, extend etc. values not supported.
450 */
453  Int4 gap_extend, const char* matrix_name);
454 
455 /** Prints a messages about the allowed matrices, BlastKarlinBlkGappedFill should return 1 before this is called.
456  * @param matrix the matrix to print a message about [in]
457  * @param standard_only, If true, include only the standad blosum and pam
458  * matrices [in]
459  * @return the message
460  */
462 char* BLAST_PrintMatrixMessage(const char *matrix, Boolean standard_only);
463 
464 /** Prints a messages about the allowed open etc values for the given matrix,
465  * BlastKarlinBlkGappedFill should return 2 before this is called.
466  * @param matrix name of the matrix [in]
467  * @param gap_open gap existence cost [in]
468  * @param gap_extend cost to extend a gap by one [in]
469  * @return message
470  */
472 char* BLAST_PrintAllowedValues(const char *matrix, Int4 gap_open, Int4 gap_extend);
473 
474 /** Calculates the parameter Lambda given an initial guess for its value */
476 double
477 Blast_KarlinLambdaNR(Blast_ScoreFreq* sfp, double initialLambdaGuess);
478 
479 /** Calculates the Expect value based upon the search space and some Karlin-Altschul
480  * parameters. It is "simple" as it does not use sum-statistics.
481  * @param S the score of the alignment. [in]
482  * @param kbp the Karlin-Altschul parameters. [in]
483  * @param searchsp total search space to be used [in]
484  * @return the expect value
485  */
487 double BLAST_KarlinStoE_simple (Int4 S, Blast_KarlinBlk* kbp, Int8 searchsp);
488 
489 /** Calculates the Expect value based upon the Spouge's FSC method.
490  * @param S the score of the alignment. [in]
491  * @param gbp the Gumbel parameters. [in]
492  * @param qlen the query length. [in]
493  * @param slen the subject length. [in]
494  * @return the expect value
495  */
497 double BLAST_SpougeStoE (Int4 S, Blast_KarlinBlk* kbp, Blast_GumbelBlk* gbp, Int4 qlen, Int4 slen);
498 
499 /** Estimate the score for a specified expect value.
500  * @param E the expect value of the alignment. [in]
501  * @param gbp the Gumbel parameters. [in]
502  * @param qlen the query length. [in]
503  * @param slen the subject length. [in]
504  * @return the score
505  */
507 Int4 BLAST_SpougeEtoS (double E, Blast_KarlinBlk* kbp, Blast_GumbelBlk* gbp, Int4 qlen, Int4 slen);
508 
509 /** Convert a P-value to an E-value.
510  *
511  * P-values and E-values may either represent statistics of a database
512  * search or represent statistics on the two sequences being compared.
513  * If given a database P-value, this routine will return a database
514  * E-value; if given a pairwise P-value, it will return a pairwise
515  * E-value.
516  *
517  * In the context of a database search, the available P-value is often
518  * a pairwise P-value, whereas the desired E-value is a database
519  * E-value. When this it the case, the value returned by this routine
520  * should be multiplied by the effective length of the database and
521  * divided by the effective length of the subject.
522  *
523  * @param p the P-value to be converted [in] @return the corresponding
524  * expect value.
525  */
526 NCBI_XBLAST_EXPORT double BLAST_KarlinPtoE(double p);
527 
528 /** Convert an E-value to a P-value.
529  *
530  * E-values and P-values may either represent statistics of a database
531  * search or represent statistics on the two sequences being compared.
532  * If given a database E-value, this routine will return a database
533  * P-value; if given a pairwise E-value, it will return a pairwise
534  * P-value.
535  *
536  * In the context of a database search, the available E-value is
537  * typically a database E-value, whereas the desired P-value is a
538  * pairwise P-value. When this is the case, the E-value should be
539  * divided by the effective length of the database and multiplied by
540  * the effective length of the subject, before BLAST_KarlinEtoP is
541  * called.
542  *
543  * @param x the expect value to be converted [in]
544  * @return the corresponding p-value.
545  */
547 double BLAST_KarlinEtoP(double x);
548 
549 /** Compute a divisor used to weight the evalue of a collection of
550  * "nsegs" distinct alignments. These divisors are used to compensate
551  * for the effect of choosing the best among multiple collections of
552  * alignments. See
553  *
554  * Stephen F. Altschul. Evaluating the statitical significance of
555  * multiple distinct local alignments. In Suhai, editior, Theoretical
556  * and Computational Methods in Genome Research, pages 1-14. Plenum
557  * Press, New York, 1997.
558  *
559  * The "decayrate" parameter of this routine is a value in the
560  * interval (0,1). Typical values of decayrate are .1 and .5.
561  * @param decayrate adjusts for (multiple) tests of number of HSP sum groups [in]
562  * @param nsegs the number of HSPs in the sum group [in]
563  * @return divisor used to compensate for multiple tests
564  */
566 double BLAST_GapDecayDivisor(double decayrate, unsigned nsegs );
567 
568 /** Calculate the cutoff score from the expected number of HSPs or vice versa.
569  * @param S The calculated score [in] [out]
570  * @param E The calculated e-value [in] [out]
571  * @param kbp The Karlin-Altschul statistical parameters [in]
572  * @param searchsp The effective search space [in]
573  * @param dodecay Use gap decay feature? [in]
574  * @param gap_decay_rate Gap decay rate to use, if dodecay is set [in]
575  */
577 Int2 BLAST_Cutoffs (Int4 *S, double* E, Blast_KarlinBlk* kbp,
578  Int8 searchsp, Boolean dodecay, double gap_decay_rate);
579 
580 /** Calculates the e-value for alignments with "small" gaps (typically
581  * under fifty residues/basepairs) following ideas of Stephen Altschul's.
582  * @param start_points the number of starting points permitted between
583  * adjacent alignments; max_overlap + max_gap + 1 [in]
584  * @param num the number of distinct alignments in this collection [in]
585  * @param xsum the sum of the scores of these alignments each individually
586  * normalized using an appropriate value of Lambda and logK [in]
587  * @param query_length effective len of the query seq [in]
588  * @param subject_length effective len of the subject seq [in]
589  * @param searchsp_eff effective size of the search space [in]
590  * @param weight_divisor a divisor used to weight the e-value
591  * when multiple collections of alignments are being considered by
592  * the calling routine [in]
593  * @return the expect value
594  */
596 double BLAST_SmallGapSumE (Int4 start_points, Int2 num, double xsum,
597  Int4 query_length, Int4 subject_length,
598  Int8 searchsp_eff, double weight_divisor);
599 
600 /** Calculates the e-value of a collection multiple distinct
601  * alignments with asymmetric gaps between the alignments. The gaps
602  * in one (protein) sequence are typically small (like in
603  * BLAST_SmallGapSumE) gap an the gaps in the other (translated DNA)
604  * sequence are possibly large (up to 4000 bp.) This routine is used
605  * for linking HSPs representing exons in the DNA sequence that are
606  * separated by introns.
607  * @param query_start_points the number of starting points in
608  * the query sequence permitted between adjacent alignments;
609  * max_overlap + max_gap + 1. [in]
610  * @param subject_start_points the number of starting points in
611  * the subject sequence permitted between adjacent alignments [in]
612  * @param num number of distinct alignments in one collection [in]
613  * @param xsum the sum of the scores of these alignments each individually
614  * normalized using an appropriate value of Lambda and logK [in]
615  * @param query_length effective length of query [in]
616  * @param subject_length effective length of subject [in]
617  * @param searchsp_eff effective size of the search space [in]
618  * @param weight_divisor a divisor used to weight the e-value
619  * when multiple collections of alignments are being considered by
620  * the calling routine [in]
621  * @return sum expect value.
622  */
624 double BLAST_UnevenGapSumE (Int4 query_start_points, Int4 subject_start_points,
625  Int2 num, double xsum,
626  Int4 query_length, Int4 subject_length,
627  Int8 searchsp_eff, double weight_divisor);
628 
629 /** Calculates the e-value if a collection of distinct alignments with
630  * arbitrarily large gaps between the alignments
631  * @param num number of distinct alignments in the collection [in]
632  * @param xsum the sum of the scores of these alignments each individually
633  * normalized using an appropriate value of Lambda and logK [in]
634  * @param query_length effective length of query sequence [in]
635  * @param subject_length effective length of subject sequence [in]
636  * @param searchsp_eff effective size of the search space [in]
637  * @param weight_divisor a divisor used to weight the e-value
638  * when multiple collections of alignments are being considered by the
639  * calling routine [in]
640  * @return sum expect value.
641  */
643 double BLAST_LargeGapSumE (Int2 num, double xsum,
644  Int4 query_length, Int4 subject_length,
645  Int8 searchsp_eff, double weight_divisor );
646 
647 /** Extract the recommended gap existence and extension values.
648  * Only to be used with protein matrices.
649  * @param matrixName name of the matrix [in]
650  * @param gap_existence returns recommended existence cost [in|out]
651  * @param gap_extension returns recommended extension cost [in|out]
652  * @return zero on success
653  */
655 Int2 BLAST_GetProteinGapExistenceExtendParams(const char* matrixName,
656  Int4* gap_existence,
657  Int4* gap_extension);
658 
659 /** Extract the recommended gap existence and extension values.
660  * Only to be used with blastn searches.
661  * @param reward match score [in]
662  * @param penalty mismatch score [in]
663  * @param gap_existence returns recommended existence cost [in|out]
664  * @param gap_extension returns recommended extension cost [in|out]
665  * @return zero on success
666  */
669  Int4 penalty,
670  Int4* gap_existence,
671  Int4* gap_extension);
672 
673 /** Check the validity of the reward and penalty scores.
674  * Only to be used with blastn searches.
675  * @param reward match score [in]
676  * @param penalty mismatch score [in]
677  * @return TRUE on success
678  */
681 
682 /** Extract the alpha and beta settings for this matrixName, and these
683  * gap open and gap extension costs
684  * @param matrixName name of the matrix used [in]
685  * @param alpha Karlin-Altschul parameter to be set [out]
686  * @param beta Karlin-Altschul parameter to be set [out]
687  * @param gapped TRUE if a gapped search [in]
688  * @param gap_open existence cost of a gap [in]
689  * @param gap_extend extension cost of a gap [in]
690  * @param kbp_ungapped Karlin block with ungapped values of the parameters [in]
691 */
693 void BLAST_GetAlphaBeta (const char* matrixName, double *alpha,
694  double *beta, Boolean gapped, Int4 gap_open,
695  Int4 gap_extend, const Blast_KarlinBlk* kbp_ungapped);
696 
697 /** Extract the alpha and beta settings for these substitution and gap scores.
698  * If substitution or gap costs are not found in the tables, assume an ungapped
699  * search. Then alpha is computed using the formula Alpha = Lambda/H, and beta
700  * is equal to 0 except for some special cases.
701  * @param reward Match reward score [in]
702  * @param penalty Mismatch penalty score [in]
703  * @param gap_open Gap opening (existence) cost [in]
704  * @param gap_extend Gap extension cost [in]
705  * @param kbp Karlin block containing already computed Lambda, K and H
706  * parameters.
707  * @param gapped_calculation Is this a gapped search? [in]
708  * @param alpha Alpha parameter for this scoring system [out]
709  * @param beta Beta parameter for this scoring system [out]
710  */
712 Int2 Blast_GetNuclAlphaBeta(Int4 reward, Int4 penalty, Int4 gap_open,
713  Int4 gap_extend, Blast_KarlinBlk* kbp,
714  Boolean gapped_calculation,
715  double *alpha, double *beta);
716 
717 /** Rescale the PSSM, using composition-based statistics, for use
718  * with RPS BLAST. This function produces a PSSM for a single RPS DB
719  * sequence (of size db_seq_length) and incorporates information from
720  * the RPS blast query. Each individual database sequence must call this
721  * function to retrieve its own PSSM. The matrix is returned (and must
722  * be freed elsewhere). posMatrix is the portion of the complete
723  * concatenated PSSM that is specific to this DB sequence
724  * @todo revise to use existing code
725  * @param scalingFactor used to rescale Lambda [in]
726  * @param rps_query_length length of query sequence [in]
727  * @param rps_query_seq the query sequence [in]
728  * @param db_seq_length Length of the database sequence [in]
729  * @param posMatrix matrix (actual) values to be used [in]
730  * @param sbp Structure with score matrix parameters [in][out]
731  * @return rescaled pssm
732  */
734 Int4 ** RPSRescalePssm(double scalingFactor, Int4 rps_query_length,
735  const Uint1 * rps_query_seq, Int4 db_seq_length,
736  Int4 **posMatrix, BlastScoreBlk *sbp);
737 
738 
739 /**
740  * Computes the adjustment to the lengths of the query and database sequences
741  * that is used to compensate for edge effects when computing evalues.
742  *
743  * The length adjustment is an integer-valued approximation to the fixed
744  * point of the function
745  *
746  * f(ell) = beta +
747  * (alpha/lambda) * (log K + log((m - ell)*(n - N ell)))
748  *
749  * where m is the query length n is the length of the database and N is the
750  * number of sequences in the database. The values beta, alpha, lambda and
751  * K are statistical, Karlin-Altschul parameters.
752  *
753  * The value of the length adjustment computed by this routine, A,
754  * will always be an integer smaller than the fixed point of
755  * f(ell). Usually, it will be the largest such integer. However, the
756  * computed length adjustment, A, will also be so small that
757  *
758  * K * (m - A) * (n - N * A) > MAX(m,n).
759  *
760  * Moreover, an iterative method is used to compute A, and under
761  * unusual circumstances the iterative method may not converge.
762  *
763  * @param K the statistical parameter K [in]
764  * @param logK the natural logarithm of K [in]
765  * @param alpha_d_lambda the ratio of the statistical parameters
766  * alpha and lambda (for ungapped alignments, the
767  * value 1/H should be used) [in]
768  * @param beta the statistical parameter beta (for ungapped
769  * alignments, beta == 0) [in]
770  * @param query_length the length of the query sequence [in]
771  * @param db_length the length of the database [in]
772  * @param db_num_seqs the number of sequences in the database [in]
773  * @param length_adjustment the computed value of the length adjustment [out]
774  *
775  * @return 0 if length_adjustment is known to be the largest integer less
776  * than the fixed point of f(ell); 1 otherwise.
777  */
779 Int4
781  double logK,
782  double alpha_d_lambda,
783  double beta,
784  Int4 query_length,
785  Int8 db_length,
786  Int4 db_num_seqs,
787  Int4 * length_adjustment);
788 
789 
790 /** Allocates a new Blast_ResFreq structure and fills in the prob element
791  based upon the contents of sbp.
792  * @param sbp The BlastScoreBlk* used to init prob [in]
793 */
796 
797 /** Deallocates Blast_ResFreq and prob0 element.
798  * @param rfp the Blast_ResFreq to be deallocated.
799 */
802 
803 
804 /** Calculates residues frequencies given a standard distribution.
805  * @param sbp the BlastScoreBlk provides information on alphabet.
806  * @param rfp the prob element on this Blast_ResFreq is used.
807  * @return zero on success
808 */
811 
812 /** Creates a new structure to keep track of score frequencies for a scoring
813  * system.
814  * @param score_min Minimum score [in]
815  * @param score_max Maximum score [in]
816  * @return allocated and initialized pointer to Blast_ScoreFreq
817  */
820 Blast_ScoreFreqNew(Int4 score_min, Int4 score_max);
821 
822 /** Deallocates the score frequencies structure
823  * @param sfp the structure to deallocate [in]
824  * @return NULL
825  */
829 
830 /** Fills a buffer with the 'standard' alphabet
831  * (given by STD_AMINO_ACID_FREQS[index].ch).
832  *
833  * @param alphabet_code specifies alphabet [in]
834  * @param residues buffer to be filled in [in|out]
835  * @param residue_size size of "residues" buffer [in]
836  * @return Number of residues in alphabet or negative returns upon error.
837  */
839 Int2
840 Blast_GetStdAlphabet(Uint1 alphabet_code, Uint1* residues,
841  Uint4 residue_size);
842 
843 /** Computes the parameters lambda, H K for use in calculating the
844  * statistical significance of high-scoring segments or subalignments (see
845  * comment on blast_stat.c for more details).
846  * @param kbp object containing Lambda, H, and K as well as scoring information [in|out]
847  * @param sfp array of probabilities for all scores [in]
848  * @return zero on success, 1 on error.
849  */
851 Int2
853 
854 /** Given a sequence of 'length' amino acid residues, compute the
855  * probability of each residue and put that in the array resProb
856  * Excludes ambiguity characters.
857  *
858  * @param sequence the sequence to be computed upon [in]
859  * @param length the length of the sequence [in]
860  * @param resProb the object to be filled in [in|out]
861  */
863 void
864 Blast_FillResidueProbability(const Uint1* sequence, Int4 length, double * resProb);
865 
866 /** Fill in the matrix for blastn using the penaly and rewards
867  * The query sequence alphabet is blastna, the subject sequence
868  * is ncbi2na. The alphabet blastna is defined in blast_stat.h
869  * and the first four elements of blastna are identical to ncbi2na.
870  * if sbp->matrix==NULL, it is allocated.
871  * @param sbp the BlastScoreBlk on which reward, penalty, and matrix will be set
872  [in|out]
873  * @return zero on success.
874 */
877 
878 /** Returns a pointer to the static compiled in version of the
879  * matrix. If name is NULL or the matrix is not compiled in
880  * NULL is returned.
881  * @param name matrix name [in]
882  * @return pointer to matrix or NULL if not supported.
883  */
886 
887 #ifdef __cplusplus
888 }
889 #endif
890 #endif /* !ALGO_BLAST_CORE__BLAST_STAT__H */
Defines to provide correct exporting from BLAST DLL in Windows.
#define NCBI_XBLAST_EXPORT
NULL operations for other cases.
Definition: blast_export.h:65
Structures for BLAST messages.
Definitions for various programs supported by core BLAST.
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
Definitions and functions associated with the BlastQueryInfo structure.
char *(* GET_MATRIX_PATH)(const char *, Boolean)
callback to resolve the path to blast score matrices
Definition: blast_stat.h:61
BlastScoreBlk * BlastScoreBlkFree(BlastScoreBlk *sbp)
Deallocates BlastScoreBlk as well as all associated structures.
Definition: blast_stat.c:965
Int2 BLAST_GetProteinGapExistenceExtendParams(const char *matrixName, Int4 *gap_existence, Int4 *gap_extension)
Extract the recommended gap existence and extension values.
Definition: blast_stat.c:3373
double BLAST_GapDecayDivisor(double decayrate, unsigned nsegs)
Compute a divisor used to weight the evalue of a collection of "nsegs" distinct alignments.
Definition: blast_stat.c:4062
char * BLAST_PrintAllowedValues(const char *matrix, Int4 gap_open, Int4 gap_extend)
Prints a messages about the allowed open etc values for the given matrix, BlastKarlinBlkGappedFill sh...
Definition: blast_stat.c:3786
Int2 Blast_GetStdAlphabet(Uint1 alphabet_code, Uint1 *residues, Uint4 residue_size)
Fills a buffer with the 'standard' alphabet (given by STD_AMINO_ACID_FREQS[index]....
Definition: blast_stat.c:1862
double BLAST_LargeGapSumE(Int2 num, double xsum, Int4 query_length, Int4 subject_length, Int8 searchsp_eff, double weight_divisor)
Calculates the e-value if a collection of distinct alignments with arbitrarily large gaps between the...
Definition: blast_stat.c:4515
struct Blast_ResFreq Blast_ResFreq
Stores the letter frequency of a sequence or database.
struct Blast_KarlinBlk Blast_KarlinBlk
Structure to hold the Karlin-Altschul parameters.
Int2 Blast_GetNuclAlphaBeta(Int4 reward, Int4 penalty, Int4 gap_open, Int4 gap_extend, Blast_KarlinBlk *kbp, Boolean gapped_calculation, double *alpha, double *beta)
Extract the alpha and beta settings for these substitution and gap scores.
Definition: blast_stat.c:3948
void Blast_FillResidueProbability(const Uint1 *sequence, Int4 length, double *resProb)
Given a sequence of 'length' amino acid residues, compute the probability of each residue and put tha...
Definition: blast_stat.c:4560
double Blast_KarlinLambdaNR(Blast_ScoreFreq *sfp, double initialLambdaGuess)
Calculates the parameter Lambda given an initial guess for its value.
Definition: blast_stat.c:2566
Int4 ** RPSRescalePssm(double scalingFactor, Int4 rps_query_length, const Uint1 *rps_query_seq, Int4 db_seq_length, Int4 **posMatrix, BlastScoreBlk *sbp)
Rescale the PSSM, using composition-based statistics, for use with RPS BLAST.
Definition: blast_stat.c:4676
struct SPsiBlastScoreMatrix SPsiBlastScoreMatrix
Scoring matrix data used in PSI-BLAST.
Blast_ResFreq * Blast_ResFreqFree(Blast_ResFreq *rfp)
Deallocates Blast_ResFreq and prob0 element.
Definition: blast_stat.c:1688
SCompressedAlphabet * SCompressedAlphabetFree(SCompressedAlphabet *alphabet)
Free a compressed alphabet and score matrix.
Definition: blast_stat.c:4974
double BLAST_KarlinEtoP(double x)
Convert an E-value to a P-value.
Definition: blast_stat.c:4172
struct Blast_GumbelBlk Blast_GumbelBlk
Structure to hold the Gumbel parameters (for FSC).
Int2 BlastScoreBlkNuclMatrixCreate(BlastScoreBlk *sbp)
Fill in the matrix for blastn using the penaly and rewards The query sequence alphabet is blastna,...
Definition: blast_stat.c:1060
Int2 Blast_KarlinBlkGappedCalc(Blast_KarlinBlk *kbp, Int4 gap_open, Int4 gap_extend, const char *matrix_name, Blast_Message **error_return)
Fills in lambda, H, and K values, as calculated by Stephen Altschul in Methods in Enzy.
Definition: blast_stat.c:3526
SBlastScoreMatrix * SBlastScoreMatrixFree(SBlastScoreMatrix *matrix)
Deallocates SBlastScoreMatrix structure.
Definition: blast_stat.c:732
Blast_KarlinBlk * Blast_KarlinBlkNew(void)
Callocs a Blast_KarlinBlk.
Definition: blast_stat.c:2860
struct BlastScoreBlk BlastScoreBlk
Structure used for scoring calculations.
Blast_KarlinBlk * Blast_KarlinBlkFree(Blast_KarlinBlk *kbp)
Deallocates the KarlinBlk.
Definition: blast_stat.c:956
Int2 BLAST_ScoreSetAmbigRes(BlastScoreBlk *sbp, char ambiguous_res)
Set the ambiguous residue (e.g, 'N', 'X') in the BlastScoreBlk*.
Definition: blast_stat.c:1012
Int2 Blast_KarlinBlkUngappedCalc(Blast_KarlinBlk *kbp, Blast_ScoreFreq *sfp)
Computes the parameters lambda, H K for use in calculating the statistical significance of high-scori...
Definition: blast_stat.c:2698
double BLAST_KarlinPtoE(double p)
Convert a P-value to an E-value.
Definition: blast_stat.c:4158
SNCBIPackedScoreMatrix * BlastScoreBlkGetCompiledInMatrix(const char *name)
Returns a pointer to the static compiled in version of the matrix.
SCompressedAlphabet * SCompressedAlphabetNew(BlastScoreBlk *sbp, Int4 compressed_alphabet_size, double scale_factor)
Allocate a new compressed alphabet and score matrix.
Definition: blast_stat.c:4938
Int2 Blast_ScoreBlkKbpUngappedCalc(EBlastProgramType program, BlastScoreBlk *sbp, Uint1 *query, const BlastQueryInfo *query_info, Blast_Message **blast_message)
Calculate and fill the ungapped Karlin-Altschul parameters in the BlastScoreBlk structure (fields kbp...
Definition: blast_stat.c:2736
Int2 Blast_ResFreqStdComp(const BlastScoreBlk *sbp, Blast_ResFreq *rfp)
Calculates residues frequencies given a standard distribution.
Definition: blast_stat.c:1886
double BLAST_SpougeStoE(Int4 S, Blast_KarlinBlk *kbp, Blast_GumbelBlk *gbp, Int4 qlen, Int4 slen)
Calculates the Expect value based upon the Spouge's FSC method.
Definition: blast_stat.c:5159
Int2 Blast_KarlinBlkGappedLoadFromTables(Blast_KarlinBlk *kbp, Int4 gap_open, Int4 gap_extend, const char *matrix_name, Boolean standard_only)
Attempts to fill KarlinBlk for given gap opening, extensions etc.
Definition: blast_stat.c:3576
char * BLAST_PrintMatrixMessage(const char *matrix, Boolean standard_only)
Prints a messages about the allowed matrices, BlastKarlinBlkGappedFill should return 1 before this is...
Definition: blast_stat.c:3759
Blast_ScoreFreq * Blast_ScoreFreqFree(Blast_ScoreFreq *sfp)
Deallocates the score frequencies structure.
Definition: blast_stat.c:941
Int2 Blast_GumbelBlkCalc(Blast_GumbelBlk *gbp, Int4 gap_open, Int4 gap_extend, const char *matrix_name, Blast_Message **error_return)
Fills in gumbel parameters to estimate p-value using FSC.
Definition: blast_stat.c:3651
SBlastScoreMatrix * SBlastScoreMatrixNew(size_t ncols, size_t nrows)
Allocates a new SBlastScoreMatrix structure of the specified dimensions.
Definition: blast_stat.c:760
Int2 Blast_KarlinBlkNuclGappedCalc(Blast_KarlinBlk *kbp, Int4 gap_open, Int4 gap_extend, Int4 reward, Int4 penalty, Blast_KarlinBlk *kbp_ungap, Boolean *round_down, Blast_Message **error_return)
Retrieves Karlin-Altschul parameters from precomputed tables, given the substitution and gap scores.
Definition: blast_stat.c:3835
Blast_ResFreq * Blast_ResFreqNew(const BlastScoreBlk *sbp)
Allocates a new Blast_ResFreq structure and fills in the prob element based upon the contents of sbp.
Definition: blast_stat.c:1707
double BLAST_KarlinStoE_simple(Int4 S, Blast_KarlinBlk *kbp, Int8 searchsp)
Calculates the Expect value based upon the search space and some Karlin-Altschul parameters.
Definition: blast_stat.c:4140
Int4 BLAST_SpougeEtoS(double E, Blast_KarlinBlk *kbp, Blast_GumbelBlk *gbp, Int4 qlen, Int4 slen)
Estimate the score for a specified expect value.
Definition: blast_stat.c:5219
struct SCompressedAlphabet SCompressedAlphabet
Scoring matrix data used for compressed protein alphabets.
int BlastScoreBlkCheck(BlastScoreBlk *sbp)
Check that score blk is valid, returns zero if it is.
Definition: blast_stat.c:853
struct erfc_table erfc_table
Tabulated results for faster erfc(x) lookup.
Int2 Blast_ScoreBlkKbpIdealCalc(BlastScoreBlk *sbp)
Calculates the Karlin-Altschul parameters assuming standard residue compositions for the query and su...
Definition: blast_stat.c:2831
Boolean BLAST_CheckRewardPenaltyScores(Int4 reward, Int4 penalty)
Check the validity of the reward and penalty scores.
Definition: blast_stat.c:3453
double BLAST_UnevenGapSumE(Int4 query_start_points, Int4 subject_start_points, Int2 num, double xsum, Int4 query_length, Int4 subject_length, Int8 searchsp_eff, double weight_divisor)
Calculates the e-value of a collection multiple distinct alignments with asymmetric gaps between the ...
Definition: blast_stat.c:4474
void BLAST_GetAlphaBeta(const char *matrixName, double *alpha, double *beta, Boolean gapped, Int4 gap_open, Int4 gap_extend, const Blast_KarlinBlk *kbp_ungapped)
Extract the alpha and beta settings for this matrixName, and these gap open and gap extension costs.
Definition: blast_stat.c:3093
Blast_ScoreFreq * Blast_ScoreFreqNew(Int4 score_min, Int4 score_max)
Creates a new structure to keep track of score frequencies for a scoring system.
Definition: blast_stat.c:2112
struct SBlastScoreMatrix SBlastScoreMatrix
Scoring matrix used in BLAST.
Int2 BLAST_Cutoffs(Int4 *S, double *E, Blast_KarlinBlk *kbp, Int8 searchsp, Boolean dodecay, double gap_decay_rate)
Calculate the cutoff score from the expected number of HSPs or vice versa.
Definition: blast_stat.c:4073
SPsiBlastScoreMatrix * SPsiBlastScoreMatrixNew(size_t ncols)
Allocates a new SPsiBlastScoreMatrix structure of dimensions ncols by BLASTAA_SIZE.
Definition: blast_stat.c:805
Int2 Blast_KarlinBlkCopy(Blast_KarlinBlk *kbp_to, Blast_KarlinBlk *kbp_from)
Copies contents of one Karlin block to another.
Definition: blast_stat.c:2870
Int2 BLAST_GetNucleotideGapExistenceExtendParams(Int4 reward, Int4 penalty, Int4 *gap_existence, Int4 *gap_extension)
Extract the recommended gap existence and extension values.
Definition: blast_stat.c:3401
Int2 Blast_GumbelBlkLoadFromTables(Blast_GumbelBlk *gbp, Int4 gap_open, Int4 gap_extend, const char *matrix_name)
Attempts to fill GumbelBlk for given gap opening, extensions etc.
Definition: blast_stat.c:3695
Int4 BLAST_ComputeLengthAdjustment(double K, double logK, double alpha_d_lambda, double beta, Int4 query_length, Int8 db_length, Int4 db_num_seqs, Int4 *length_adjustment)
Computes the adjustment to the lengths of the query and database sequences that is used to compensate...
Definition: blast_stat.c:5024
BlastScoreBlk * BlastScoreBlkNew(Uint1 alphabet, Int4 number_of_contexts)
Allocates and initializes BlastScoreBlk.
Definition: blast_stat.c:884
struct Blast_ScoreFreq Blast_ScoreFreq
Holds score frequencies used in calculation of Karlin-Altschul parameters for an ungapped search.
SPsiBlastScoreMatrix * SPsiBlastScoreMatrixFree(SPsiBlastScoreMatrix *matrix)
Deallocates a SPsiBlastScoreMatrix structure.
Definition: blast_stat.c:786
Int2 Blast_ScoreBlkMatrixFill(BlastScoreBlk *sbp, GET_MATRIX_PATH get_path)
This function fills in the BlastScoreBlk structure.
Definition: blast_stat.c:1599
double BLAST_SmallGapSumE(Int4 start_points, Int2 num, double xsum, Int4 query_length, Int4 subject_length, Int8 searchsp_eff, double weight_divisor)
Calculates the e-value for alignments with "small" gaps (typically under fifty residues/basepairs) fo...
Definition: blast_stat.c:4401
#define Boolean
Definition: ncbistd.hpp:136
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define S(x, n)
Type and macro definitions from C toolkit that are not defined in C++ toolkit.
Uint1 Boolean
bool replacment for C
Definition: ncbi_std.h:94
const double E
#define K
The query related information.
Structure used for scoring calculations.
Definition: blast_stat.h:177
Boolean protein_alphabet
TRUE if alphabet_code is for a protein alphabet (e.g., ncbistdaa etc.), FALSE for nt.
Definition: blast_stat.h:178
Blast_KarlinBlk ** kbp
Karlin-Altschul parameters.
Definition: blast_stat.h:207
Int4 loscore
Min.
Definition: blast_stat.h:197
Blast_KarlinBlk ** kbp_psi
K-A parameters for position-based alignments.
Definition: blast_stat.h:213
Blast_ScoreFreq ** sfp
score frequencies for scoring matrix.
Definition: blast_stat.h:205
Boolean matrix_only_scoring
Score ungapped/gapped alignment only using the matrix parameters and with raw scores.
Definition: blast_stat.h:189
double scale_factor
multiplier for all cutoff and dropoff scores
Definition: blast_stat.h:201
Int2 ambig_occupy
How many occupied?
Definition: blast_stat.h:220
Blast_KarlinBlk ** kbp_gap
K-A parameters for gapped alignments.
Definition: blast_stat.h:208
Boolean round_down
Score must be rounded down to nearest even score if odd.
Definition: blast_stat.h:221
Int2 ambig_size
size of array above.
Definition: blast_stat.h:219
char * name
name of scoring matrix.
Definition: blast_stat.h:183
SPsiBlastScoreMatrix * psi_matrix
PSSM and associated data.
Definition: blast_stat.h:186
Int2 alphabet_start
numerical value of 1st letter.
Definition: blast_stat.h:182
Int2 alphabet_size
size of alphabet.
Definition: blast_stat.h:181
Uint1 alphabet_code
NCBI alphabet code.
Definition: blast_stat.h:180
Int4 penalty
penalty for mismatch in blastn.
Definition: blast_stat.h:199
Uint1 * ambiguous_res
Array of ambiguous res.
Definition: blast_stat.h:218
Int4 number_of_contexts
Used by sfp and kbp, how large are these.
Definition: blast_stat.h:217
Boolean read_in_matrix
If TRUE, matrix is read in, otherwise produce one from penalty and reward above.
Definition: blast_stat.h:202
SBlastScoreMatrix * matrix
scoring matrix data
Definition: blast_stat.h:185
Int4 hiscore
Max.
Definition: blast_stat.h:198
ListNode * comments
Comments about scoring matrix.
Definition: blast_stat.h:184
Boolean complexity_adjusted_scoring
Use cross_match-like complexity adjustment on raw scores.
Definition: blast_stat.h:195
Blast_KarlinBlk * kbp_ideal
Ideal values (for query with average database composition).
Definition: blast_stat.h:216
Blast_KarlinBlk ** kbp_gap_std
K-A parameters for std (not position-based) alignments.
Definition: blast_stat.h:214
Blast_KarlinBlk ** kbp_std
K-A parameters for ungapped alignments.
Definition: blast_stat.h:212
Int4 reward
reward for match in blastn.
Definition: blast_stat.h:200
Blast_KarlinBlk ** kbp_gap_psi
K-A parameters for psi alignments.
Definition: blast_stat.h:215
Blast_GumbelBlk * gbp
Gumbel parameters for FSC.
Definition: blast_stat.h:209
Structure to hold the Gumbel parameters (for FSC).
Definition: blast_stat.h:94
Boolean filled
flag indicate the values of gbp are prepared
Definition: blast_stat.h:110
double Tau
2*G*(alpha_un - Sigma)
Definition: blast_stat.h:106
double Alpha_un
Ungapped alpha.
Definition: blast_stat.h:102
double Sigma
cov(L) = sigma y + tau
Definition: blast_stat.h:100
double a
avg(L) = a y + b
Definition: blast_stat.h:98
double b
2*G*(a_un - a)
Definition: blast_stat.h:104
double G
G is the total penalty for extension.
Definition: blast_stat.h:97
double Beta
2*G*(alpha_un - alpha)
Definition: blast_stat.h:105
Int8 db_length
total length of database
Definition: blast_stat.h:108
double a_un
Ungapped a.
Definition: blast_stat.h:101
double Lambda
the unscaled Lambda value
Definition: blast_stat.h:95
double Alpha
var(L) = alpha y + beta
Definition: blast_stat.h:99
Structure to hold the Karlin-Altschul parameters.
Definition: blast_stat.h:66
double paramC
for use in seed.
Definition: blast_stat.h:71
double K
K value used in statistics.
Definition: blast_stat.h:68
double Lambda
Lambda value used in statistics.
Definition: blast_stat.h:67
double H
H value used in statistics.
Definition: blast_stat.h:70
double logK
natural log of K value used in statistics
Definition: blast_stat.h:69
Structure to hold the a message from the core of the BLAST engine.
Definition: blast_message.h:70
Stores the letter frequency of a sequence or database.
Definition: blast_stat.h:273
Uint1 alphabet_code
indicates alphabet.
Definition: blast_stat.h:274
double * prob0
probs, zero offset.
Definition: blast_stat.h:276
double * prob
letter probs, (possible) non-zero offset.
Definition: blast_stat.h:275
Holds score frequencies used in calculation of Karlin-Altschul parameters for an ungapped search.
Definition: blast_stat.h:128
double * sprob0
arrays for frequency of given score
Definition: blast_stat.h:134
double score_avg
average score, must be negative for local alignment.
Definition: blast_stat.h:133
Int4 score_max
highest allowed scores
Definition: blast_stat.h:130
Int4 obs_min
lowest observed (actual) scores
Definition: blast_stat.h:131
double * sprob
arrays for frequency of given score, shifted down by score_min.
Definition: blast_stat.h:135
Int4 score_min
lowest allowed scores
Definition: blast_stat.h:129
Int4 obs_max
highest observed (actual) scores
Definition: blast_stat.h:132
A generic linked list node structure.
Definition: ncbi_std.h:213
Scoring matrix used in BLAST.
Definition: blast_stat.h:139
size_t nrows
number of rows
Definition: blast_stat.h:143
double lambda
derived value of the matrix lambda -RMH-
Definition: blast_stat.h:145
double * freqs
array of assumed matrix background frequencies -RMH-
Definition: blast_stat.h:144
size_t ncols
number of columns
Definition: blast_stat.h:142
int ** data
actual scoring matrix data, stored in row-major form
Definition: blast_stat.h:140
Scoring matrix data used for compressed protein alphabets.
Definition: blast_stat.h:225
Uint1 * compress_table
translation table (AA->compressed)
Definition: blast_stat.h:228
Int4 compressed_alphabet_size
letters in the compressed alphabet
Definition: blast_stat.h:226
SBlastScoreMatrix * matrix
score matrix
Definition: blast_stat.h:227
Scoring matrix data used in PSI-BLAST.
Definition: blast_stat.h:149
SBlastScoreMatrix * pssm
position-specific score matrix
Definition: blast_stat.h:150
double ** freq_ratios
PSSM's frequency ratios, dimensions are specified in pssm data above.
Definition: blast_stat.h:151
Blast_KarlinBlk * kbp
Karlin-Altschul block associated with this PSSM.
Definition: blast_stat.h:153
static string query
Tabulated results for faster erfc(x) lookup.
Definition: blast_stat.h:82
double a
Definition: blast_stat.h:84
double h
Definition: blast_stat.h:87
double * p
Definition: blast_stat.h:88
double b
Definition: blast_stat.h:85
double eps
Definition: blast_stat.h:83
Modified on Fri Jan 05 07:25:59 2024 by modify_doxy.py rev. 669887