NCBI C++ ToolKit
blast_stat.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blast_stat.c 101749 2024-02-06 20:23:45Z ivanov $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Tom Madden
27  *
28  */
29 
30 /** @file blast_stat.c
31  * Functions to calculate BLAST probabilities etc.
32  * Detailed Contents:
33  *
34  * - allocate and deallocate structures used by BLAST to calculate
35  * probabilities etc.
36  *
37  * - calculate residue frequencies for query and "average" database.
38  *
39  * - read in matrix or load it from memory.
40  *
41  * - calculate sum-p from a collection of HSP's, for both the case
42  * of a "small" gap and a "large" gap, when give a total score and the
43  * number of HSP's.
44  *
45  * - calculate expect values for p-values.
46  *
47  * - calculate pseuod-scores from p-values.
48  *
49  */
50 
53 #include "boost_erf.h"
54 #include "blast_psi_priv.h"
55 
56 #define BLAST_SCORE_RANGE_MAX (BLAST_SCORE_MAX - BLAST_SCORE_MIN) /**< maximum allowed range of BLAST scores. */
57 
58 /****************************************************************************
59 For more accuracy in the calculation of K, set K_SUMLIMIT to 0.00001.
60 For high speed in the calculation of K, use a K_SUMLIMIT of 0.001
61 Note: statistical significance is often not greatly affected by the value
62 of K, so high accuracy is generally unwarranted.
63 *****************************************************************************/
64 #define BLAST_KARLIN_K_SUMLIMIT_DEFAULT 0.0001 /**< K_SUMLIMIT_DEFAULT == sumlimit used in BlastKarlinLHtoK() */
65 
66 #define BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT (1.e-5) /**< LAMBDA_ACCURACY_DEFAULT == accuracy to which Lambda should be calc'd */
67 
68 #define BLAST_KARLIN_LAMBDA_ITER_DEFAULT 17 /**< LAMBDA_ITER_DEFAULT == no. of iterations in LambdaBis = ln(accuracy)/ln(2)*/
69 
70 #define BLAST_KARLIN_LAMBDA0_DEFAULT 0.5 /**< Initial guess for the value of Lambda in BlastKarlinLambdaNR */
71 
72 #define BLAST_KARLIN_K_ITER_MAX 100 /**< upper limit on iterations for BlastKarlinLHtoK */
73 
74 /** Number of statistical parameters in each row of the precomputed tables. */
75 #define BLAST_NUM_STAT_VALUES 11 /**< originally 8, now 11 to support Spouge's FSC. see notes below */
76 
77 /** Holds values (gap-opening, extension, etc.) for a matrix. */
79 
80 /** Used to temporarily store matrix values for retrieval. */
81 typedef struct MatrixInfo {
82  char* name; /**< name of matrix (e.g., BLOSUM90). */
83  array_of_8 *values; /**< The values (gap-opening, extension etc.). */
84  Int4 *prefs; /**< Preferences for display. */
85  Int4 max_number_values; /**< number of values (e.g., BLOSUM90_VALUES_MAX). */
87 
88 
89 /**************************************************************************************
90 
91 How the statistical parameters for the matrices are stored:
92 -----------------------------------------------------------
93 They parameters are stored in a two-dimensional array double (i.e.,
94 doubles, which has as it's first dimensions the number of different
95 gap existence and extension combinations and as it's second dimension 8.
96 The eight different columns specify:
97 
98 1.) gap existence penalty (INT2_MAX denotes infinite).
99 2.) gap extension penalty (INT2_MAX denotes infinite).
100 3.) decline to align penalty (this field is ignored)
101 4.) Lambda
102 5.) K
103 6.) H
104 7.) alpha
105 8.) beta
106 
107 (Items 4-8 are explained in:
108 Altschul SF, Bundschuh R, Olsen R, Hwa T.
109 The estimation of statistical parameters for local alignment score distributions.
110 Nucleic Acids Res. 2001 Jan 15;29(2):351-61.).
111 
112 N.B.: to support John Spouge's idea of FSC, the table is augmented with 3
113 more parameters:
114 
115 9.) C
116 10.) alpha_v
117 11.) sigma
118 
119 where C is the prefactor to the p-value expression, alpha_v is the alpha parameter
120 for the variance of length correction:
121  var(L) = alpha_v * score + beta_v
122 in contrast, the original alpha is the alpha parameter for the mean of length correction:
123  avg(L) = alpha * score + beta.
124 and finally sigma is the alpha parameter for the covariance of length correction:
125  cov(L1, L2) = sigma * score + tau
126 Note: in Spouge's theory, beta, beta_v, and tau are dependent variables.
127 
128 The values tabulated below are based on the following URL:
129 https://www.ncbi.nlm.nih.gov/CBBresearch/Spouge/html.ncbi/blast/index.html
130 
131 Take BLOSUM45 (below) as an example. Currently (5/17/02) there are
132 14 different allowed combinations (specified by "#define BLOSUM45_VALUES_MAX 14").
133 The first row in the array "blosum45_values" has INT2_MAX (i.e., 32767) for gap
134 existence, extension, and decline-to-align penalties. For all practical purposes
135 this value is large enough to be infinite, so the alignments will be ungapped.
136 BLAST may also use this value (INT2_MAX) as a signal to skip gapping, so a
137 different value should not be used if the intent is to have gapless extensions.
138 The next row is for the gap existence penalty 13 and the extension penalty 3.
139 The decline-to-align penalty is only supported in a few cases, so it is normally
140 set to INT2_MAX.
141 
142 
143 How to add a new matrix to blast_stat.c:
144 --------------------------------------
145 To add a new matrix to blast_stat.c it is necessary to complete
146 four steps. As an example consider adding the matrix
147 called TESTMATRIX
148 
149 1.) add a define specifying how many different existence and extensions
150 penalties are allowed, so it would be necessary to add the line:
151 
152 #define TESTMATRIX_VALUES_MAX 14
153 
154 if 14 values were to be allowed.
155 
156 2.) add a two-dimensional array to contain the statistical parameters:
157 
158 static array_of_8 testmatrix_values[TESTMATRIX_VALUES_MAX] ={ ...
159 
160 3.) add a "prefs" array that should hint about the "optimal"
161 gap existence and extension penalties:
162 
163 static Int4 testmatrix_prefs[TESTMATRIX_VALUES_MAX] = {
164 BLAST_MATRIX_NOMINAL,
165 ...
166 };
167 
168 4.) Go to the function BlastLoadMatrixValues (in this file) and
169 add two lines before the return at the end of the function:
170 
171  matrix_info = MatrixInfoNew("TESTMATRIX", testmatrix_values, testmatrix_prefs, TESTMATRIX_VALUES_MAX);
172  ListNodeAddPointer(&retval, 0, matrix_info);
173 
174 
175 
176 ***************************************************************************************/
177 
178 
179 
180 
181 
182 #define BLOSUM45_VALUES_MAX 14 /**< Number of different combinations supported for BLOSUM45. */
184  {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.2291, 0.0924, 0.2514, 0.9113, -5.7, 0.641318, 9.611060, 9.611060},
185  {13, 3, (double) INT2_MAX, 0.207, 0.049, 0.14, 1.5, -22, 0.671128, 35.855900, 35.963900},
186  {12, 3, (double) INT2_MAX, 0.199, 0.039, 0.11, 1.8, -34, 0.691530, 45.693600, 45.851700},
187  {11, 3, (double) INT2_MAX, 0.190, 0.031, 0.095, 2.0, -38, 0.691181, 62.874100, 63.103700},
188  {10, 3, (double) INT2_MAX, 0.179, 0.023, 0.075, 2.4, -51, 0.710529, 88.286800, 88.639100},
189  {16, 2, (double) INT2_MAX, 0.210, 0.051, 0.14, 1.5, -24, 0.666680, 36.279800, 36.452400},
190  {15, 2, (double) INT2_MAX, 0.203, 0.041, 0.12, 1.7, -31, 0.673871, 44.825700, 45.060400},
191  {14, 2, (double) INT2_MAX, 0.195, 0.032, 0.10, 1.9, -36, 0.685753, 60.736200, 61.102300},
192  {13, 2, (double) INT2_MAX, 0.185, 0.024, 0.084, 2.2, -45, 0.698480, 85.148100, 85.689400},
193  {12, 2, (double) INT2_MAX, 0.171, 0.016, 0.061, 2.8, -65, 0.713429, 127.758000, 128.582000},
194  {19, 1, (double) INT2_MAX, 0.205, 0.040, 0.11, 1.9, -43, 0.672302, 53.071400, 53.828200},
195  {18, 1, (double) INT2_MAX, 0.198, 0.032, 0.10, 2.0, -43, 0.682580, 72.342400, 73.403900},
196  {17, 1, (double) INT2_MAX, 0.189, 0.024, 0.079, 2.4, -57, 0.695035, 103.055000, 104.721000},
197  {16, 1, (double) INT2_MAX, 0.176, 0.016, 0.063, 2.8, -67, 0.712966, 170.100000, 173.003000},
198 }; /**< Supported values (gap-existence, extension, etc.) for BLOSUM45. */
199 
215 }; /**< Quality values for BLOSUM45 matrix, each element corresponds to same element number in array blosum45_values */
216 
217 
218 #define BLOSUM50_VALUES_MAX 16 /**< Number of different combinations supported for BLOSUM50. */
220  {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.2318, 0.112, 0.3362, 0.6895, -4.0, 0.609639, 5.388310, 5.388310},
221  {13, 3, (double) INT2_MAX, 0.212, 0.063, 0.19, 1.1, -16, 0.639287, 18.113800, 18.202800},
222  {12, 3, (double) INT2_MAX, 0.206, 0.055, 0.17, 1.2, -18, 0.644715, 22.654600, 22.777700},
223  {11, 3, (double) INT2_MAX, 0.197, 0.042, 0.14, 1.4, -25, 0.656327, 29.861100, 30.045700},
224  {10, 3, (double) INT2_MAX, 0.186, 0.031, 0.11, 1.7, -34, 0.671150, 42.393800, 42.674000},
225  {9, 3, (double) INT2_MAX, 0.172, 0.022, 0.082, 2.1, -48, 0.694326, 66.069600, 66.516400},
226  {16, 2, (double) INT2_MAX, 0.215, 0.066, 0.20, 1.05, -15, 0.633899, 17.951800, 18.092100},
227  {15, 2, (double) INT2_MAX, 0.210, 0.058, 0.17, 1.2, -20, 0.641985, 21.940100, 22.141800},
228  {14, 2, (double) INT2_MAX, 0.202, 0.045, 0.14, 1.4, -27, 0.650682, 28.681200, 28.961900},
229  {13, 2, (double) INT2_MAX, 0.193, 0.035, 0.12, 1.6, -32, 0.660984, 42.059500, 42.471600},
230  {12, 2, (double) INT2_MAX, 0.181, 0.025, 0.095, 1.9, -41, 0.678090, 63.747600, 64.397300},
231  {19, 1, (double) INT2_MAX, 0.212, 0.057, 0.18, 1.2, -21, 0.635714, 26.311200, 26.923300},
232  {18, 1, (double) INT2_MAX, 0.207, 0.050, 0.15, 1.4, -28, 0.643523, 34.903700, 35.734800},
233  {17, 1, (double) INT2_MAX, 0.198, 0.037, 0.12, 1.6, -33, 0.654504, 48.895800, 50.148600},
234  {16, 1, (double) INT2_MAX, 0.186, 0.025, 0.10, 1.9, -42, 0.667750, 76.469100, 78.443000},
235  {15, 1, (double) INT2_MAX, 0.171, 0.015, 0.063, 2.7, -76, 0.694575, 140.053000, 144.160000},
236 }; /**< Supported values (gap-existence, extension, etc.) for BLOSUM50. */
237 
255 }; /**< Quality values for BLOSUM50 matrix, each element corresponds to same element number in array blosum50_values */
256 
257 #define BLOSUM62_VALUES_MAX 12 /**< Number of different combinations supported for BLOSUM62. */
259  {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.3176, 0.134, 0.4012, 0.7916, -3.2, 0.623757, 4.964660, 4.964660},
260  {11, 2, (double) INT2_MAX, 0.297, 0.082, 0.27, 1.1, -10, 0.641766, 12.673800, 12.757600},
261  {10, 2, (double) INT2_MAX, 0.291, 0.075, 0.23, 1.3, -15, 0.649362, 16.474000, 16.602600},
262  {9, 2, (double) INT2_MAX, 0.279, 0.058, 0.19, 1.5, -19, 0.659245, 22.751900, 22.950000},
263  {8, 2, (double) INT2_MAX, 0.264, 0.045, 0.15, 1.8, -26, 0.672692, 35.483800, 35.821300},
264  {7, 2, (double) INT2_MAX, 0.239, 0.027, 0.10, 2.5, -46, 0.702056, 61.238300, 61.886000},
265  {6, 2, (double) INT2_MAX, 0.201, 0.012, 0.061, 3.3, -58, 0.740802, 140.417000, 141.882000},
266  {13, 1, (double) INT2_MAX, 0.292, 0.071, 0.23, 1.2, -11, 0.647715, 19.506300, 19.893100},
267  {12, 1, (double) INT2_MAX, 0.283, 0.059, 0.19, 1.5, -19, 0.656391, 27.856200, 28.469900},
268  {11, 1, (double) INT2_MAX, 0.267, 0.041, 0.14, 1.9, -30, 0.669720, 42.602800, 43.636200},
269  {10, 1, (double) INT2_MAX, 0.243, 0.024, 0.10, 2.5, -44, 0.693267, 83.178700, 85.065600},
270  {9, 1, (double) INT2_MAX, 0.206, 0.010, 0.052, 4.0, -87, 0.731887, 210.333000, 214.842000},
271 }; /**< Supported values (gap-existence, extension, etc.) for BLOSUM62. */
272 
286 }; /**< Quality values for BLOSUM62 matrix, each element corresponds to same element number in array blosum62_values */
287 
288 
289 #define BLOSUM80_VALUES_MAX 10 /**< Number of different combinations supported for BLOSUM80. */
291  {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.3430, 0.177, 0.6568, 0.5222, -1.6, 0.564057, 1.918130, 1.918130},
292  {25, 2, (double) INT2_MAX, 0.342, 0.17, 0.66, 0.52, -1.6, 0.563956, 1.731000, 1.731300},
293  {13, 2, (double) INT2_MAX, 0.336, 0.15, 0.57, 0.59, -3, 0.570979, 2.673470, 2.692300},
294  {9, 2, (double) INT2_MAX, 0.319, 0.11, 0.42, 0.76, -6, 0.587837, 5.576090, 5.667860},
295  {8, 2, (double) INT2_MAX, 0.308, 0.090, 0.35, 0.89, -9, 0.597556, 7.536950, 7.686230},
296  {7, 2, (double) INT2_MAX, 0.293, 0.070, 0.27, 1.1, -14, 0.615254, 11.586600, 11.840400},
297  {6, 2, (double) INT2_MAX, 0.268, 0.045, 0.19, 1.4, -19, 0.644054, 19.958100, 20.441200},
298  {11, 1, (double) INT2_MAX, 0.314, 0.095, 0.35, 0.90, -9, 0.590702, 8.808610, 9.223320},
299  {10, 1, (double) INT2_MAX, 0.299, 0.071, 0.27, 1.1, -14, 0.609620, 13.833800, 14.533400},
300  {9, 1, (double) INT2_MAX, 0.279, 0.048, 0.20, 1.4, -19, 0.623800, 24.252000, 25.490400},
301 }; /**< Supported values (gap-existence, extension, etc.) for BLOSUM80. */
302 
314 }; /**< Quality values for BLOSUM80 matrix, each element corresponds to same element number in array blosum80_values */
315 
316 #define BLOSUM90_VALUES_MAX 8 /**< Number of different combinations supported for BLOSUM90. */
318  {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.3346, 0.190, 0.7547, 0.4434, -1.4 , 0.544178, 1.377760, 1.377760},
319  {9, 2, (double) INT2_MAX, 0.310, 0.12, 0.46, 0.67, -6 , 0.570267, 4.232290, 4.334170},
320  {8, 2, (double) INT2_MAX, 0.300, 0.099, 0.39, 0.76, -7, 0.581580, 5.797020, 5.961420},
321  {7, 2, (double) INT2_MAX, 0.283, 0.072, 0.30, 0.93, -11, 0.600024, 9.040880, 9.321600},
322  {6, 2, (double) INT2_MAX, 0.259, 0.048, 0.22, 1.2, -16, 0.629344, 16.024400, 16.531600},
323  {11, 1, (double) INT2_MAX, 0.302, 0.093, 0.39, 0.78, -8, 0.576919, 7.143250, 7.619190},
324  {10, 1, (double) INT2_MAX, 0.290, 0.075, 0.28, 1.04, -15, 0.591366, 11.483900, 12.269800},
325  {9, 1, (double) INT2_MAX, 0.265, 0.044, 0.20, 1.3, -19, 0.613013, 21.408300, 22.840900},
326 }; /**< Supported values (gap-existence, extension, etc.) for BLOSUM90. */
327 
337 }; /**< Quality values for BLOSUM90 matrix, each element corresponds to same element number in array blosum90_values */
338 
339 #define PAM250_VALUES_MAX 16 /**< Number of different combinations supported for PAM250. */
341  {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.2252, 0.0868, 0.2223, 0.98, -5.0, 0.660059, 11.754300, 11.754300},
342  {15, 3, (double) INT2_MAX, 0.205, 0.049, 0.13, 1.6, -23, 0.687656, 34.578400, 34.928000},
343  {14, 3, (double) INT2_MAX, 0.200, 0.043, 0.12, 1.7, -26, 0.689768, 43.353000, 43.443800},
344  {13, 3, (double) INT2_MAX, 0.194, 0.036, 0.10, 1.9, -31, 0.697431, 50.948500, 51.081700},
345  {12, 3, (double) INT2_MAX, 0.186, 0.029, 0.085, 2.2, -41, 0.704565, 69.606500, 69.793600},
346  {11, 3, (double) INT2_MAX, 0.174, 0.020, 0.070, 2.5, -48, 0.722438, 98.653500, 98.927100},
347  {17, 2, (double) INT2_MAX, 0.204, 0.047, 0.12, 1.7, -28, 0.684799, 41.583800, 41.735800},
348  {16, 2, (double) INT2_MAX, 0.198, 0.038, 0.11, 1.8, -29, 0.691098, 51.635200, 51.843900},
349  {15, 2, (double) INT2_MAX, 0.191, 0.031, 0.087, 2.2, -44, 0.699051, 67.256700, 67.558500},
350  {14, 2, (double) INT2_MAX, 0.182, 0.024, 0.073, 2.5, -53, 0.714103, 96.315100, 96.756800},
351  {13, 2, (double) INT2_MAX, 0.171, 0.017, 0.059, 2.9, -64, 0.728738, 135.653000, 136.339000},
352  {21, 1, (double) INT2_MAX, 0.205, 0.045, 0.11, 1.8, -34, 0.683265, 48.728200, 49.218800},
353  {20, 1, (double) INT2_MAX, 0.199, 0.037, 0.10, 1.9, -35, 0.689380, 60.832000, 61.514100},
354  {19, 1, (double) INT2_MAX, 0.192, 0.029, 0.083, 2.3, -52, 0.696344, 84.019700, 84.985600},
355  {18, 1, (double) INT2_MAX, 0.183, 0.021, 0.070, 2.6, -60, 0.710525, 113.829000, 115.184000},
356  {17, 1, (double) INT2_MAX, 0.171, 0.014, 0.052, 3.3, -86, 0.727000, 175.071000, 177.196000},
357 }; /**< Supported values (gap-existence, extension, etc.) for PAM250. */
358 
376 }; /**< Quality values for PAM250 matrix, each element corresponds to same element number in array pam250_values */
377 
378 #define PAM30_VALUES_MAX 11 /**< Number of different combinations supported for PAM30. */
380  {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.3400, 0.283, 1.754, 0.1938, -0.3, 0.436164, 0.161818, 0.161818},
381  {7, 2, (double) INT2_MAX, 0.305, 0.15, 0.87, 0.35, -3, 0.479087, 1.014010, 1.162730},
382  {6, 2, (double) INT2_MAX, 0.287, 0.11, 0.68, 0.42, -4, 0.499980, 1.688060, 1.951430},
383  {5, 2, (double) INT2_MAX, 0.264, 0.079, 0.45, 0.59, -7, 0.533009, 3.377010, 3.871950},
384  {10, 1, (double) INT2_MAX, 0.309, 0.15, 0.88, 0.35, -3, 0.474741, 1.372050, 1.788770},
385  {9, 1, (double) INT2_MAX, 0.294, 0.11, 0.61, 0.48, -6, 0.492716, 2.463920, 3.186150},
386  {8, 1, (double) INT2_MAX, 0.270, 0.072, 0.40, 0.68, -10, 0.521286, 5.368130, 6.763480},
387  {15, 3, (double) INT2_MAX, 0.339, 0.28, 1.70, 0.20, -0.5, 0.437688, 0.157089, 0.155299},
388  {14, 2, (double) INT2_MAX, 0.337, 0.27, 1.62, 0.21, -0.8, 0.440010, 0.206970, 0.198524},
389  {14, 1, (double) INT2_MAX, 0.333, 0.27, 1.43, 0.23, -1.4, 0.444817, 0.436301, 0.361947},
390  {13, 3, (double) INT2_MAX, 0.338, 0.27, 1.69, 0.20, -0.5, 0.439086, 0.178973, 0.175436},
391 }; /**< Supported values (gap-existence, extension, etc.) for PAM30. */
392 
405 }; /**< Quality values for PAM30 matrix, each element corresponds to same element number in array pam30_values */
406 
407 
408 #define PAM70_VALUES_MAX 9 /**< Number of different combinations supported for PAM70. */
410  {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.3345, 0.229, 1.029, 0.3250, -0.7, 0.511296, 0.633439, 0.633439},
411  {8, 2, (double) INT2_MAX, 0.301, 0.12, 0.54, 0.56, -5, 0.549019, 2.881650, 3.025710},
412  {7, 2, (double) INT2_MAX, 0.286, 0.093, 0.43, 0.67, -7, 0.565659, 4.534540, 4.785780},
413  {6, 2, (double) INT2_MAX, 0.264, 0.064, 0.29, 0.90, -12, 0.596330, 7.942630, 8.402720},
414  {11, 1, (double) INT2_MAX, 0.305, 0.12, 0.52, 0.59, -6, 0.543514, 3.681400, 4.108020},
415  {10, 1, (double) INT2_MAX, 0.291, 0.091, 0.41, 0.71, -9, 0.560723, 6.002970, 6.716570},
416  {9, 1, (double) INT2_MAX, 0.270, 0.060, 0.28, 0.97, -14, 0.585186, 11.360800, 12.636700},
417  {11, 2, (double) INT2_MAX, 0.323, 0.186, 0.80, 1.32, -27, 0.524062, 1.321301, 1.281671},
418  {12, 3, (double) INT2_MAX, 0.330, 0.219, 0.93, 0.82, -16, 0.516845, 0.818768, 0.811240},
419 }; /**< Supported values (gap-existence, extension, etc.) for PAM70. */
420 
431 }; /**< Quality values for PAM70 matrix, each element corresponds to same element number in array pam70_values */
432 
433 
434 
435 #ifdef BLOSUM62_20_ENABLE
436 
437 #define BLOSUM62_20_VALUES_MAX 65 /**< Number of different combinations supported for BLOSUM62 with 1/20 bit scaling. */
438 static array_of_8 blosum62_20_values[BLOSUM62_20_VALUES_MAX] = {
439  {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.03391, 0.125, 0.4544, 0.07462, -3.2,0.0,0.0,0.0},
440  {100, 12, (double) INT2_MAX, 0.0300, 0.056, 0.21, 0.14, -15,0.0,0.0,0.0},
441  {95, 12, (double) INT2_MAX, 0.0291, 0.047, 0.18, 0.16, -20,0.0,0.0,0.0},
442  {90, 12, (double) INT2_MAX, 0.0280, 0.038, 0.15, 0.19, -28,0.0,0.0,0.0},
443  {85, 12, (double) INT2_MAX, 0.0267, 0.030, 0.13, 0.21, -31,0.0,0.0,0.0},
444  {80, 12, (double) INT2_MAX, 0.0250, 0.021, 0.10, 0.25, -39,0.0,0.0,0.0},
445  {105, 11, (double) INT2_MAX, 0.0301, 0.056, 0.22, 0.14, -16,0.0,0.0,0.0},
446  {100, 11, (double) INT2_MAX, 0.0294, 0.049, 0.20, 0.15, -17,0.0,0.0,0.0},
447  {95, 11, (double) INT2_MAX, 0.0285, 0.042, 0.16, 0.18, -25,0.0,0.0,0.0},
448  {90, 11, (double) INT2_MAX, 0.0271, 0.031, 0.14, 0.20, -28,0.0,0.0,0.0},
449  {85, 11, (double) INT2_MAX, 0.0256, 0.023, 0.10, 0.26, -46,0.0,0.0,0.0},
450  {115, 10, (double) INT2_MAX, 0.0308, 0.062, 0.22, 0.14, -20,0.0,0.0,0.0},
451  {110, 10, (double) INT2_MAX, 0.0302, 0.056, 0.19, 0.16, -26,0.0,0.0,0.0},
452  {105, 10, (double) INT2_MAX, 0.0296, 0.050, 0.17, 0.17, -27,0.0,0.0,0.0},
453  {100, 10, (double) INT2_MAX, 0.0286, 0.041, 0.15, 0.19, -32,0.0,0.0,0.0},
454  {95, 10, (double) INT2_MAX, 0.0272, 0.030, 0.13, 0.21, -35,0.0,0.0,0.0},
455  {90, 10, (double) INT2_MAX, 0.0257, 0.022, 0.11, 0.24, -40,0.0,0.0,0.0},
456  {85, 10, (double) INT2_MAX, 0.0242, 0.017, 0.083, 0.29, -51,0.0,0.0,0.0},
457  {115, 9, (double) INT2_MAX, 0.0306, 0.061, 0.24, 0.13, -14,0.0,0.0,0.0},
458  {110, 9, (double) INT2_MAX, 0.0299, 0.053, 0.19, 0.16, -23,0.0,0.0,0.0},
459  {105, 9, (double) INT2_MAX, 0.0289, 0.043, 0.17, 0.17, -23,0.0,0.0,0.0},
460  {100, 9, (double) INT2_MAX, 0.0279, 0.036, 0.14, 0.20, -31,0.0,0.0,0.0},
461  {95, 9, (double) INT2_MAX, 0.0266, 0.028, 0.12, 0.23, -37,0.0,0.0,0.0},
462  {120, 8, (double) INT2_MAX, 0.0307, 0.062, 0.22, 0.14, -18,0.0,0.0,0.0},
463  {115, 8, (double) INT2_MAX, 0.0300, 0.053, 0.20, 0.15, -19,0.0,0.0,0.0},
464  {110, 8, (double) INT2_MAX, 0.0292, 0.046, 0.17, 0.17, -23,0.0,0.0,0.0},
465  {105, 8, (double) INT2_MAX, 0.0280, 0.035, 0.14, 0.20, -31,0.0,0.0,0.0},
466  {100, 8, (double) INT2_MAX, 0.0266, 0.026, 0.12, 0.23, -37,0.0,0.0,0.0},
467  {125, 7, (double) INT2_MAX, 0.0306, 0.058, 0.22, 0.14, -18,0.0,0.0,0.0},
468  {120, 7, (double) INT2_MAX, 0.0300, 0.052, 0.19, 0.16, -23,0.0,0.0,0.0},
469  {115, 7, (double) INT2_MAX, 0.0292, 0.044, 0.17, 0.17, -24,0.0,0.0,0.0},
470  {110, 7, (double) INT2_MAX, 0.0279, 0.032, 0.14, 0.20, -31,0.0,0.0,0.0},
471  {105, 7, (double) INT2_MAX, 0.0267, 0.026, 0.11, 0.24, -41,0.0,0.0,0.0},
472  {120,10,5, 0.0298, 0.049, 0.19, 0.16, -21,0.0,0.0,0.0},
473  {115,10,5, 0.0290, 0.042, 0.16, 0.18, -25,0.0,0.0,0.0},
474  {110,10,5, 0.0279, 0.033, 0.13, 0.21, -32,0.0,0.0,0.0},
475  {105,10,5, 0.0264, 0.024, 0.10, 0.26, -46,0.0,0.0,0.0},
476  {100,10,5, 0.0250, 0.018, 0.081, 0.31, -56,0.0,0.0,0.0},
477  {125,10,4, 0.0301, 0.053, 0.18, 0.17, -25,0.0,0.0,0.0},
478  {120,10,4, 0.0292, 0.043, 0.15, 0.20, -33,0.0,0.0,0.0},
479  {115,10,4, 0.0282, 0.035, 0.13, 0.22, -36,0.0,0.0,0.0},
480  {110,10,4, 0.0270, 0.027, 0.11, 0.25, -41,0.0,0.0,0.0},
481  {105,10,4, 0.0254, 0.020, 0.079, 0.32, -60,0.0,0.0,0.0},
482  {130,10,3, 0.0300, 0.051, 0.17, 0.18, -27,0.0,0.0,0.0},
483  {125,10,3, 0.0290, 0.040, 0.13, 0.22, -38,0.0,0.0,0.0},
484  {120,10,3, 0.0278, 0.030, 0.11, 0.25, -44,0.0,0.0,0.0},
485  {115,10,3, 0.0267, 0.025, 0.092, 0.29, -52,0.0,0.0,0.0},
486  {110,10,3, 0.0252, 0.018, 0.070, 0.36, -70,0.0,0.0,0.0},
487  {135,10,2, 0.0292, 0.040, 0.13, 0.22, -35,0.0,0.0,0.0},
488  {130,10,2, 0.0283, 0.034, 0.10, 0.28, -51,0.0,0.0,0.0},
489  {125,10,2, 0.0269, 0.024, 0.077, 0.35, -71,0.0,0.0,0.0},
490  {120,10,2, 0.0253, 0.017, 0.059, 0.43, -90,0.0,0.0,0.0},
491  {115,10,2, 0.0234, 0.011, 0.043, 0.55, -121,0.0,0.0,0.0},
492  {100,14,3, 0.0258, 0.023, 0.087, 0.33, -59,0.0,0.0,0.0},
493  {105,13,3, 0.0263, 0.024, 0.085, 0.31, -57,0.0,0.0,0.0},
494  {110,12,3, 0.0271, 0.028, 0.093, 0.29, -54,0.0,0.0,0.0},
495  {115,11,3, 0.0275, 0.030, 0.10, 0.27, -49,0.0,0.0,0.0},
496  {125,9,3, 0.0283, 0.034, 0.12, 0.23, -38,0.0,0.0,0.0},
497  {130,8,3, 0.0287, 0.037, 0.12, 0.23, -40,0.0,0.0,0.0},
498  {125,7,3, 0.0287, 0.036, 0.12, 0.24, -44,0.0,0.0,0.0},
499  {140,6,3, 0.0285, 0.033, 0.12, 0.23, -40,0.0,0.0,0.0},
500  {105,14,3, 0.0270, 0.028, 0.10, 0.27, -46,0.0,0.0,0.0},
501  {110,13,3, 0.0279, 0.034, 0.10, 0.27, -50,0.0,0.0,0.0},
502  {115,12,3, 0.0282, 0.035, 0.12, 0.24, -42,0.0,0.0,0.0},
503  {120,11,3, 0.0286, 0.037, 0.12, 0.24, -44,0.0,0.0,0.0},
504 }; /**< Supported values (gap-existence, extension, etc.) for BLOSUM62_20. */
505 
506 static Int4 blosum62_20_prefs[BLOSUM62_20_VALUES_MAX] = {
572 }; /**< Quality values for BLOSUM62_20 matrix, each element corresponds to same element number in array blosum62_20_values */
573 
574 #endif
575 
576 
577 #define PROT_IDENTITY_VALUES_MAX 2
579  {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.28768, 0.282, 1.69, 0.1703, -0.3, 0.43828, 0.16804, 0.16804},
580  {15, 2, (double) INT2_MAX, 0.2835, 0.255, 1.49, 0.19, -1, 0.44502, 0.24613, 0.22743}
581 };
582 
586 };
587 
588 
589 /** Supported substitution and gap costs with corresponding quality values
590  * for nucleotide sequence comparisons.
591  * NB: the values 0 and 0 for the gap costs are treated as the defaults used for
592  * the greedy gapped extension, i.e.
593  * gap opening = 0,
594  * gap extension = 1/2 match - mismatch.
595  *
596  * The fields are:
597  *
598  * 1. Gap opening cost,
599  * 2. Gap extension cost,
600  * 3. Lambda,
601  * 4. K,
602  * 5. H,
603  * 6. Alpha,
604  * 7. Beta,
605  * 8. Theta
606  */
607 
608 // TODO: add gumbel parameters for nucleotide cases
609 
610 /** Karlin-Altschul parameter values for substitution scores 1 and -5. */
611 static const array_of_8 blastn_values_1_5[] = {
612  { 0, 0, 1.39, 0.747, 1.38, 1.00, 0, 100 },
613  { 3, 3, 1.39, 0.747, 1.38, 1.00, 0, 100 }
614 };
615 
616 /** Karlin-Altschul parameter values for substitution scores 1 and -4. */
617 static const array_of_8 blastn_values_1_4[] = {
618  { 0, 0, 1.383, 0.738, 1.36, 1.02, 0, 100 },
619  { 1, 2, 1.36, 0.67, 1.2, 1.1, 0, 98 },
620  { 0, 2, 1.26, 0.43, 0.90, 1.4, -1, 91 },
621  { 2, 1, 1.35, 0.61, 1.1, 1.2, -1, 98 },
622  { 1, 1, 1.22, 0.35, 0.72, 1.7, -3, 88 }
623 };
624 
625 /** Karlin-Altschul parameter values for substitution scores 2 and -7.
626  * These parameters can only be applied to even scores. Any odd score must be
627  * rounded down to the nearest even number before calculating the e-value.
628  */
629 static const array_of_8 blastn_values_2_7[] = {
630  { 0, 0, 0.69, 0.73, 1.34, 0.515, 0, 100 },
631  { 2, 4, 0.68, 0.67, 1.2, 0.55, 0, 99 },
632  { 0, 4, 0.63, 0.43, 0.90, 0.7, -1, 91 },
633  { 4, 2, 0.675, 0.62, 1.1, 0.6, -1, 98 },
634  { 2, 2, 0.61, 0.35, 0.72, 1.7, -3, 88 }
635 };
636 
637 /** Karlin-Altschul parameter values for substitution scores 1 and -3. */
638 static const array_of_8 blastn_values_1_3[] = {
639  { 0, 0, 1.374, 0.711, 1.31, 1.05, 0, 100 },
640  { 2, 2, 1.37, 0.70, 1.2, 1.1, 0, 99 },
641  { 1, 2, 1.35, 0.64, 1.1, 1.2, -1, 98 },
642  { 0, 2, 1.25, 0.42, 0.83, 1.5, -2, 91 },
643  { 2, 1, 1.34, 0.60, 1.1, 1.2, -1, 97 },
644  { 1, 1, 1.21, 0.34, 0.71, 1.7, -2, 88 }
645 };
646 
647 /** Karlin-Altschul parameter values for substitution scores 2 and -5.
648  * These parameters can only be applied to even scores. Any odd score must be
649  * rounded down to the nearest even number before calculating the e-value.
650  */
651 static const array_of_8 blastn_values_2_5[] = {
652  { 0, 0, 0.675, 0.65, 1.1, 0.6, -1, 99 },
653  { 2, 4, 0.67, 0.59, 1.1, 0.6, -1, 98 },
654  { 0, 4, 0.62, 0.39, 0.78, 0.8, -2, 91 },
655  { 4, 2, 0.67, 0.61, 1.0, 0.65, -2, 98 },
656  { 2, 2, 0.56, 0.32, 0.59, 0.95, -4, 82 }
657 };
658 
659 /** Karlin-Altschul parameter values for substitution scores 1 and -2. */
660 static const array_of_8 blastn_values_1_2[] = {
661  { 0, 0, 1.28, 0.46, 0.85, 1.5, -2, 96 },
662  { 2, 2, 1.33, 0.62, 1.1, 1.2, 0, 99 },
663  { 1, 2, 1.30, 0.52, 0.93, 1.4, -2, 97 },
664  { 0, 2, 1.19, 0.34, 0.66, 1.8, -3, 89 },
665  { 3, 1, 1.32, 0.57, 1.0, 1.3, -1, 99 },
666  { 2, 1, 1.29, 0.49, 0.92, 1.4, -1, 96 },
667  { 1, 1, 1.14, 0.26, 0.52, 2.2, -5, 85 }
668 };
669 
670 /** Karlin-Altschul parameter values for substitution scores 2 and -3.
671  * These parameters can only be applied to even scores. Any odd score must be
672  * rounded down to the nearest even number before calculating the e-value.
673  */
674 static const array_of_8 blastn_values_2_3[] = {
675  { 0, 0, 0.55, 0.21, 0.46, 1.2, -5, 87 },
676  { 4, 4, 0.63, 0.42, 0.84, 0.75, -2, 99 },
677  { 2, 4, 0.615, 0.37, 0.72, 0.85, -3, 97 },
678  { 0, 4, 0.55, 0.21, 0.46, 1.2, -5, 87 },
679  { 3, 3, 0.615, 0.37, 0.68, 0.9, -3, 97 },
680  { 6, 2, 0.63, 0.42, 0.84, 0.75, -2, 99 },
681  { 5, 2, 0.625, 0.41, 0.78, 0.8, -2, 99 },
682  { 4, 2, 0.61, 0.35, 0.68, 0.9, -3, 96 },
683  { 2, 2, 0.515, 0.14, 0.33, 1.55, -9, 81 }
684 };
685 
686 /** Karlin-Altschul parameter values for substitution scores 3 and -4. */
687 static const array_of_8 blastn_values_3_4[] = {
688  { 6, 3, 0.389, 0.25, 0.56, 0.7, -5, 95},
689  { 5, 3, 0.375, 0.21, 0.47, 0.8, -6, 92},
690  { 4, 3, 0.351, 0.14, 0.35, 1.0, -9, 86},
691  { 6, 2, 0.362, 0.16, 0.45, 0.8, -4, 88},
692  { 5, 2, 0.330, 0.092, 0.28, 1.2, -13, 81},
693  { 4, 2, 0.281, 0.046, 0.16, 1.8, -23, 69}
694 };
695 
696 /** Karlin-Altschul parameter values for substitution scores 4 and -5. */
697 static const array_of_8 blastn_values_4_5[] = {
698  { 0, 0, 0.22, 0.061, 0.22, 1.0, -15, 74 },
699  { 6, 5, 0.28, 0.21, 0.47, 0.6 , -7, 93 },
700  { 5, 5, 0.27, 0.17, 0.39, 0.7, -9, 90 },
701  { 4, 5, 0.25, 0.10, 0.31, 0.8, -10, 83 },
702  { 3, 5, 0.23, 0.065, 0.25, 0.9, -11, 76 }
703 };
704 
705 /** Karlin-Altschul parameter values for substitution scores 1 and -1. */
706 static const array_of_8 blastn_values_1_1[] = {
707  { 3, 2, 1.09, 0.31, 0.55, 2.0, -2, 99 },
708  { 2, 2, 1.07, 0.27, 0.49, 2.2, -3, 97 },
709  { 1, 2, 1.02, 0.21, 0.36, 2.8, -6, 92 },
710  { 0, 2, 0.80, 0.064, 0.17, 4.8, -16, 72 },
711  { 4, 1, 1.08, 0.28, 0.54, 2.0, -2, 98 },
712  { 3, 1, 1.06, 0.25, 0.46, 2.3, -4, 96 },
713  { 2, 1, 0.99, 0.17, 0.30, 3.3, -10, 90 }
714 };
715 
716 /** Karlin-Altschul parameter values for substitution scores 3 and -2. */
717 static const array_of_8 blastn_values_3_2[] = {
718  { 5, 5, 0.208, 0.030, 0.072, 2.9, -47, 77}
719 };
720 
721 /** Karlin-Altschul parameter values for substitution scores 5 and -4. */
722 static const array_of_8 blastn_values_5_4[] = {
723  { 10, 6, 0.163, 0.068, 0.16, 1.0, -19, 85 },
724  { 8, 6, 0.146, 0.039, 0.11, 1.3, -29, 76 }
725 };
726 
727 /** Deallocates SBlastScoreMatrix structure
728  * @param matrix structure to deallocate [in]
729  * @return NULL
730  */
733 {
734  if ( !matrix ) {
735  return NULL;
736  }
737 
738  if (matrix->data) {
739  matrix->data = (int**) _PSIDeallocateMatrix((void**) matrix->data,
740  (unsigned int)matrix->ncols);
741  }
742 
743  /* Deallocate the matrix frequencies which is used by the
744  * nucleotide custom matrix reader. -RMH-
745  */
746  if ( matrix->freqs )
747  sfree(matrix->freqs);
748 
749  sfree(matrix);
750  return NULL;
751 }
752 
753 /** Allocates a new SBlastScoreMatrix structure of the specified dimensions.
754  * @param ncols number of columns [in]
755  * @param nrows number of rows [in]
756  * @return NULL in case of memory allocation failure, else new
757  * SBlastScoreMatrix structure
758  */
760 SBlastScoreMatrixNew(size_t ncols, size_t nrows)
761 {
762  SBlastScoreMatrix* retval = NULL;
763 
764  retval = (SBlastScoreMatrix*) calloc(1, sizeof(SBlastScoreMatrix));
765  if ( !retval ) {
766  return SBlastScoreMatrixFree(retval);
767  }
768 
769  retval->data = (int**) _PSIAllocateMatrix((unsigned int)ncols, (unsigned int)nrows, sizeof(int));
770  if ( !retval->data ) {
771  return SBlastScoreMatrixFree(retval);
772  }
773 
774  /* Allocate additional attributes for use with custom
775  * nucleotide matrices. -RMH-
776  */
777  retval->freqs = (double *) calloc(ncols, sizeof(double));
778  retval->lambda = 0;
779 
780  retval->ncols = ncols;
781  retval->nrows = nrows;
782  return retval;
783 }
784 
787 {
788  if ( !matrix ) {
789  return NULL;
790  }
791 
792  if (matrix->freq_ratios) {
793  matrix->freq_ratios = (double**) _PSIDeallocateMatrix((void**)
794  matrix->freq_ratios,
795  (unsigned int)matrix->pssm->ncols);
796  }
797 
798  matrix->pssm = SBlastScoreMatrixFree(matrix->pssm);
799  matrix->kbp = Blast_KarlinBlkFree(matrix->kbp);
800  sfree(matrix);
801  return NULL;
802 }
803 
806 {
807  SPsiBlastScoreMatrix* retval = NULL;
808 
809  retval = (SPsiBlastScoreMatrix*) calloc(1, sizeof(SPsiBlastScoreMatrix));
810  if ( !retval ) {
811  return SPsiBlastScoreMatrixFree(retval);
812  }
813 
814  retval->pssm = SBlastScoreMatrixNew(ncols, BLASTAA_SIZE);
815 
816  if ( !retval->pssm ) {
817  return SPsiBlastScoreMatrixFree(retval);
818  }
819 
820  retval->freq_ratios = (double**) _PSIAllocateMatrix((unsigned int)ncols, BLASTAA_SIZE,
821  sizeof(double));
822  if ( !retval->freq_ratios ) {
823  return SPsiBlastScoreMatrixFree(retval);
824  }
825 
826  retval->kbp = Blast_KarlinBlkNew();
827  if ( !retval->kbp ) {
828  return SPsiBlastScoreMatrixFree(retval);
829  }
830 
831  return retval;
832 }
833 
834 /*
835  allocate space for gumbel block
836 */
837 static Blast_GumbelBlk*
839  return (Blast_GumbelBlk*) calloc(1, sizeof(Blast_GumbelBlk));
840 }
841 
842 /*
843  free space for gumbel blcok
844 */
845 static Blast_GumbelBlk*
847  if ( !gbp) return NULL;
848  sfree(gbp);
849  return NULL;
850 }
851 
852 int
854 {
855  int index = 0;
856  Boolean found = FALSE;
857 
858  if (sbp == NULL)
859  return -1;
860 
861  if (sbp->kbp == NULL || sbp->sfp == NULL)
862  return 1;
863 
864  for (index=0; index<sbp->number_of_contexts; index++)
865  {
866  if (sbp->kbp[index] || sbp->sfp[index])
867  {
868  found = TRUE;
869  break;
870  }
871  }
872 
873  if (found)
874  return 0;
875  else
876  return 1;
877 }
878 
879 /*
880  Allocates memory for the BlastScoreBlk*.
881 */
882 
884 BlastScoreBlkNew(Uint1 alphabet, Int4 number_of_contexts)
885 
886 {
887  BlastScoreBlk* sbp;
888  char* use_old_fsc;
889 
890  sbp = (BlastScoreBlk*) calloc(1, sizeof(BlastScoreBlk));
891 
892  if ( !sbp ) {
893  return NULL;
894  }
895 
896  sbp->alphabet_code = alphabet;
897  if (alphabet != BLASTNA_SEQ_CODE) {
899  } else {
901  }
902 
903  /* set the alphabet type (protein or not). */
904  switch (alphabet) {
905  case BLASTAA_SEQ_CODE:
906  sbp->protein_alphabet = TRUE;
907  break;
908  case BLASTNA_SEQ_CODE:
909  sbp->protein_alphabet = FALSE;
910  break;
911  default:
912  break;
913  }
914 
916  if (sbp->matrix == NULL) {
917  return BlastScoreBlkFree(sbp);
918  }
919  sbp->scale_factor = 1.0;
920 
921  /* FSCOLD: to switch back to original FSC, comment out the following line */
922  use_old_fsc = getenv("OLD_FSC");
923  if (!use_old_fsc) sbp->gbp = s_BlastGumbelBlkNew();
924 
925  sbp->number_of_contexts = number_of_contexts;
926  sbp->sfp = (Blast_ScoreFreq**)
927  calloc(sbp->number_of_contexts, sizeof(Blast_ScoreFreq*));
928  sbp->kbp_std = (Blast_KarlinBlk**)
929  calloc(sbp->number_of_contexts, sizeof(Blast_KarlinBlk*));
930  sbp->kbp_gap_std = (Blast_KarlinBlk**)
931  calloc(sbp->number_of_contexts, sizeof(Blast_KarlinBlk*));
932  sbp->kbp_psi = (Blast_KarlinBlk**)
933  calloc(sbp->number_of_contexts, sizeof(Blast_KarlinBlk*));
934  sbp->kbp_gap_psi = (Blast_KarlinBlk**)
935  calloc(sbp->number_of_contexts, sizeof(Blast_KarlinBlk*));
936 
937  return sbp;
938 }
939 
942 {
943  if (sfp == NULL)
944  return NULL;
945 
946  if (sfp->sprob0 != NULL)
947  sfree(sfp->sprob0);
948  sfree(sfp);
949  return sfp;
950 }
951 
952 /*
953  Deallocates the Karlin Block.
954 */
957 
958 {
959  sfree(kbp);
960 
961  return kbp;
962 }
963 
966 
967 {
968  Int4 index;
969  if (sbp == NULL)
970  return NULL;
971 
972  for (index=0; index<sbp->number_of_contexts; index++) {
973  if (sbp->sfp)
974  sbp->sfp[index] = Blast_ScoreFreqFree(sbp->sfp[index]);
975  if (sbp->kbp_std)
976  sbp->kbp_std[index] = Blast_KarlinBlkFree(sbp->kbp_std[index]);
977  if (sbp->kbp_gap_std)
978  sbp->kbp_gap_std[index] = Blast_KarlinBlkFree(sbp->kbp_gap_std[index]);
979  if (sbp->kbp_psi)
980  sbp->kbp_psi[index] = Blast_KarlinBlkFree(sbp->kbp_psi[index]);
981  if (sbp->kbp_gap_psi)
982  sbp->kbp_gap_psi[index] = Blast_KarlinBlkFree(sbp->kbp_gap_psi[index]);
983  }
984  if (sbp->kbp_ideal)
986  if (sbp->gbp)
987  sbp->gbp = s_BlastGumbelBlkFree(sbp->gbp);
988  sfree(sbp->sfp);
989  sbp->kbp = NULL;
990  sbp->kbp_gap = NULL;
991  sfree(sbp->kbp_std);
992  sfree(sbp->kbp_psi);
993  sfree(sbp->kbp_gap_std);
994  sfree(sbp->kbp_gap_psi);
995  sbp->matrix = SBlastScoreMatrixFree(sbp->matrix);
996  sbp->comments = ListNodeFreeData(sbp->comments);
997  if (sbp->name) {
998  sfree(sbp->name);
999  }
1001  sfree(sbp->ambiguous_res);
1002  sfree(sbp);
1003 
1004  return NULL;
1005 }
1006 
1007 /*
1008  Set the ambiguous residue (e.g, 'N', 'X') in the BlastScoreBlk*.
1009  Convert from ncbieaa to sbp->alphabet_code (i.e., ncbistdaa) first.
1010 */
1011 Int2
1012 BLAST_ScoreSetAmbigRes(BlastScoreBlk* sbp, char ambiguous_res)
1013 
1014 {
1015  Int2 index;
1016  Uint1* ambig_buffer;
1017 
1018  if (sbp == NULL)
1019  return 1;
1020 
1021  if (sbp->ambig_occupy >= sbp->ambig_size)
1022  {
1023  sbp->ambig_size += 5;
1024  ambig_buffer = (Uint1 *) calloc(sbp->ambig_size, sizeof(Uint1));
1025  for (index=0; index<sbp->ambig_occupy; index++)
1026  {
1027  ambig_buffer[index] = sbp->ambiguous_res[index];
1028  }
1029  sfree(sbp->ambiguous_res);
1030  sbp->ambiguous_res = ambig_buffer;
1031  }
1032 
1033  if (sbp->alphabet_code == BLASTAA_SEQ_CODE)
1034  {
1035  sbp->ambiguous_res[sbp->ambig_occupy] =
1036  AMINOACID_TO_NCBISTDAA[toupper((unsigned char) ambiguous_res)];
1037  }
1038  else {
1039  if (sbp->alphabet_code == BLASTNA_SEQ_CODE)
1040  sbp->ambiguous_res[sbp->ambig_occupy] =
1041  IUPACNA_TO_BLASTNA[toupper((unsigned char) ambiguous_res)];
1042  else if (sbp->alphabet_code == NCBI4NA_SEQ_CODE)
1043  sbp->ambiguous_res[sbp->ambig_occupy] =
1044  IUPACNA_TO_NCBI4NA[toupper((unsigned char) ambiguous_res)];
1045  }
1046  (sbp->ambig_occupy)++;
1047 
1048 
1049  return 0;
1050 }
1051 
1052 /** Fill in the matrix for blastn using the penaly and rewards
1053  * The query sequence alphabet is blastna, the subject sequence
1054  * is ncbi2na. The alphabet blastna is defined in blast_stat.h
1055  * and the first four elements of blastna are identical to ncbi2na.
1056  * @param sbp the BlastScoreBlk on which reward, penalty, and matrix will be set [in|out]
1057  * @return zero on success.
1058 */
1059 
1061 {
1062  Int2 index1, index2, degen;
1063  Int2 degeneracy[BLASTNA_SIZE+1];
1064  Int4 reward; /* reward for match of bases. */
1065  Int4 penalty; /* cost for mismatch of bases. */
1066  Int4** matrix; /* matrix to be populated. */
1067  /* How many of the first bases are ambiguous (four, of course). */
1068  const int k_number_non_ambig_bp = 4;
1069 
1070  ASSERT(sbp);
1072  ASSERT(sbp->matrix);
1073  ASSERT(sbp->matrix->ncols == BLASTNA_SIZE);
1074  ASSERT(sbp->matrix->nrows == BLASTNA_SIZE);
1075 
1076  reward = sbp->reward;
1077  penalty = sbp->penalty;
1078  matrix = sbp->matrix->data;
1079 
1080  for (index1 = 0; index1<BLASTNA_SIZE; index1++)
1081  for (index2 = 0; index2<BLASTNA_SIZE; index2++)
1082  matrix[index1][index2] = 0;
1083 
1084  /* In blastna the 1st four bases are A, C, G, and T, exactly as it is
1085  ncbi2na. */
1086  /* ncbi4na gives them the value 1, 2, 4, and 8. */
1087  /* Set the first four bases to degen. one */
1088  for (index1=0; index1<k_number_non_ambig_bp; index1++)
1089  degeneracy[index1] = 1;
1090 
1091  for (index1=k_number_non_ambig_bp; index1<BLASTNA_SIZE; index1++) {
1092  degen=0;
1093  for (index2=0; index2<k_number_non_ambig_bp; index2++) /* ncbi2na */
1094  {
1095  if (BLASTNA_TO_NCBI4NA[index1] & BLASTNA_TO_NCBI4NA[index2])
1096  degen++;
1097  }
1098  degeneracy[index1] = degen;
1099  }
1100 
1101 
1102  for (index1=0; index1<BLASTNA_SIZE; index1++) {
1103  for (index2=index1; index2<BLASTNA_SIZE; index2++) {
1104  if (BLASTNA_TO_NCBI4NA[index1] & BLASTNA_TO_NCBI4NA[index2]) {
1105  /* round up for positive scores, down for negatives. */
1106  matrix[index1][index2] =
1107  (Int4)BLAST_Nint( (double) ((degeneracy[index2]-1)*penalty +
1108  reward)/ (double) degeneracy[index2]);
1109  if (index1 != index2)
1110  {
1111  matrix[index2][index1] = matrix[index1][index2];
1112  }
1113  }
1114  else
1115  {
1116  matrix[index1][index2] = penalty;
1117  matrix[index2][index1] = penalty;
1118  }
1119  }
1120  }
1121 
1122  /* The value of 15 is a gap, which is a sentinel between strands in
1123  the ungapped extension algorithm */
1124  for (index1=0; index1<BLASTNA_SIZE; index1++)
1125  matrix[BLASTNA_SIZE-1][index1] = INT4_MIN / 2;
1126  for (index1=0; index1<BLASTNA_SIZE; index1++)
1127  matrix[index1][BLASTNA_SIZE-1] = INT4_MIN / 2;
1128 
1129  return 0;
1130 }
1131 
1132 /**
1133  * Read in a custom nucleotide matrix from the FILE *fp.
1134  * This function ASSUMES that the matrices are in the
1135  * format:
1136  *
1137  * # FREQS A 0.255 C 0.245 G 0.245 T 0.255
1138  * A R G C Y T K M S W N X
1139  * A 9 1 -5 -12 -12 -13 -9 -1 -8 -2 -1 -27
1140  * R 3 2 1 -12 -12 -13 -5 -4 -5 -4 -1 -27
1141  * ...
1142  *
1143  * @param sbp the BlastScoreBlk with the matrix to be populated [in|out]
1144  * @param fp the file pointer to read from [in]
1145  * @return zero on success
1146  *
1147  * -RMH-
1148  */
1149 
1150 static Int2
1152 {
1153  Int4 ** matrix;
1154  double* freqs;
1155  Int4 i = 0;
1156  Int4 j = 0;
1157  Int4 rowIdx,colIdx,val,base;
1158  Int4 numFreqs = 0;
1159  Int4 alphaSize = 0;
1160  double fval;
1161  register int index1, index2;
1162  char fbuf[512+3];
1163  char alphabet[24];
1164  char *cp,*ncp,*lp;
1165  double lambda_upper = 0;
1166  double lambda_lower = 0;
1167  double lambda = 0.5;
1168  double sum;
1169  double check;
1170  const char kCommentChar = '#';
1171  const char* kTokenStr = " \t\n\r";
1172 
1173  // Initialize matrix to default values
1174  matrix = sbp->matrix->data;
1175  for (index1 = 0; index1 < sbp->alphabet_size; index1++)
1176  for (index2 = 0; index2 < sbp->alphabet_size; index2++)
1177  matrix[index1][index2] = BLAST_SCORE_MIN;
1178 
1179  // Initialize matrix freqs to default values
1180  freqs = sbp->matrix->freqs;
1181  for (index1 = 0; index1 < sbp->alphabet_size; index1++)
1182  freqs[index1] = 0;
1183 
1184  alphabet[0] = 0;
1185  while ( fgets(fbuf,sizeof(fbuf),fp) ) {
1186  if (strchr(fbuf, '\n') == NULL) {
1187  return 2;
1188  }
1189 
1190  /* initialize column pointer */
1191  cp = (char *)fbuf;
1192 
1193  /* eat whitespace */
1194  while( (*cp) && isspace(*cp) ) cp++;
1195 
1196  if (*cp == kCommentChar) {
1197  /* special case FREQS line ( must exist ) */
1198  if ( (ncp = strstr( cp, (const char *)"FREQS" )) != NULL ) {
1199  cp = ncp + 5;
1200  /* eat whitespace */
1201  while( (*cp) && isspace(*cp) ) cp++;
1202 
1203  lp = (char*)strtok(cp, kTokenStr);
1204  /* Missing values */
1205  if (lp == NULL)
1206  return 2;
1207 
1208  numFreqs = 0;
1209  while (lp != NULL) {
1210  // Read Nucleotide
1211  base = (int)IUPACNA_TO_BLASTNA[toupper((unsigned char)(*lp))];
1212 
1213  lp = (char*)strtok(NULL, kTokenStr);
1214  /* Expected a token pair */
1215  if ( lp == NULL )
1216  return 2;
1217 
1218  // Read Frequency
1219  if ( sscanf(lp, "%lf", &fval ) != 1 )
1220  return( 2 );
1221 
1222  // Store base/fval
1223  freqs[base] = fval;
1224  numFreqs++;
1225  lp = (char*)strtok(NULL, kTokenStr);
1226  }
1227  }else {
1228  /* save the comment line in a linked list */
1229  *strchr(cp, '\n') = NULLB;
1230  ListNodeCopyStr(&sbp->comments, 0, cp);
1231  }
1232  continue;
1233  }
1234 
1235  /* alphabet line */
1236  if ( isalpha(*cp) && !alphabet[0] ) {
1237  j = 0;
1238  lp = (char*)strtok(cp, kTokenStr);
1239  while (lp != NULL) {
1240  alphabet[j++] = toupper((unsigned char)(*lp));
1241  lp = (char*)strtok(NULL, kTokenStr);
1242  }
1243  alphabet[j] = 0;
1244  alphaSize = j;
1245  continue;
1246  }else if ( isalpha(*cp) ) {
1247  /* Chew off first alphabet character */
1248  cp++;
1249  /* eat whitespace */
1250  while( (*cp) && isspace(*cp) ) cp++;
1251  }
1252 
1253  /* Matrix data */
1254  if ( isdigit(*cp) || *cp == '-' ) {
1255  j = 0;
1256  lp = (char*)strtok(cp, kTokenStr);
1257  rowIdx = (int)IUPACNA_TO_BLASTNA[toupper((unsigned char)alphabet[i])];
1258  while (lp != NULL) {
1259  if ( sscanf(lp, "%d", &val ) != 1 )
1260  return( 2 );
1261  colIdx = (int)IUPACNA_TO_BLASTNA[toupper((unsigned char)alphabet[j++])];
1262  matrix[rowIdx][colIdx] = val;
1263  lp = (char*)strtok(NULL, kTokenStr);
1264  }
1265  /* We should have as many values as we do characters in the
1266  alphabet */
1267  if ( j != alphaSize )
1268  return( 2 );
1269  i++;
1270  continue;
1271  }
1272  }
1273 
1274  /* Expected 4 base frequencies, and a square matrix */
1275  if ( numFreqs != 4 || i != alphaSize )
1276  return( 2 );
1277 
1278  /* Calculate lambda for complexity adjusted scoring. This
1279  scoring system was designed by Phil Green and used in the
1280  cross_match package. It was also used in MaskerAid to make
1281  wublast compatable with RepeatMasker. */
1282  do {
1283  sum = 0;
1284  check = 0;
1285  for ( i = 0 ; i < sbp->alphabet_size ; i++ ) {
1286  for ( j = 0 ; j < sbp->alphabet_size ; j++ ) {
1287  if ( freqs[i] && freqs[j] )
1288  {
1289  sum += freqs[i] * freqs[j] *
1290  exp( lambda * matrix[i][j] );
1291  check += freqs[i] * freqs[j];
1292  }
1293  }
1294  }
1295  ASSERT( ( check < (double)1.001 ) && ( check > (double)0.999 ) );
1296  if ( sum < 1.0 ) {
1297  lambda_lower = lambda;
1298  lambda *= 2.0;
1299  }
1300  } while ( sum < 1.0 );
1301 
1302  lambda_upper = lambda;
1303 
1304  while ( lambda_upper - lambda_lower > (double).00001 ) {
1305  lambda = ( lambda_lower + lambda_upper ) / 2.0;
1306  sum = 0;
1307  check = 0;
1308  for ( i = 0 ; i < sbp->alphabet_size ; i++ )
1309  {
1310  for ( j = 0 ; j < sbp->alphabet_size ; j++ )
1311  {
1312  if ( freqs[i] && freqs[j] )
1313  {
1314  sum += freqs[i] * freqs[j] *
1315  exp( lambda * matrix[i][j] );
1316  check += freqs[i] * freqs[j];
1317  }
1318  }
1319  }
1320  ASSERT( ( check < (double)1.001 ) && ( check > (double).999 ) );
1321  if ( sum >= 1.0 ) {
1322  lambda_upper = lambda;
1323  }
1324  else {
1325  lambda_lower = lambda;
1326  }
1327  }
1328  sbp->matrix->lambda = lambda;
1329 
1330  /* The value of 15 is a gap, which is a sentinel between strands in
1331  the ungapped extension algorithm. */
1332  for (index1=0; index1<BLASTNA_SIZE; index1++)
1333  matrix[BLASTNA_SIZE-1][index1] = INT4_MIN / 2;
1334  for (index1=0; index1<BLASTNA_SIZE; index1++)
1335  matrix[index1][BLASTNA_SIZE-1] = INT4_MIN / 2;
1336 
1337  return(0);
1338 
1339 }
1340 
1341 
1342 
1343 /** Read in the matrix from the FILE *fp.
1344  * This function ASSUMES that the matrices are in the ncbistdaa
1345  * @param sbp the BlastScoreBlk with the matrix to be populated [in|out]
1346  * @param fp the file pointer to read from [in]
1347  * @return zero on success
1348 */
1349 
1350 static Int2
1352 {
1353  char buf[512+3];
1354  char temp[512];
1355  char* cp,* lp;
1356  char ch;
1357  Int4 ** matrix;
1358  Int4 * m;
1359  Int4 score;
1360  Uint4 a1cnt = 0, a2cnt = 0;
1361  char a1chars[BLASTAA_SIZE], a2chars[BLASTAA_SIZE];
1362  long lineno = 0;
1363  double xscore;
1364  register int index1, index2;
1365  int x_index, u_index, o_index, c_index;
1366  const char kCommentChar = '#';
1367  const char* kTokenStr = " \t\n\r";
1368 
1370  ASSERT(sbp->matrix);
1371  ASSERT(sbp->matrix->ncols == BLASTAA_SIZE);
1372  ASSERT(sbp->matrix->nrows == BLASTAA_SIZE);
1373 
1374  matrix = sbp->matrix->data;
1375 
1376  if (sbp->alphabet_code != BLASTNA_SEQ_CODE) {
1377  for (index1 = 0; index1 < sbp->alphabet_size; index1++)
1378  for (index2 = 0; index2 < sbp->alphabet_size; index2++)
1379  matrix[index1][index2] = BLAST_SCORE_MIN;
1380  }
1381 
1382  /* Read the residue names for the second alphabet */
1383  while (fgets(buf, sizeof(buf), fp) != NULL) {
1384  ++lineno;
1385  if (strchr(buf, '\n') == NULL) {
1386  return 2;
1387  }
1388 
1389  if (buf[0] == kCommentChar) {
1390  /* save the comment line in a linked list */
1391  *strchr(buf, '\n') = NULLB;
1392  ListNodeCopyStr(&sbp->comments, 0, buf+1);
1393  continue;
1394  }
1395  if ((cp = strchr(buf, kCommentChar)) != NULL)
1396  *cp = NULLB;
1397  lp = (char*)strtok(buf, kTokenStr);
1398  if (lp == NULL) /* skip blank lines */
1399  continue;
1400  while (lp != NULL) {
1401  if (sbp->alphabet_code == BLASTAA_SEQ_CODE)
1402  ch = AMINOACID_TO_NCBISTDAA[toupper((unsigned char)(*lp))];
1403  else if (sbp->alphabet_code == BLASTNA_SEQ_CODE) {
1404  ch = IUPACNA_TO_BLASTNA[toupper((unsigned char)(*lp))];
1405  } else {
1406  ch = *lp;
1407  }
1408  a2chars[a2cnt++] = ch;
1409  lp = (char*)strtok(NULL, kTokenStr);
1410  }
1411 
1412  break; /* Exit loop after reading one line. */
1413  }
1414 
1415  if (a2cnt <= 1) {
1416  return 2;
1417  }
1418 
1419  while (fgets(buf, sizeof(buf), fp) != NULL) {
1420  ++lineno;
1421  if ((cp = strchr(buf, '\n')) == NULL) {
1422  return 2;
1423  }
1424  if ((cp = strchr(buf, kCommentChar)) != NULL)
1425  *cp = NULLB;
1426  if ((lp = (char*)strtok(buf, kTokenStr)) == NULL)
1427  continue;
1428  ch = *lp;
1429  if ((cp = strtok(NULL, kTokenStr)) == NULL) {
1430  return 2;
1431  }
1432  if (a1cnt >= DIM(a1chars)) {
1433  return 2;
1434  }
1435 
1436  if (sbp->alphabet_code == BLASTAA_SEQ_CODE) {
1437  ch = AMINOACID_TO_NCBISTDAA[toupper((unsigned char) ch)];
1438  } else {
1439  if (sbp->alphabet_code == BLASTNA_SEQ_CODE) {
1440  ch = IUPACNA_TO_BLASTNA[toupper((unsigned char) ch)];
1441  }
1442  }
1443  a1chars[a1cnt++] = ch;
1444  m = &matrix[(int)ch][0];
1445  index2 = 0;
1446  while (cp != NULL) {
1447  if (index2 >= (int) a2cnt) {
1448  return 2;
1449  }
1450  strcpy(temp, cp);
1451 
1452  if (strcasecmp(temp, "na") == 0) {
1453  score = BLAST_SCORE_MIN;
1454  } else {
1455  if (sscanf(temp, "%lg", &xscore) != 1) {
1456  return 2;
1457  }
1458  /*xscore = MAX(xscore, BLAST_SCORE_1MIN);*/
1459  if (xscore > BLAST_SCORE_MAX || xscore < BLAST_SCORE_MIN) {
1460  return 2;
1461  }
1462  xscore += (xscore >= 0. ? 0.5 : -0.5);
1463  score = (Int4)xscore;
1464  }
1465 
1466  m[(int)a2chars[index2++]] = score;
1467 
1468  cp = strtok(NULL, kTokenStr);
1469  }
1470  }
1471 
1472  if (a1cnt <= 1) {
1473  return 2;
1474  }
1475 
1476  /* Use the C scores for U and X scores for O characters;
1477  if this is not done then they will never align to non-gap residues */
1478  x_index = AMINOACID_TO_NCBISTDAA['X'];
1479  u_index = AMINOACID_TO_NCBISTDAA['U'];
1480  o_index = AMINOACID_TO_NCBISTDAA['O'];
1481  c_index = AMINOACID_TO_NCBISTDAA['C'];
1482  for (index1 = 0; index1 < sbp->alphabet_size; index1++) {
1483  matrix[u_index][index1] = matrix[c_index][index1];
1484  matrix[index1][u_index] = matrix[index1][c_index];
1485  matrix[o_index][index1] = matrix[x_index][index1];
1486  matrix[index1][o_index] = matrix[index1][x_index];
1487  }
1488 
1489  return 0;
1490 }
1491 
1492 /** Sets maximum and minimum scores on the BlastScoreBlk for a
1493  * given matrix
1494  * @param sbp the BlastScoreBlk on which loscore and hiscore
1495  * will be set [in|out]
1496  * @return zero on success
1497  */
1498 
1499 static Int2
1501 {
1502  Int4 score;
1503  Int4 ** matrix;
1504  Int2 index1, index2;
1505 
1506  sbp->loscore = BLAST_SCORE_MAX;
1507  sbp->hiscore = BLAST_SCORE_MIN;
1508  matrix = sbp->matrix->data;
1509  for (index1=0; index1<sbp->alphabet_size; index1++)
1510  {
1511  for (index2=0; index2<sbp->alphabet_size; index2++)
1512  {
1513  score = matrix[index1][index2];
1514  if (score <= BLAST_SCORE_MIN || score >= BLAST_SCORE_MAX)
1515  continue;
1516  if (sbp->loscore > score)
1517  sbp->loscore = score;
1518  if (sbp->hiscore < score)
1519  sbp->hiscore = score;
1520  }
1521  }
1522  /* If the lo/hi-scores are BLAST_SCORE_MIN/BLAST_SCORE_MAX, (i.e., for
1523  gaps), then use other scores. */
1524 
1525  if (sbp->loscore < BLAST_SCORE_MIN)
1526  sbp->loscore = BLAST_SCORE_MIN;
1527  if (sbp->hiscore > BLAST_SCORE_MAX)
1528  sbp->hiscore = BLAST_SCORE_MAX;
1529 
1530  return 0;
1531 }
1532 
1533 /** Sets sbp->matrix->data field using sbp->name field using
1534  * the matrices in the toolkit (util/tables/raw_scoremat.h).
1535  * @param sbp the object containing matrix and name [in|out]
1536  * @return 0 on success, 1 if matrix could not be loaded
1537  */
1538 static Int2
1540 {
1541  Int2 status = 0;
1542  Int4** matrix = NULL;
1543  int i, j; /* loop indices */
1544  int x_index, u_index, o_index, c_index;
1545  const SNCBIPackedScoreMatrix* psm;
1546 
1547  ASSERT(sbp);
1548  psm = NCBISM_GetStandardMatrix(sbp->name);
1549  if (psm == NULL)
1550  return 1;
1551 
1553  ASSERT(sbp->matrix);
1554  ASSERT(sbp->matrix->ncols == BLASTAA_SIZE);
1555  ASSERT(sbp->matrix->nrows == BLASTAA_SIZE);
1556 
1557  matrix = sbp->matrix->data;
1558 
1559  /* Initialize with BLAST_SCORE_MIN */
1560  for (i = 0; i < sbp->alphabet_size; i++) {
1561  for (j = 0; j < sbp->alphabet_size; j++) {
1562  matrix[i][j] = BLAST_SCORE_MIN;
1563  }
1564  }
1565 
1566  for (i = 0; i < sbp->alphabet_size; i++) {
1567  for (j = 0; j < sbp->alphabet_size; j++) {
1568  /* skip special characters */
1569  if (i == AMINOACID_TO_NCBISTDAA['U'] ||
1570  i == AMINOACID_TO_NCBISTDAA['O'] ||
1571  i == AMINOACID_TO_NCBISTDAA['-'] ||
1572  j == AMINOACID_TO_NCBISTDAA['U'] ||
1573  j == AMINOACID_TO_NCBISTDAA['O'] ||
1574  j == AMINOACID_TO_NCBISTDAA['-']) {
1575  continue;
1576  }
1577  matrix[i][j] = NCBISM_GetScore((const SNCBIPackedScoreMatrix*) psm,
1578  i, j);
1579  }
1580  }
1581 
1582  /* Use the C scores for U and X scores for the O characters;
1583  if this is not done then they will never align to non-gap residues */
1584  x_index = AMINOACID_TO_NCBISTDAA['X'];
1585  u_index = AMINOACID_TO_NCBISTDAA['U'];
1586  o_index = AMINOACID_TO_NCBISTDAA['O'];
1587  c_index = AMINOACID_TO_NCBISTDAA['C'];
1588  for (i = 0; i < sbp->alphabet_size; i++) {
1589  matrix[u_index][i] = matrix[c_index][i];
1590  matrix[i][u_index] = matrix[i][c_index];
1591  matrix[o_index][i] = matrix[x_index][i];
1592  matrix[i][o_index] = matrix[i][x_index];
1593  }
1594 
1595  return status;
1596 }
1597 
1598 Int2
1600 {
1601  Boolean matrix_found = FALSE;
1602  Int2 status = 0;
1603 
1604  /* For nucleotide case we first create a default matrix, based on the
1605  match and mismatch scores. */
1606  if (sbp->alphabet_code == BLASTNA_SEQ_CODE) {
1607  /* RMBLASTN supports reading a custom matrix. Currently
1608  * I am using the sbp->read_in_matrix parameter to tell if
1609  * we should invoke the matrix reader.
1610  * -RMH-
1611  */
1612  if ( sbp->read_in_matrix && get_path )
1613  {
1614  // Read in custom rmblastn matrix
1615  matrix_found = FALSE;
1616  }
1617  else
1618  {
1619  if ( (status=BlastScoreBlkNuclMatrixCreate(sbp)) != 0)
1620  return status;
1621  matrix_found = TRUE;
1622  }
1623  }
1624  else
1625  { /* Try to get compiled in matrix for proteins. */
1626  status = BlastScoreBlkProteinMatrixLoad(sbp);
1627  if (status == 0)
1628  matrix_found = TRUE;
1629  }
1630 
1631  if (matrix_found == FALSE && sbp->read_in_matrix && get_path) {
1632  // -RMH- Changed to FALSE
1633  char* matrix_path = get_path(sbp->name, FALSE);
1634  if (matrix_path) {
1635 
1636  FILE *fp = NULL;
1637  char* full_matrix_path = NULL;
1638  size_t path_len = strlen(matrix_path);
1639  size_t buflen = path_len + strlen(sbp->name);
1640 
1641  full_matrix_path = (char*) malloc((buflen + 1) * sizeof(char));
1642  if (!full_matrix_path) {
1643  return -1;
1644  }
1645  strncpy(full_matrix_path, matrix_path, path_len);
1646  strncpy(full_matrix_path + path_len, sbp->name, buflen - path_len);
1647  full_matrix_path[buflen] = '\0';
1648 
1649  sfree(matrix_path);
1650 
1651  if ( (fp=fopen(full_matrix_path, "r")) == NULL) {
1652  return -1;
1653  }
1654  sfree(full_matrix_path);
1655 
1656  if (sbp->alphabet_code == BLASTNA_SEQ_CODE) {
1657  /* nucleotide blast ( rmblastn ) which can utilize a
1658  * a custom matrix -RMH-
1659  */
1660  if ((status=BlastScoreBlkNucleotideMatrixRead(sbp, fp)) != 0)
1661  {
1662  fclose(fp);
1663  return status;
1664  }
1665  }else {
1666  /* protein blast */
1667  if ( (status=BlastScoreBlkProteinMatrixRead(sbp, fp)) != 0)
1668  {
1669  fclose(fp);
1670  return status;
1671  }
1672  }
1673 
1674  fclose(fp);
1675  matrix_found = TRUE;
1676  }
1677  }
1678 
1679  if (matrix_found == FALSE)
1680  return -1;
1681 
1682  if ( (status=BlastScoreBlkMaxScoreSet(sbp)) != 0)
1683  return status;
1684 
1685  return status;
1686 }
1687 
1690 {
1691  if (rfp == NULL)
1692  return NULL;
1693 
1694  if (rfp->prob0 != NULL)
1695  sfree(rfp->prob0);
1696 
1697  sfree(rfp);
1698 
1699  return rfp;
1700 }
1701 
1702 /*
1703  Allocates the Blast_ResFreq* and then fills in the frequencies
1704  in the probabilities.
1705 */
1706 
1709 {
1710  Blast_ResFreq* rfp;
1711 
1712  if (sbp == NULL)
1713  {
1714  return NULL;
1715  }
1716 
1717  rfp = (Blast_ResFreq*) calloc(1, sizeof(Blast_ResFreq));
1718  if (rfp == NULL)
1719  return NULL;
1720 
1721  rfp->alphabet_code = sbp->alphabet_code;
1722 
1723  rfp->prob0 = (double*) calloc(sbp->alphabet_size, sizeof(double));
1724  if (rfp->prob0 == NULL)
1725  {
1726  rfp = Blast_ResFreqFree(rfp);
1727  return rfp;
1728  }
1729  rfp->prob = rfp->prob0 - sbp->alphabet_start;
1730 
1731  return rfp;
1732 }
1733 
1734 /** Records probability of letter appearing in sequence.
1735 */
1736 typedef struct BLAST_LetterProb {
1737  char ch; /**< residue */
1738  double p; /**< probability of residue. */
1740 
1741 #if 0
1742 
1743 /* Unused for right now, but do not remove */
1744 
1745 /* M. O. Dayhoff amino acid background frequencies */
1746 static BLAST_LetterProb Dayhoff_prob[] = {
1747  { 'A', 87.13 },
1748  { 'C', 33.47 },
1749  { 'D', 46.87 },
1750  { 'E', 49.53 },
1751  { 'F', 39.77 },
1752  { 'G', 88.61 },
1753  { 'H', 33.62 },
1754  { 'I', 36.89 },
1755  { 'K', 80.48 },
1756  { 'L', 85.36 },
1757  { 'M', 14.75 },
1758  { 'N', 40.43 },
1759  { 'P', 50.68 },
1760  { 'Q', 38.26 },
1761  { 'R', 40.90 },
1762  { 'S', 69.58 },
1763  { 'T', 58.54 },
1764  { 'V', 64.72 },
1765  { 'W', 10.49 },
1766  { 'Y', 29.92 }
1767  };
1768 
1769 /* Stephen Altschul amino acid background frequencies */
1770 static BLAST_LetterProb Altschul_prob[] = {
1771  { 'A', 81.00 },
1772  { 'C', 15.00 },
1773  { 'D', 54.00 },
1774  { 'E', 61.00 },
1775  { 'F', 40.00 },
1776  { 'G', 68.00 },
1777  { 'H', 22.00 },
1778  { 'I', 57.00 },
1779  { 'K', 56.00 },
1780  { 'L', 93.00 },
1781  { 'M', 25.00 },
1782  { 'N', 45.00 },
1783  { 'P', 49.00 },
1784  { 'Q', 39.00 },
1785  { 'R', 57.00 },
1786  { 'S', 68.00 },
1787  { 'T', 58.00 },
1788  { 'V', 67.00 },
1789  { 'W', 13.00 },
1790  { 'Y', 32.00 }
1791  };
1792 #endif
1793 
1794 /* amino acid background frequencies from Robinson and Robinson */
1796  { 'A', 78.05 },
1797  { 'C', 19.25 },
1798  { 'D', 53.64 },
1799  { 'E', 62.95 },
1800  { 'F', 38.56 },
1801  { 'G', 73.77 },
1802  { 'H', 21.99 },
1803  { 'I', 51.42 },
1804  { 'K', 57.44 },
1805  { 'L', 90.19 },
1806  { 'M', 22.43 },
1807  { 'N', 44.87 },
1808  { 'P', 52.03 },
1809  { 'Q', 42.64 },
1810  { 'R', 51.29 },
1811  { 'S', 71.20 },
1812  { 'T', 58.41 },
1813  { 'V', 64.41 },
1814  { 'W', 13.30 },
1815  { 'Y', 32.16 }
1816  }; /**< amino acid background frequencies from Robinson and Robinson */
1817 
1818 #define STD_AMINO_ACID_FREQS Robinson_prob /**< points to the standard amino acid frequencies to use. */
1819 
1821  { 'A', 25.00 },
1822  { 'C', 25.00 },
1823  { 'G', 25.00 },
1824  { 'T', 25.00 }
1825  }; /**< nucleotide probabilities (25% each letter) */
1826 
1827 /** Normalizes all the residue frequencies and then normalizes them to "norm".
1828  * If "norm" is one, then they will all sum to one.
1829  * @param sbp needed for alphabet information [in]
1830  * @param rfp array of residue frequencies to be normalized [in|out]
1831  * @param norm value to normalize to [in]
1832  * @return zero on success, 1 otherwise
1833 */
1834 static Int2
1836 {
1837  Int2 alphabet_stop, index;
1838  double sum = 0., p;
1839 
1840  if (norm == 0.)
1841  return 1;
1842 
1843  alphabet_stop = sbp->alphabet_start + sbp->alphabet_size;
1844  for (index=sbp->alphabet_start; index<alphabet_stop; index++)
1845  {
1846  p = rfp->prob[index];
1847  if (p < 0.)
1848  return 1;
1849  sum += p;
1850  }
1851  if (sum <= 0.)
1852  return 0;
1853 
1854  for (index=sbp->alphabet_start; index<alphabet_stop; index++)
1855  {
1856  rfp->prob[index] /= sum;
1857  rfp->prob[index] *= norm;
1858  }
1859  return 0;
1860 }
1861 
1862 Int2
1863 Blast_GetStdAlphabet(Uint1 alphabet_code, Uint1* residues, Uint4 residues_size)
1864 {
1865  Int2 index;
1866 
1867  if (residues_size < DIM(STD_AMINO_ACID_FREQS))
1868  return -2;
1869 
1870  for (index=0; index<(int)DIM(STD_AMINO_ACID_FREQS); index++)
1871  {
1872  if (alphabet_code == BLASTAA_SEQ_CODE)
1873  {
1874  residues[index] =
1875  AMINOACID_TO_NCBISTDAA[toupper((unsigned char) STD_AMINO_ACID_FREQS[index].ch)];
1876  }
1877  else
1878  {
1879  residues[index] = STD_AMINO_ACID_FREQS[index].ch;
1880  }
1881  }
1882 
1883  return index;
1884 }
1885 
1886 Int2
1888 {
1889  Uint4 index;
1890 
1891  if (sbp->protein_alphabet == TRUE)
1892  {
1893  Int2 retval;
1894  Uint1* residues = (Uint1*) calloc(DIM(STD_AMINO_ACID_FREQS), sizeof(Uint1));
1895  retval = Blast_GetStdAlphabet(sbp->alphabet_code, residues, DIM(STD_AMINO_ACID_FREQS));
1896  if (retval < 1)
1897  return retval;
1898 
1899  for (index=0; index<DIM(STD_AMINO_ACID_FREQS); index++)
1900  {
1901  rfp->prob[residues[index]] = STD_AMINO_ACID_FREQS[index].p;
1902  }
1903  sfree(residues);
1904  }
1905  else
1906  { /* beginning of blastna and ncbi2na are the same. */
1907  /* Only blastna used for nucleotides. */
1908  for (index=0; index<DIM(nt_prob); index++)
1909  {
1910  rfp->prob[index] = nt_prob[index].p;
1911  }
1912  }
1913 
1914  Blast_ResFreqNormalize(sbp, rfp, 1.0);
1915 
1916  return 0;
1917 }
1918 
1919 /**
1920 Intermediate structure to store the composition of a sequence
1921 */
1922 typedef struct Blast_ResComp {
1923  Uint1 alphabet_code; /**< indicates alphabet. */
1924  Int4* comp; /**< store composition of a string. */
1925  Int4* comp0; /**< Same array as above, starts at zero. */
1927 
1928 /** Deallocates Blast_ResComp structure and
1929  * associated arrays.
1930  * @param rcp the object to be freed [in|out]
1931  * @return NULL
1932  */
1933 static Blast_ResComp*
1935 {
1936  if (rcp == NULL)
1937  return NULL;
1938 
1939  if (rcp->comp0 != NULL)
1940  sfree(rcp->comp0);
1941 
1942  sfree(rcp);
1943  return NULL;
1944 }
1945 
1946 /** Allocated the Blast_ResComp* for a given alphabet. Only the
1947  * alphabets ncbistdaa and ncbi4na should be used by BLAST.
1948  * @param sbp contains alphabet code and size.
1949  * @return pointer to Blast_ResComp, corectly initialized.
1950 */
1951 static Blast_ResComp*
1953 {
1954  Blast_ResComp* rcp;
1955 
1956  rcp = (Blast_ResComp*) calloc(1, sizeof(Blast_ResComp));
1957  if (rcp == NULL)
1958  return NULL;
1959 
1960  rcp->alphabet_code = sbp->alphabet_code;
1961 
1962 /* comp0 has zero offset, comp starts at 0, only one
1963 array is allocated. */
1964  rcp->comp0 = (Int4*) calloc(sbp->alphabet_size, sizeof(Int4));
1965  if (rcp->comp0 == NULL)
1966  {
1967  rcp = BlastResCompDestruct(rcp);
1968  return rcp;
1969  }
1970 
1971  rcp->comp = rcp->comp0 - sbp->alphabet_start;
1972 
1973  return rcp;
1974 }
1975 
1976 /** Store the composition of a (query) string.
1977  * @param sbp needed for alphabet information [in]
1978  * @param rcp object to be filled in [in|out]
1979  * @param str sequence to have composition calculated for [in]
1980  * @param length length of sequence [in]
1981  * @return zero on success, 1 otherwise.
1982 */
1983 static Int2
1984 BlastResCompStr(const BlastScoreBlk* sbp, Blast_ResComp* rcp, char* str, Int4 length)
1985 {
1986  char* lp,* lpmax;
1987  Int2 index;
1988  Uint1 mask;
1989 
1990  if (sbp == NULL || rcp == NULL || str == NULL)
1991  return 1;
1992 
1993  if (rcp->alphabet_code != sbp->alphabet_code)
1994  return 1;
1995 
1996  /* For megablast, check only the first 4 bits of the sequence values */
1997  mask = (sbp->protein_alphabet ? 0xff : 0x0f);
1998 
1999 /* comp0 starts at zero and extends for "num", comp is the same array, but
2000 "start_at" offset. */
2001  for (index=0; index<(sbp->alphabet_size); index++)
2002  rcp->comp0[index] = 0;
2003 
2004  for (lp = str, lpmax = lp+length; lp < lpmax; lp++)
2005  {
2006  ++rcp->comp[(int)(*lp & mask)];
2007  }
2008 
2009  /* Don't count ambig. residues. */
2010  for (index=0; index<sbp->ambig_occupy; index++)
2011  {
2012  rcp->comp[sbp->ambiguous_res[index]] = 0;
2013  }
2014 
2015  return 0;
2016 }
2017 
2018 /** Sets prob elements of Blast_ResFreq to zero
2019  * @param sbp needed for alphabet information [in]
2020  * @param rfp contains elements to be zeroed [in|out]
2021  * @return zero on success.
2022  */
2023 static Int2
2025 {
2026  Int2 alphabet_max, index;
2027 
2028  alphabet_max = sbp->alphabet_start + sbp->alphabet_size;
2029  for (index=sbp->alphabet_start; index<alphabet_max; index++)
2030  rfp->prob[index] = 0.0;
2031 
2032  return 0;
2033 }
2034 
2035 /** Calculate the residue frequencies associated with the provided ResComp
2036  * This function takes into account the composition of a given sequence
2037  * (expressed through rcp) rather than just doing it for a standard distribution.
2038  * @param sbp contains alphabet information [in]
2039  * @param rfp object to be filled in [in|out]
2040  * @param rcp object with composition information [in]
2041  * @return zero on success, 1 on failure
2042 */
2043 static Int2
2045  const Blast_ResComp* rcp)
2046 {
2047  Int2 alphabet_max, index;
2048  double sum = 0.;
2049 
2050  if (rfp == NULL || rcp == NULL)
2051  return 1;
2052 
2053  if (rfp->alphabet_code != rcp->alphabet_code)
2054  return 1;
2055 
2056  alphabet_max = sbp->alphabet_start + sbp->alphabet_size;
2057  for (index=sbp->alphabet_start; index<alphabet_max; index++)
2058  sum += rcp->comp[index];
2059 
2060  if (sum == 0.) {
2061  Blast_ResFreqClr(sbp, rfp);
2062  return 0;
2063  }
2064 
2065  for (index=sbp->alphabet_start; index<alphabet_max; index++)
2066  rfp->prob[index] = rcp->comp[index] / sum;
2067 
2068  return 0;
2069 }
2070 
2071 /** Fills in residue frequences for a given sequence.
2072  * @param sbp needed for alphabet information [in]
2073  * @param rfp object to be populated [in|out]
2074  * @param string sequence for calculation [in]
2075  * @param length length of above sequence [in]
2076  */
2077 static Int2
2078 Blast_ResFreqString(const BlastScoreBlk* sbp, Blast_ResFreq* rfp, char* string, Int4 length)
2079 {
2080  Blast_ResComp* rcp;
2081 
2082  rcp = BlastResCompNew(sbp);
2083 
2084  BlastResCompStr(sbp, rcp, string, length);
2085 
2086  Blast_ResFreqResComp(sbp, rfp, rcp);
2087 
2088  rcp = BlastResCompDestruct(rcp);
2089 
2090  return 0;
2091 }
2092 
2093 /** Check that the lo and hi score are within the allowed ranges
2094  * @param lo the lowest permitted value [in]
2095  * @param hi the highest permitted value [in]
2096  * @return zero on success, 1 otherwise
2097  */
2098 
2099 static Int2
2101 {
2102  if (lo >= 0 || hi <= 0 ||
2103  lo < BLAST_SCORE_MIN || hi > BLAST_SCORE_MAX)
2104  return 1;
2105 
2106  if (hi - lo > BLAST_SCORE_RANGE_MAX)
2107  return 1;
2108 
2109  return 0;
2110 }
2111 
2113 Blast_ScoreFreqNew(Int4 score_min, Int4 score_max)
2114 {
2115  Blast_ScoreFreq* sfp;
2116  Int4 range;
2117 
2118  if (BlastScoreChk(score_min, score_max) != 0)
2119  return NULL;
2120 
2121  sfp = (Blast_ScoreFreq*) calloc(1, sizeof(Blast_ScoreFreq));
2122  if (sfp == NULL)
2123  return NULL;
2124 
2125  range = score_max - score_min + 1;
2126  sfp->sprob = (double*) calloc(range, sizeof(double));
2127  if (sfp->sprob == NULL)
2128  {
2129  Blast_ScoreFreqFree(sfp);
2130  return NULL;
2131  }
2132 
2133  sfp->sprob0 = sfp->sprob;
2134  sfp->sprob -= score_min; /* center around 0 */
2135  sfp->score_min = score_min;
2136  sfp->score_max = score_max;
2137  sfp->obs_min = sfp->obs_max = 0;
2138  sfp->score_avg = 0.0;
2139  return sfp;
2140 }
2141 
2142 /** Calculates the score frequencies.
2143  *
2144  * @param sbp object with scoring information [in]
2145  * @param sfp object to hold frequency information [in|out]
2146  * @param rfp1 letter frequencies for first sequence (query) [in]
2147  * @param rfp2 letter frequencies for second sequence (database) [in]
2148  * @return zero on success
2149  */
2150 static Int2
2152 {
2153  Int4 ** matrix;
2154  Int4 score, obs_min, obs_max;
2155  double score_sum, score_avg;
2156  Int2 alphabet_start, alphabet_end, index1, index2;
2157 
2158  if (sbp == NULL || sfp == NULL)
2159  return 1;
2160 
2161  if (sbp->loscore < sfp->score_min || sbp->hiscore > sfp->score_max)
2162  return 1;
2163 
2164  for (score = sfp->score_min; score <= sfp->score_max; score++)
2165  sfp->sprob[score] = 0.0;
2166 
2167  matrix = sbp->matrix->data;
2168 
2169  alphabet_start = sbp->alphabet_start;
2170  alphabet_end = alphabet_start + sbp->alphabet_size;
2171  for (index1=alphabet_start; index1<alphabet_end; index1++)
2172  {
2173  for (index2=alphabet_start; index2<alphabet_end; index2++)
2174  {
2175  score = matrix[index1][index2];
2176  if (score >= sbp->loscore)
2177  {
2178  sfp->sprob[score] += rfp1->prob[index1] * rfp2->prob[index2];
2179  }
2180  }
2181  }
2182 
2183  score_sum = 0.;
2184  obs_min = obs_max = BLAST_SCORE_MIN;
2185  for (score = sfp->score_min; score <= sfp->score_max; score++)
2186  {
2187  if (sfp->sprob[score] > 0.)
2188  {
2189  score_sum += sfp->sprob[score];
2190  obs_max = score;
2191  if (obs_min == BLAST_SCORE_MIN)
2192  obs_min = score;
2193  }
2194  }
2195  sfp->obs_min = obs_min;
2196  sfp->obs_max = obs_max;
2197 
2198  score_avg = 0.0;
2199  if (score_sum > 0.0001 || score_sum < -0.0001)
2200  {
2201  for (score = obs_min; score <= obs_max; score++)
2202  {
2203  sfp->sprob[score] /= score_sum;
2204  score_avg += score * sfp->sprob[score];
2205  }
2206  }
2207  sfp->score_avg = score_avg;
2208 
2209  return 0;
2210 }
2211 
2212 
2213 
2214 /** The following procedure computes K. The input includes Lambda, H,
2215  * and an array of probabilities for each score.
2216  * There are distinct closed form for three cases:
2217  * 1. high score is 1 low score is -1
2218  * 2. high score is 1 low score is not -1
2219  * 3. low score is -1, high score is not 1
2220  *
2221  * Otherwise, in most cases the value is computed as:
2222  * -exp(-2.0*outerSum) / ((H/lambda)*(exp(-lambda) - 1)
2223  * The last term (exp(-lambda) - 1) can be computed in two different
2224  * ways depending on whether lambda is small or not.
2225  * outerSum is a sum of the terms
2226  * innerSum/j, where j is denoted by iterCounter in the code.
2227  * The sum is truncated when the new term innersum/j i sufficiently small.
2228  * innerSum is a weighted sum of the probabilities of
2229  * of achieving a total score i in a gapless alignment,
2230  * which we denote by P(i,j).
2231  * of exactly j characters. innerSum(j) has two parts
2232  * Sum over i < 0 P(i,j)exp(-i * lambda) +
2233  * Sum over i >=0 P(i,j)
2234  * The terms P(i,j) are computed by dynamic programming.
2235  * An earlier version was flawed in that ignored the special case 1
2236  * and tried to replace the tail of the computation of outerSum
2237  * by a geometric series, but the base of the geometric series
2238  * was not accurately estimated in some cases.
2239  *
2240  * @param sfp object holding scoring frequency information [in]
2241  * @param lambda a Karlin-Altschul parameter [in]
2242  * @param H a Karlin-Altschul parameter [in]
2243  * @return K, another Karlin-Altschul parameter
2244  */
2245 
2246 static double
2248 {
2249  /*The next array stores the probabilities of getting each possible
2250  score in an alignment of fixed length; the array is shifted
2251  during part of the computation, so that
2252  entry 0 is for score 0. */
2253  double *alignmentScoreProbabilities = NULL;
2254  Int4 low; /* Lowest score (must be negative) */
2255  Int4 high; /* Highest score (must be positive) */
2256  Int4 range; /* range of scores, computed as high - low*/
2257  double K; /* local copy of K to return*/
2258  int i; /*loop index*/
2259  int iterCounter; /*counter on iterations*/
2260  Int4 divisor; /*candidate divisor of all scores with
2261  non-zero probabilities*/
2262  /*highest and lowest possible alignment scores for current length*/
2263  Int4 lowAlignmentScore, highAlignmentScore;
2264  Int4 first, last; /*loop indices for dynamic program*/
2265  register double innerSum;
2266  double oldsum, oldsum2; /* values of innerSum on previous
2267  iterations*/
2268  double outerSum; /* holds sum over j of (innerSum
2269  for iteration j/j)*/
2270 
2271  double score_avg; /*average score*/
2272  /*first term to use in the closed form for the case where
2273  high == 1 or low == -1, but not both*/
2274  double firstTermClosedForm; /*usually store H/lambda*/
2275  int iterlimit; /*upper limit on iterations*/
2276  double sumlimit; /*lower limit on contributions
2277  to sum over scores*/
2278 
2279  /*array of score probabilities reindexed so that low is at index 0*/
2280  double *probArrayStartLow;
2281 
2282  /*pointers used in dynamic program*/
2283  double *ptrP, *ptr1, *ptr2, *ptr1e;
2284  double expMinusLambda; /*e^^(-Lambda) */
2285 
2286  if (lambda <= 0. || H <= 0.) {
2287  /* Theory dictates that H and lambda must be positive, so
2288  * return -1 to indicate an error */
2289  return -1.;
2290  }
2291 
2292  /*Karlin-Altschul theory works only if the expected score
2293  is negative*/
2294  if (sfp->score_avg >= 0.0) {
2295  return -1.;
2296  }
2297 
2298  low = sfp->obs_min;
2299  high = sfp->obs_max;
2300  range = high - low;
2301 
2302  probArrayStartLow = &sfp->sprob[low];
2303  /* Look for the greatest common divisor ("delta" in Appendix of PNAS 87 of
2304  Karlin&Altschul (1990) */
2305  for (i = 1, divisor = -low; i <= range && divisor > 1; ++i) {
2306  if (probArrayStartLow[i] != 0.0)
2307  divisor = BLAST_Gcd(divisor, i);
2308  }
2309 
2310  high /= divisor;
2311  low /= divisor;
2312  lambda *= divisor;
2313 
2314  range = high - low;
2315 
2316  firstTermClosedForm = H/lambda;
2317  expMinusLambda = exp((double) -lambda);
2318 
2319  if (low == -1 && high == 1) {
2320  K = (sfp->sprob[low*divisor] - sfp->sprob[high*divisor]) *
2321  (sfp->sprob[low*divisor] - sfp->sprob[high*divisor]) / sfp->sprob[low*divisor];
2322  return(K);
2323  }
2324 
2325  if (low == -1 || high == 1) {
2326  if (high != 1) {
2327  score_avg = sfp->score_avg / divisor;
2328  firstTermClosedForm
2329  = (score_avg * score_avg) / firstTermClosedForm;
2330  }
2331  return firstTermClosedForm * (1.0 - expMinusLambda);
2332  }
2333 
2335  iterlimit = BLAST_KARLIN_K_ITER_MAX;
2336 
2337  alignmentScoreProbabilities =
2338  (double *)calloc((iterlimit*range + 1), sizeof(*alignmentScoreProbabilities));
2339  if (alignmentScoreProbabilities == NULL)
2340  return -1.;
2341 
2342  outerSum = 0.;
2343  lowAlignmentScore = highAlignmentScore = 0;
2344  alignmentScoreProbabilities[0] = innerSum = oldsum = oldsum2 = 1.;
2345 
2346  for (iterCounter = 0;
2347  ((iterCounter < iterlimit) && (innerSum > sumlimit));
2348  outerSum += innerSum /= ++iterCounter) {
2349  first = last = range;
2350  lowAlignmentScore += low;
2351  highAlignmentScore += high;
2352  /*dynamic program to compute P(i,j)*/
2353  for (ptrP = alignmentScoreProbabilities +
2354  (highAlignmentScore-lowAlignmentScore);
2355  ptrP >= alignmentScoreProbabilities;
2356  *ptrP-- =innerSum) {
2357  ptr1 = ptrP - first;
2358  ptr1e = ptrP - last;
2359  ptr2 = probArrayStartLow + first;
2360  for (innerSum = 0.; ptr1 >= ptr1e; ) {
2361  innerSum += *ptr1 * *ptr2;
2362  ptr1--;
2363  ptr2++;
2364  }
2365  if (first)
2366  --first;
2367  if (ptrP - alignmentScoreProbabilities <= range)
2368  --last;
2369  }
2370  /* Horner's rule */
2371  innerSum = *++ptrP;
2372  for( i = lowAlignmentScore + 1; i < 0; i++ ) {
2373  innerSum = *++ptrP + innerSum * expMinusLambda;
2374  }
2375  innerSum *= expMinusLambda;
2376 
2377  for (; i <= highAlignmentScore; ++i)
2378  innerSum += *++ptrP;
2379  oldsum2 = oldsum;
2380  oldsum = innerSum;
2381  }
2382 
2383 #ifdef ADD_GEOMETRIC_TERMS_TO_K
2384  /*old code assumed that the later terms in sum were
2385  asymptotically comparable to those of a geometric
2386  progression, and tried to speed up convergence by
2387  guessing the estimated ratio between sucessive terms
2388  and using the explicit terms of a geometric progression
2389  to speed up convergence. However, the assumption does not
2390  always hold, and convergenece of the above code is fast
2391  enough in practice*/
2392  /* Terms of geometric progression added for correction */
2393  {
2394  double ratio; /* fraction used to generate the
2395  geometric progression */
2396 
2397  ratio = oldsum / oldsum2;
2398  if (ratio >= (1.0 - sumlimit*0.001)) {
2399  K = -1.;
2400  if (alignmentScoreProbabilities != NULL)
2401  sfree(alignmentScoreProbabilities);
2402  return K;
2403  }
2404  sumlimit *= 0.01;
2405  while (innerSum > sumlimit) {
2406  oldsum *= ratio;
2407  outerSum += innerSum = oldsum / ++iterCounter;
2408  }
2409  }
2410 #endif
2411 
2412  K = -exp((double)-2.0*outerSum) /
2413  (firstTermClosedForm*BLAST_Expm1(-(double)lambda));
2414 
2415  if (alignmentScoreProbabilities != NULL)
2416  sfree(alignmentScoreProbabilities);
2417 
2418  return K;
2419 }
2420 
2421 
2422 /**
2423  * Find positive solution to
2424  *
2425  * sum_{i=low}^{high} exp(i lambda) * probs[i] = 1.
2426  *
2427  * Note that this solution does not exist unless the average score is
2428  * negative and the largest score that occurs with nonzero probability
2429  * is positive.
2430  *
2431  * @param probs probabilities of a score occurring
2432  * @param d the gcd of the possible scores. This equals 1 if
2433  * the scores are not a lattice
2434  * @param low the lowest possible score that occurs with
2435  * nonzero probability
2436  * @param high the highest possible score that occurs with
2437  * nonzero probability.
2438  * @param lambda0 an initial guess for lambda
2439  * @param tolx the tolerance to which lambda must be computed
2440  * @param itmax the maximum number of times the function may be
2441  * evaluated
2442  * @param maxNewton the maximum permissible number of Newton
2443  * iterations; after that the computation will proceed
2444  * by bisection.
2445  * @param *itn the number of iterations needed to compute Lambda,
2446  * or itmax if Lambda could not be computed.
2447  *
2448  * Let phi(lambda) = sum_{i=low}^{high} exp(i lambda) - 1. Then
2449  * phi(lambda) may be written
2450  *
2451  * phi(lambda) = exp(u lambda) f( exp(-lambda) )
2452  *
2453  * where f(x) is a polynomial that has exactly two zeros, one at x = 1
2454  * and one at x = exp(-lambda). It is simpler to solve this problem
2455  * in x = exp(-lambda) than it is to solve it in lambda, because we
2456  * know that for x, a solution lies in [0,1], and because Newton's
2457  * method is generally more stable and efficient for polynomials than
2458  * it is for exponentials.
2459  *
2460  * For the most part, this function is a standard safeguarded Newton
2461  * iteration: define an interval of uncertainty [a,b] with f(a) > 0
2462  * and f(b) < 0 (except for the initial value b = 1, where f(b) = 0);
2463  * evaluate the function and use the sign of that value to shrink the
2464  * interval of uncertainty; compute a Newton step; and if the Newton
2465  * step suggests a point outside the interval of uncertainty or fails
2466  * to decrease the function sufficiently, then bisect. There are
2467  * three further details needed to understand the algorithm:
2468  *
2469  * 1) If y the unique solution in [0,1], then f is positive to the left of
2470  * y, and negative to the right. Therefore, we may determine whether
2471  * the Newton step -f(x)/f'(x) is moving toward, or away from, y by
2472  * examining the sign of f'(x). If f'(x) >= 0, we bisect instead
2473  * of taking the Newton step.
2474  * 2) There is a neighborhood around x = 1 for which f'(x) >= 0, so
2475  * (1) prevents convergence to x = 1 (and for a similar reason
2476  * prevents convergence to x = 0, if the function is incorrectly
2477  * called with probs[high] == 0).
2478  * 3) Conditions like fabs(p) < lambda_tolerance * x * (1-x) are used in
2479  * convergence criteria because these values translate to a bound
2480  * on the relative error in lambda. This is proved in the
2481  * "Blast Scoring Parameters" document that accompanies the BLAST
2482  * code.
2483  *
2484  * The iteration on f(x) is robust and doesn't overflow; defining a
2485  * robust safeguarded Newton iteration on phi(lambda) that cannot
2486  * converge to lambda = 0 and that is protected against overflow is
2487  * more difficult. So (despite the length of this comment) the Newton
2488  * iteration on f(x) is the simpler solution.
2489  */
2490 static double
2491 NlmKarlinLambdaNR(double* probs, Int4 d, Int4 low, Int4 high, double lambda0,
2492  double tolx, Int4 itmax, Int4 maxNewton, Int4 * itn )
2493 {
2494  Int4 k;
2495  double x0, x, a = 0, b = 1;
2496  double f = 4; /* Larger than any possible value of the poly in [0,1] */
2497  Int4 isNewton = 0; /* we haven't yet taken a Newton step. */
2498 
2499  assert( d > 0 );
2500 
2501  x0 = exp( -lambda0 );
2502  x = ( 0 < x0 && x0 < 1 ) ? x0 : .5;
2503 
2504  for( k = 0; k < itmax; k++ ) { /* all iteration indices k */
2505  Int4 i;
2506  double g, fold = f;
2507  Int4 wasNewton = isNewton; /* If true, then the previous step was a */
2508  /* Newton step */
2509  isNewton = 0; /* Assume that this step is not */
2510 
2511  /* Horner's rule for evaluating a polynomial and its derivative */
2512  g = 0;
2513  f = probs[low];
2514  for( i = low + d; i < 0; i += d ) {
2515  g = x * g + f;
2516  f = f * x + probs[i];
2517  }
2518  g = x * g + f;
2519  f = f * x + probs[0] - 1;
2520  for( i = d; i <= high; i += d ) {
2521  g = x * g + f;
2522  f = f * x + probs[i];
2523  }
2524  /* End Horner's rule */
2525 
2526  if( f > 0 ) {
2527  a = x; /* move the left endpoint */
2528  } else if( f < 0 ) {
2529  b = x; /* move the right endpoint */
2530  } else { /* f == 0 */
2531  break; /* x is an exact solution */
2532  }
2533  if( b - a < 2 * a * ( 1 - b ) * tolx ) {
2534  /* The midpoint of the interval converged */
2535  x = (a + b) / 2; break;
2536  }
2537 
2538  if( k >= maxNewton ||
2539  /* If convergence of Newton's method appears to be failing; or */
2540  ( wasNewton && fabs( f ) > .9 * fabs(fold) ) ||
2541  /* if the previous iteration was a Newton step but didn't decrease
2542  * f sufficiently; or */
2543  g >= 0
2544  /* if a Newton step will move us away from the desired solution */
2545  ) { /* then */
2546  /* bisect */
2547  x = (a + b)/2;
2548  } else {
2549  /* try a Newton step */
2550  double p = - f/g;
2551  double y = x + p;
2552  if( y <= a || y >= b ) { /* The proposed iterate is not in (a,b) */
2553  x = (a + b)/2;
2554  } else { /* The proposed iterate is in (a,b). Accept it. */
2555  isNewton = 1;
2556  x = y;
2557  if( fabs( p ) < tolx * x * (1-x) ) break; /* Converged */
2558  } /* else the proposed iterate is in (a,b) */
2559  } /* else try a Newton step. */
2560  } /* end for all iteration indices k */
2561  *itn = k;
2562  return -log(x)/d;
2563 }
2564 
2565 
2566 double
2567 Blast_KarlinLambdaNR(Blast_ScoreFreq* sfp, double initialLambdaGuess)
2568 {
2569  Int4 low; /* Lowest score (must be negative) */
2570  Int4 high; /* Highest score (must be positive) */
2571  Int4 itn;
2572  Int4 i, d;
2573  double* sprob;
2574  double returnValue;
2575 
2576  low = sfp->obs_min;
2577  high = sfp->obs_max;
2578  if (sfp->score_avg >= 0.) { /* Expected score must be negative */
2579  return -1.0;
2580  }
2581  if (BlastScoreChk(low, high) != 0) return -1.;
2582 
2583  sprob = sfp->sprob;
2584  /* Find greatest common divisor of all scores */
2585  for (i = 1, d = -low; i <= high-low && d > 1; ++i) {
2586  if (sprob[i+low] != 0.0) {
2587  d = BLAST_Gcd(d, i);
2588  }
2589  }
2590  returnValue =
2591  NlmKarlinLambdaNR( sprob, d, low, high,
2592  initialLambdaGuess,
2594  20, 20 + BLAST_KARLIN_LAMBDA_ITER_DEFAULT, &itn );
2595 
2596 
2597  return returnValue;
2598 }
2599 
2600 /** Calculate H, the relative entropy of the p's and q's
2601  *
2602  * @param sfp object containing scoring frequency information [in]
2603  * @param lambda a Karlin-Altschul parameter [in]
2604  * @return H, a Karlin-Altschul parameter
2605  */
2606 static double
2608 {
2609  Int4 score;
2610  double H, etonlam, sum, scale;
2611 
2612  double *probs = sfp->sprob;
2613  Int4 low = sfp->obs_min, high = sfp->obs_max;
2614 
2615  if (lambda < 0.) {
2616  return -1.;
2617  }
2618  if (BlastScoreChk(low, high) != 0) return -1.;
2619 
2620  etonlam = exp( - lambda );
2621  sum = low * probs[low];
2622  for( score = low + 1; score <= high; score++ ) {
2623  sum = score * probs[score] + etonlam * sum;
2624  }
2625 
2626  scale = BLAST_Powi( etonlam, high );
2627  if( scale > 0.0 ) {
2628  H = lambda * sum/scale;
2629  } else { /* Underflow of exp( -lambda * high ) */
2630  H = lambda * exp( lambda * high + log(sum) );
2631  }
2632  return H;
2633 }
2634 
2635 /**************** Statistical Significance Parameter Subroutine ****************
2636 
2637  Version 1.0 February 2, 1990
2638  Version 1.2 July 6, 1990
2639 
2640  Program by: Stephen Altschul
2641 
2642  Address: National Center for Biotechnology Information
2643  National Library of Medicine
2644  National Institutes of Health
2645  Bethesda, MD 20894
2646 
2647  Internet: altschul@ncbi.nlm.nih.gov
2648 
2649 See: Karlin, S. & Altschul, S.F. "Methods for Assessing the Statistical
2650  Significance of Molecular Sequence Features by Using General Scoring
2651  Schemes," Proc. Natl. Acad. Sci. USA 87 (1990), 2264-2268.
2652 
2653  Computes the parameters lambda and K for use in calculating the
2654  statistical significance of high-scoring segments or subalignments.
2655 
2656  The scoring scheme must be integer valued. A positive score must be
2657  possible, but the expected (mean) score must be negative.
2658 
2659  A program that calls this routine must provide the value of the lowest
2660  possible score, the value of the greatest possible score, and a pointer
2661  to an array of probabilities for the occurrence of all scores between
2662  these two extreme scores. For example, if score -2 occurs with
2663  probability 0.7, score 0 occurs with probability 0.1, and score 3
2664  occurs with probability 0.2, then the subroutine must be called with
2665  low = -2, high = 3, and pr pointing to the array of values
2666  { 0.7, 0.0, 0.1, 0.0, 0.0, 0.2 }. The calling program must also provide
2667  pointers to lambda and K; the subroutine will then calculate the values
2668  of these two parameters. In this example, lambda=0.330 and K=0.154.
2669 
2670  The parameters lambda and K can be used as follows. Suppose we are
2671  given a length N random sequence of independent letters. Associated
2672  with each letter is a score, and the probabilities of the letters
2673  determine the probability for each score. Let S be the aggregate score
2674  of the highest scoring contiguous segment of this sequence. Then if N
2675  is sufficiently large (greater than 100), the following bound on the
2676  probability that S is greater than or equal to x applies:
2677 
2678  P( S >= x ) <= 1 - exp [ - KN exp ( - lambda * x ) ].
2679 
2680  In other words, the p-value for this segment can be written as
2681  1-exp[-KN*exp(-lambda*S)].
2682 
2683  This formula can be applied to pairwise sequence comparison by assigning
2684  scores to pairs of letters (e.g. amino acids), and by replacing N in the
2685  formula with N*M, where N and M are the lengths of the two sequences
2686  being compared.
2687 
2688  In addition, letting y = KN*exp(-lambda*S), the p-value for finding m
2689  distinct segments all with score >= S is given by:
2690 
2691  2 m-1 -y
2692  1 - [ 1 + y + y /2! + ... + y /(m-1)! ] e
2693 
2694  Notice that for m=1 this formula reduces to 1-exp(-y), which is the same
2695  as the previous formula.
2696 
2697 *******************************************************************************/
2698 Int2
2700 {
2701 
2702 
2703  if (kbp == NULL || sfp == NULL)
2704  return 1;
2705 
2706  /* Calculate the parameter Lambda */
2707 
2709  if (kbp->Lambda < 0.)
2710  goto ErrExit;
2711 
2712 
2713  /* Calculate H */
2714 
2715  kbp->H = BlastKarlinLtoH(sfp, kbp->Lambda);
2716  if (kbp->H < 0.)
2717  goto ErrExit;
2718 
2719 
2720  /* Calculate K and log(K) */
2721 
2722  kbp->K = BlastKarlinLHtoK(sfp, kbp->Lambda, kbp->H);
2723  if (kbp->K < 0.)
2724  goto ErrExit;
2725  kbp->logK = log(kbp->K);
2726 
2727  /* Normal return */
2728  return 0;
2729 
2730 ErrExit:
2731  kbp->Lambda = kbp->H = kbp->K = -1.;
2732  kbp->logK = HUGE_VAL;
2733  return 1;
2734 }
2735 
2736 Int2
2738  BlastScoreBlk* sbp, Uint1* query,
2739  const BlastQueryInfo* query_info,
2740  Blast_Message* *blast_message)
2741 {
2742  Int2 status = 0;
2743  Int4 context; /* loop variable. */
2744  Blast_ResFreq* rfp,* stdrfp;
2745  BlastContextInfo* contexts = query_info->contexts;
2746  Boolean check_ideal =
2747  (program == eBlastTypeBlastx || program == eBlastTypeTblastx ||
2748  program == eBlastTypeRpsTblastn);
2749  Boolean valid_context = FALSE;
2750 
2751  ASSERT(contexts);
2752 
2753  /* Ideal Karlin block is filled unconditionally. */
2754  status = Blast_ScoreBlkKbpIdealCalc(sbp);
2755  if (status)
2756  return status;
2757 
2758  stdrfp = Blast_ResFreqNew(sbp);
2759  Blast_ResFreqStdComp(sbp, stdrfp);
2760  rfp = Blast_ResFreqNew(sbp);
2761 
2762  for (context = query_info->first_context;
2763  context <= query_info->last_context; ++context) {
2764 
2765  Int4 context_offset;
2766  Int4 query_length;
2767  Uint1 *buffer; /* holds sequence */
2768  Blast_KarlinBlk* kbp;
2769  Int2 loop_status; /* status flag for functions in this loop. */
2770 
2771  if ( !contexts[context].is_valid )
2772  continue;
2773 
2774  query_length = contexts[context].query_length;
2775  context_offset = contexts[context].query_offset;
2776  buffer = &query[context_offset];
2777 
2778  Blast_ResFreqString(sbp, rfp, (char*)buffer, query_length);
2779  sbp->sfp[context] = Blast_ScoreFreqNew(sbp->loscore, sbp->hiscore);
2780  BlastScoreFreqCalc(sbp, sbp->sfp[context], rfp, stdrfp);
2781  sbp->kbp_std[context] = kbp = Blast_KarlinBlkNew();
2782  loop_status = Blast_KarlinBlkUngappedCalc(kbp, sbp->sfp[context]);
2783  if (loop_status) {
2784  contexts[context].is_valid = FALSE;
2785  sbp->sfp[context] = Blast_ScoreFreqFree(sbp->sfp[context]);
2787  if (!Blast_QueryIsTranslated(program) ) {
2790  }
2791  continue;
2792  }
2793  /* For searches with translated queries, check whether ideal values
2794  should be substituted instead of calculated values, so a more
2795  conservative (smaller) Lambda is used. */
2796  if (check_ideal && kbp->Lambda >= sbp->kbp_ideal->Lambda)
2797  Blast_KarlinBlkCopy(kbp, sbp->kbp_ideal);
2798 
2800  loop_status = Blast_KarlinBlkUngappedCalc(sbp->kbp_psi[context],
2801  sbp->sfp[context]);
2802  if (loop_status) {
2803  contexts[context].is_valid = FALSE;
2804  sbp->sfp[context] = Blast_ScoreFreqFree(sbp->sfp[context]);
2807  continue;
2808  }
2809  valid_context = TRUE;
2810  }
2811 
2812  rfp = Blast_ResFreqFree(rfp);
2813  stdrfp = Blast_ResFreqFree(stdrfp);
2814 
2815  if (valid_context == FALSE)
2816  { /* No valid contexts were found. */
2817  /* Message for non-translated search issued above. */
2818  if (Blast_QueryIsTranslated(program) ) {
2821  }
2822  status = 1; /* Not a single context was valid. */
2823  }
2824 
2825  /* Set ungapped Blast_KarlinBlk* alias */
2826  sbp->kbp = Blast_QueryIsPssm(program) ? sbp->kbp_psi : sbp->kbp_std;
2827 
2828  return status;
2829 }
2830 
2831 Int2
2833 
2834 {
2835  Blast_ResFreq* stdrfp = NULL;
2836  Blast_ScoreFreq* sfp = NULL;
2837  Int2 status = 0;
2838 
2839  if ( !sbp ) {
2840  return (status = 1);
2841  }
2842 
2843  stdrfp = Blast_ResFreqNew(sbp);
2844  Blast_ResFreqStdComp(sbp, stdrfp);
2845  sfp = Blast_ScoreFreqNew(sbp->loscore, sbp->hiscore);
2846  BlastScoreFreqCalc(sbp, sfp, stdrfp, stdrfp);
2847  sbp->kbp_ideal = Blast_KarlinBlkNew();
2849 
2850  stdrfp = Blast_ResFreqFree(stdrfp);
2851  sfp = Blast_ScoreFreqFree(sfp);
2852 
2853  return status;
2854 }
2855 
2856 /*
2857  Creates the Karlin Block.
2858 */
2859 
2862 
2863 {
2864  Blast_KarlinBlk* kbp;
2865 
2866  kbp = (Blast_KarlinBlk*) calloc(1, sizeof(Blast_KarlinBlk));
2867 
2868  return kbp;
2869 }
2870 
2872 {
2873  if (!kbp_to || !kbp_from)
2874  return -1;
2875 
2876  kbp_to->Lambda = kbp_from->Lambda;
2877  kbp_to->K = kbp_from->K;
2878  kbp_to->logK = kbp_from->logK;
2879  kbp_to->H = kbp_from->H;
2880  kbp_to->paramC = kbp_from->paramC;
2881  return 0;
2882 }
2883 
2884 /** Deallocates MatrixInfo as well as name string.
2885  * @param matrix_info the object to be deallocated [in]
2886  * @return NULL pointer
2887  */
2888 
2889 static MatrixInfo*
2891 
2892 {
2893  if (matrix_info == NULL)
2894  return NULL;
2895 
2896  sfree(matrix_info->name);
2897  sfree(matrix_info);
2898  return NULL;
2899 }
2900 
2901 /** Allocates New MatrixInfo*
2902  * @param name name of matrix [in]
2903  * @param values array contains information about a matrix [in]
2904  * @param prefs contains information on a which values are preferred [in]
2905  * @param max_number size of those arrays [in]
2906  * @return pointer to the allocated MatrixInfo
2907  */
2908 
2909 static MatrixInfo*
2910 MatrixInfoNew(const char* name, array_of_8 *values, Int4* prefs, Int4 max_number)
2911 
2912 {
2913  MatrixInfo* matrix_info;
2914 
2915  matrix_info = (MatrixInfo*) calloc(1, sizeof(MatrixInfo));
2916  matrix_info->name = strdup(name);
2917  matrix_info->values = values;
2918  matrix_info->prefs = prefs;
2919  matrix_info->max_number_values = max_number;
2920 
2921  return matrix_info;
2922 }
2923 
2924 /** Free linked list of MatrixValues and all associated data
2925  * @param vnp linked list of MatrixValues [in]
2926  * @return NULL pointer
2927  */
2928 static ListNode*
2930 
2931 {
2932  ListNode* head;
2933 
2934  head = vnp;
2935  while (vnp)
2936  {
2938  vnp = vnp->next;
2939  }
2940 
2941  head = ListNodeFree(head);
2942 
2943  return head;
2944 }
2945 
2946 /** Loads all the matrix values, returns a ListNode* chain that contains
2947  * MatrixInfo*'s.
2948  * @return list of MatrixInfos.
2949  *
2950 */
2951 static ListNode*
2953 
2954 {
2955  MatrixInfo* matrix_info;
2956  ListNode* retval=NULL;
2957 
2958  matrix_info = MatrixInfoNew("BLOSUM80", blosum80_values, blosum80_prefs, BLOSUM80_VALUES_MAX);
2959  ListNodeAddPointer(&retval, 0, matrix_info);
2960 
2961  matrix_info = MatrixInfoNew("BLOSUM62", blosum62_values, blosum62_prefs, BLOSUM62_VALUES_MAX);
2962  ListNodeAddPointer(&retval, 0, matrix_info);
2963 
2964  matrix_info = MatrixInfoNew("BLOSUM50", blosum50_values, blosum50_prefs, BLOSUM50_VALUES_MAX);
2965  ListNodeAddPointer(&retval, 0, matrix_info);
2966 
2967  matrix_info = MatrixInfoNew("BLOSUM45", blosum45_values, blosum45_prefs, BLOSUM45_VALUES_MAX);
2968  ListNodeAddPointer(&retval, 0, matrix_info);
2969 
2970  matrix_info = MatrixInfoNew("PAM250", pam250_values, pam250_prefs, PAM250_VALUES_MAX);
2971  ListNodeAddPointer(&retval, 0, matrix_info);
2972 
2973 #ifdef BLOSUM62_20_ENABLE
2974  matrix_info = MatrixInfoNew("BLOSUM62_20", blosum62_20_values, blosum62_20_prefs, BLOSUM62_20_VALUES_MAX);
2975  ListNodeAddPointer(&retval, 0, matrix_info);
2976 #endif
2977 
2978  matrix_info = MatrixInfoNew("BLOSUM90", blosum90_values, blosum90_prefs, BLOSUM90_VALUES_MAX);
2979  ListNodeAddPointer(&retval, 0, matrix_info);
2980 
2981  matrix_info = MatrixInfoNew("PAM30", pam30_values, pam30_prefs, PAM30_VALUES_MAX);
2982  ListNodeAddPointer(&retval, 0, matrix_info);
2983 
2984  matrix_info = MatrixInfoNew("PAM70", pam70_values, pam70_prefs, PAM70_VALUES_MAX);
2985  ListNodeAddPointer(&retval, 0, matrix_info);
2986 
2987  if (!standard_only) {
2988  matrix_info = MatrixInfoNew("IDENTITY", prot_idenity_values,
2991  ListNodeAddPointer(&retval, 0, matrix_info);
2992  }
2993 
2994  return retval;
2995 }
2996 
2997 /** Obtains arrays of the allowed opening and extension penalties for gapped BLAST for
2998  * the given matrix. Also obtains arrays of Lambda, K, and H. Any of these fields that
2999  * are not required should be set to NULL. The Int2 return value is the length of the arrays.
3000  * @param matrix name of the matrix [in]
3001  * @param open gap existence parameter [in|out]
3002  * @param extension cost to extend a gap by one letter [in|out]
3003  * @param lambda Karlin-Altschul parameter [in|out]
3004  * @param K Karlin-Altschul parameter [in|out]
3005  * @param H Karlin-Altschul parameter [in|out]
3006  * @param alpha Karlin-Altschul parameter [in|out]
3007  * @param beta Karlin-Altschul parameter [in|out]
3008  * @param pref_flags describes preferred values [in|out]
3009  * @return maximum number of values (length of arrays).
3010 */
3011 
3012 static Int2
3013 Blast_GetMatrixValues(const char* matrix, Int4** open, Int4** extension, double** lambda, double** K, double** H, double** alpha, double** beta, Int4** pref_flags)
3014 
3015 {
3016  array_of_8 *values = NULL;
3017  Boolean found_matrix=FALSE;
3018  Int4 index, max_number_values=0;
3019  Int4* open_array=NULL,* extension_array=NULL,* pref_flags_array=NULL,* prefs=NULL;
3020  double* lambda_array=NULL,* K_array=NULL,* H_array=NULL,* alpha_array=NULL,* beta_array=NULL;
3021  MatrixInfo* matrix_info;
3022  ListNode* vnp,* head;
3023 
3024  if (matrix == NULL)
3025  return 0;
3026 
3027  vnp = head = BlastLoadMatrixValues(FALSE);
3028 
3029  while (vnp)
3030  {
3031  matrix_info = vnp->ptr;
3032  if (strcasecmp(matrix_info->name, matrix) == 0)
3033  {
3034  values = matrix_info->values;
3035  max_number_values = matrix_info->max_number_values;
3036  prefs = matrix_info->prefs;
3037  found_matrix = TRUE;
3038  break;
3039  }
3040  vnp = vnp->next;
3041  }
3042 
3043  if (found_matrix)
3044  {
3045  if (open)
3046  *open = open_array = (Int4 *) calloc(max_number_values, sizeof(Int4));
3047  if (extension)
3048  *extension = extension_array =
3049  (Int4 *) calloc(max_number_values, sizeof(Int4));
3050  if (lambda)
3051  *lambda = lambda_array =
3052  (double*) calloc(max_number_values, sizeof(double));
3053  if (K)
3054  *K = K_array = (double*) calloc(max_number_values, sizeof(double));
3055  if (H)
3056  *H = H_array = (double*) calloc(max_number_values, sizeof(double));
3057  if (alpha)
3058  *alpha = alpha_array = (double*) calloc(max_number_values, sizeof(double));
3059  if (beta)
3060  *beta = beta_array = (double*) calloc(max_number_values, sizeof(double));
3061  if (pref_flags)
3062  *pref_flags = pref_flags_array =
3063  (Int4 *) calloc(max_number_values, sizeof(Int4));
3064 
3065  for (index=0; index<max_number_values; index++)
3066  {
3067  if (open)
3068  open_array[index] = (Int4) values[index][0];
3069  if (extension)
3070  extension_array[index] = (Int4) values[index][1];
3071  /* decline_align value is skipped */
3072  if (lambda)
3073  lambda_array[index] = values[index][3];
3074  if (K)
3075  K_array[index] = values[index][4];
3076  if (H)
3077  H_array[index] = values[index][5];
3078  if (alpha)
3079  alpha_array[index] = values[index][6];
3080  if (beta)
3081  beta_array[index] = values[index][7];
3082  if (pref_flags)
3083  pref_flags_array[index] = prefs[index];
3084  }
3085  }
3086 
3088 
3089  return max_number_values;
3090 }
3091 
3092 /*Extract the alpha and beta settings for this matrixName, and these
3093  gap open and gap extension costs*/
3094 void BLAST_GetAlphaBeta(const char* matrixName, double *alpha,
3095  double *beta, Boolean gapped, Int4 gap_open,
3096  Int4 gap_extend, const Blast_KarlinBlk* kbp_ungapped)
3097 {
3098  Int4* gapOpen_arr,* gapExtend_arr,* pref_flags;
3099  double* alpha_arr,* beta_arr;
3100  Int2 num_values;
3101  Int4 i; /*loop index*/
3102 
3103  num_values = Blast_GetMatrixValues(matrixName, &gapOpen_arr,
3104  &gapExtend_arr, NULL, NULL, NULL, &alpha_arr, &beta_arr,
3105  &pref_flags);
3106 
3107  if (gapped) {
3108  if ((0 == gap_open) && (0 == gap_extend)) {
3109  for(i = 1; i < num_values; i++) {
3110  if(pref_flags[i]==BLAST_MATRIX_BEST) {
3111  (*alpha) = alpha_arr[i];
3112  (*beta) = beta_arr[i];
3113  break;
3114  }
3115  }
3116  }
3117  else {
3118  for(i = 1; i < num_values; i++) {
3119  if ((gapOpen_arr[i] == gap_open) &&
3120  (gapExtend_arr[i] == gap_extend)) {
3121  (*alpha) = alpha_arr[i];
3122  (*beta) = beta_arr[i];
3123  break;
3124  }
3125  }
3126  }
3127  }
3128  else if (num_values > 0) {
3129  (*alpha) = alpha_arr[0];
3130  (*beta) = beta_arr[0];
3131  } else {
3132  *alpha = kbp_ungapped->Lambda / kbp_ungapped->H;
3133  *beta = 0;
3134  }
3135 
3136  sfree(gapOpen_arr);
3137  sfree(gapExtend_arr);
3138  sfree(pref_flags);
3139  sfree(alpha_arr);
3140  sfree(beta_arr);
3141 }
3142 
3143 /** Splits an ArrayOf8 into two arrays of supported gap costs. One is for non-affine
3144  * (megablast linear values) and the other is for standard (typically affine) values.
3145  * @param input the array to be split [in]
3146  * @param normal the standard (typically affine) values [out]
3147  * @param non_affine the megablast (linear) values [out]
3148  * @param split Boolean specifying whether the non-affine values are populated [out]
3149  * @return 0 on success, -1 on error
3150 */
3151 static Int2
3152 s_SplitArrayOf8(const array_of_8* input, const array_of_8** normal, const array_of_8** non_affine, Boolean *split)
3153 {
3154 
3155  if (input == NULL || normal == NULL || non_affine == NULL)
3156  return -1;
3157 
3158  *normal = NULL;
3159  *non_affine = NULL;
3160 
3161  if (input[0][0] == 0 && input[0][1] == 0)
3162  {
3163  *normal = input+1;
3164  *non_affine = input;
3165  *split = TRUE;
3166  }
3167  else
3168  {
3169  *normal = input;
3170  *split = FALSE;
3171  }
3172  return 0;
3173 
3174 }
3175 
3176 /** Adjust Lambda and H if reward and penalty have a non-1 gcd.
3177  * the two arrays (normal and linear) should be filled in with values already.
3178  * @param normal the values for normal (e.g, "affine") gap costs [in|out]
3179  * @param linear specialized values used for megablast [in|out]
3180  * @param size Number of supported combinations for this match/mismatch pair [out]
3181  * @param gap_existence_max start of infinite regime for gap existence [in|out]
3182  * @param gap_extend_max start of infinite regime for gap extension [in|out]
3183  * @param divisor divisor for gap costs [out]
3184 */
3185 static Int2
3186 s_AdjustGapParametersByGcd(array_of_8* normal, array_of_8* linear, int size, Int4* gap_existence_max, Int4* gap_extend_max, int divisor)
3187 {
3188  if (divisor == 1)
3189  return 0;
3190 
3191  if (size <=0)
3192  return 1;
3193 
3194  (*gap_existence_max) *= divisor;
3195  (*gap_extend_max) *= divisor;
3196 
3197  if (normal)
3198  {
3199  int i;
3200 
3201  for (i=0; i<size; i++)
3202  { /* divide lambda and alpha by divisor. */
3203  /* multiply gap existence and extension by divisor. */
3204  normal[i][0] *= divisor;
3205  normal[i][1] *= divisor;
3206  normal[i][2] /= divisor;
3207  normal[i][5] /= divisor;
3208  }
3209  }
3210  if (linear)
3211  { /* divide lambda and alpha by divisor. */
3212  linear[0][0] *= divisor;
3213  linear[0][1] *= divisor;
3214  linear[0][2] /= divisor;
3215  linear[0][5] /= divisor;
3216  }
3217 
3218  return 0;
3219 }
3220 
3221 /** Returns the array of values corresponding to the given match/mismatch
3222  * scores, the number of supported gap cost combinations and thresholds for
3223  * the gap costs, beyond which the ungapped statistics can be applied.
3224  * @param reward Match reward score [in]
3225  * @param penalty Mismatch penalty score [in]
3226  * @param array_size Number of supported combinations for this match/mismatch
3227  * pair [out]
3228  * @param normal the values for normal (e.g, "affine") gap costs [out]
3229  * @param non_affine specialized values used for megablast [out]
3230  * @param gap_open_max Gap opening cost threshold for infinite gap costs [out]
3231  * @param gap_extend_max Gap extension cost threshold for infinite gap costs [out]
3232  * @param round_down if set to TRUE only even scores should be used for calculation
3233  * of expect value or bit scores [out]
3234  * @param error_return Pointer to error message [out]
3235  * @return zero on success, other values if error
3236  */
3237 static Int2
3238 s_GetNuclValuesArray(Int4 reward, Int4 penalty, Int4* array_size,
3239  array_of_8** normal, array_of_8** non_affine,
3240  Int4* gap_open_max, Int4* gap_extend_max, Boolean* round_down,
3241  Blast_Message** error_return)
3242 {
3243  Int2 status = 0;
3244  const array_of_8 * kValues = NULL;
3245  const array_of_8 * kValues_non_affine = NULL;
3246  Boolean split = FALSE;
3247  int divisor = BLAST_Gcd(reward, penalty);
3248 
3249  *round_down = FALSE;
3250 
3251  *array_size = 0;
3252  *normal = NULL;
3253  *non_affine = NULL;
3254 
3255  if (divisor != 1)
3256  {
3257  reward /= divisor;
3258  penalty /= divisor;
3259  }
3260 
3261  if (reward == 1 && penalty == -5) {
3262  if ((status=s_SplitArrayOf8(blastn_values_1_5, &kValues, &kValues_non_affine, &split)))
3263  return status;
3264 
3265  *array_size = sizeof(blastn_values_1_5)/sizeof(array_of_8);
3266  *gap_open_max = 3;
3267  *gap_extend_max = 3;
3268  } else if (reward == 1 && penalty == -4) {
3269  if ((status=s_SplitArrayOf8(blastn_values_1_4, &kValues, &kValues_non_affine, &split)))
3270  return status;
3271 
3272  *array_size = sizeof(blastn_values_1_4)/sizeof(array_of_8);
3273  *gap_open_max = 2;
3274  *gap_extend_max = 2;
3275  } else if (reward == 2 && penalty == -7) {
3276  if ((status=s_SplitArrayOf8(blastn_values_2_7, &kValues, &kValues_non_affine, &split)))
3277  return status;
3278 
3279  *round_down = TRUE;
3280  *array_size = sizeof(blastn_values_2_7)/sizeof(array_of_8);
3281  *gap_open_max = 4;
3282  *gap_extend_max = 4;
3283  } else if (reward == 1 && penalty == -3) {
3284  if ((status=s_SplitArrayOf8(blastn_values_1_3, &kValues, &kValues_non_affine, &split)))
3285  return status;
3286 
3287  *array_size = sizeof(blastn_values_1_3)/sizeof(array_of_8);
3288  *gap_open_max = 2;
3289  *gap_extend_max = 2;
3290  } else if (reward == 2 && penalty == -5) {
3291  if ((status=s_SplitArrayOf8(blastn_values_2_5, &kValues, &kValues_non_affine, &split)))
3292  return status;
3293 
3294  *round_down = TRUE;
3295  *array_size = sizeof(blastn_values_2_5)/sizeof(array_of_8);
3296  *gap_open_max = 4;
3297  *gap_extend_max = 4;
3298  } else if (reward == 1 && penalty == -2) {
3299  if ((status=s_SplitArrayOf8(blastn_values_1_2, &kValues, &kValues_non_affine, &split)))
3300  return status;
3301 
3302  *array_size = sizeof(blastn_values_1_2)/sizeof(array_of_8);
3303  *gap_open_max = 2;
3304  *gap_extend_max = 2;
3305  } else if (reward == 2 && penalty == -3) {
3306  if ((status=s_SplitArrayOf8(blastn_values_2_3, &kValues, &kValues_non_affine, &split)))
3307  return status;
3308 
3309  *round_down = TRUE;
3310  *array_size = sizeof(blastn_values_2_3)/sizeof(array_of_8);
3311  *gap_open_max = 6;
3312  *gap_extend_max = 4;
3313  } else if (reward == 3 && penalty == -4) {
3314  if ((status=s_SplitArrayOf8(blastn_values_3_4, &kValues, &kValues_non_affine, &split)))
3315  return status;
3316 
3317  *round_down = TRUE;
3318  *array_size = sizeof(blastn_values_3_4)/sizeof(array_of_8);
3319  *gap_open_max = 6;
3320  *gap_extend_max = 3;
3321  } else if (reward == 1 && penalty == -1) {
3322  if ((status=s_SplitArrayOf8(blastn_values_1_1, &kValues, &kValues_non_affine, &split)))
3323  return status;
3324 
3325  *array_size = sizeof(blastn_values_1_1)/sizeof(array_of_8);
3326  *gap_open_max = 4;
3327  *gap_extend_max = 2;
3328  } else if (reward == 3 && penalty == -2) {
3329  if ((status=s_SplitArrayOf8(blastn_values_3_2, &kValues, &kValues_non_affine, &split)))
3330  return status;
3331 
3332  *array_size = sizeof(blastn_values_3_2)/sizeof(array_of_8);
3333  *gap_open_max = 5;
3334  *gap_extend_max = 5;
3335  } else if (reward == 4 && penalty == -5) {
3336  if ((status=s_SplitArrayOf8(blastn_values_4_5, &kValues, &kValues_non_affine, &split)))
3337  return status;
3338 
3339  *array_size = sizeof(blastn_values_4_5)/sizeof(array_of_8);
3340  *gap_open_max = 12;
3341  *gap_extend_max = 8;
3342  } else if (reward == 5 && penalty == -4) {
3343  if ((status=s_SplitArrayOf8(blastn_values_5_4, &kValues, &kValues_non_affine, &split)))
3344  return status;
3345 
3346  *array_size = sizeof(blastn_values_5_4)/sizeof(array_of_8);
3347  *gap_open_max = 25;
3348  *gap_extend_max = 10;
3349  } else { /* Unsupported reward-penalty */
3350  status = -1;
3351  if (error_return) {
3352  char buffer[256];
3353  sprintf(buffer, "Substitution scores %d and %d are not supported",
3354  reward, penalty);
3356  }
3357  }
3358  if (split)
3359  (*array_size)--;
3360 
3361  if (status == 0)
3362  {
3363  if (*array_size > 0)
3364  *normal = BlastMemDup(kValues, (*array_size)*sizeof(array_of_8));
3365  if (kValues_non_affine)
3366  *non_affine = BlastMemDup(kValues_non_affine, sizeof(array_of_8));
3367 
3368  status = s_AdjustGapParametersByGcd(*normal, *non_affine, *array_size, gap_open_max, gap_extend_max, divisor);
3369  }
3370 
3371  return status;
3372 }
3373 
3375  Int4* gap_existence,
3376  Int4* gap_extension)
3377 {
3378  Int4* gapOpen_arr,* gapExtend_arr,* pref_flags;
3379  Int4 i; /*loop index*/
3380  Int2 num_values = Blast_GetMatrixValues(matrixName, &gapOpen_arr,
3381  &gapExtend_arr, NULL, NULL, NULL, NULL, NULL, &pref_flags);
3382 
3383  if (num_values <= 0)
3384  return -1;
3385 
3386  for(i = 1; i < num_values; i++) {
3387  if(pref_flags[i]==BLAST_MATRIX_BEST) {
3388  (*gap_existence) = gapOpen_arr[i];
3389  (*gap_extension) = gapExtend_arr[i];
3390  break;
3391  }
3392  }
3393 
3394  sfree(gapOpen_arr);
3395  sfree(gapExtend_arr);
3396  sfree(pref_flags);
3397 
3398  return 0;
3399 }
3400 
3401 
3403  Int4 penalty,
3404  Int4* gap_existence,
3405  Int4* gap_extension)
3406 {
3407  int array_size = 0; /* dummy parameter. */
3408  array_of_8* normal=NULL; /* dummy parameter */
3409  array_of_8* non_affine=NULL; /* dummy parameter */
3410  Boolean round_down = FALSE;
3411  int gap_existence_max=0;
3412  int gap_extension_max=0;
3413  Int2 status = s_GetNuclValuesArray(reward, penalty, &array_size, &normal, &non_affine,
3414  &gap_existence_max, &gap_extension_max, &round_down, NULL);
3415 
3416  if (status)
3417  {
3418  sfree(normal);
3419  sfree(non_affine);
3420  return status;
3421  }
3422 
3423  if (*gap_existence == 0 && *gap_extension == 0 && non_affine)
3424  status = 0; /* these values are supported. */
3425  else
3426  {
3427  int index=0;
3428  Boolean found=FALSE;
3429  while (index < array_size)
3430  {
3431  if (*gap_existence == normal[index][0] && *gap_extension == normal[index][1])
3432  {
3433  found = TRUE;
3434  break; /* these values are supported. */
3435  }
3436  index++;
3437  }
3438 
3439  if (!found)
3440  { /* If values are above max, then use. Otherwise set max values. */
3441  if (*gap_existence < gap_existence_max || *gap_extension < gap_extension_max)
3442  {
3443  *gap_existence = gap_existence_max;
3444  *gap_extension = gap_extension_max;
3445  }
3446  }
3447  status = 0;
3448  }
3449  sfree(normal);
3450  sfree(non_affine);
3451  return status;
3452 }
3453 
3455 {
3456  int array_size = 0; /* dummy parameter. */
3457  array_of_8* normal = NULL; /* dummy parameter */
3458  array_of_8* non_affine = NULL; /* dummy parameter */
3459  Boolean round_down = FALSE;
3460  int gap_existence_max = 0;
3461  int gap_extension_max = 0;
3462  Int2 status = s_GetNuclValuesArray(reward, penalty, &array_size, &normal,
3463  &non_affine, &gap_existence_max,
3464  &gap_extension_max, &round_down, NULL);
3465 
3466  sfree(normal);
3467  sfree(non_affine);
3468 
3469  return status == 0;
3470 }
3471 
3472 /** Fills in error_return with strings describing the allowed values.
3473  * @param matrix_name name of the matrix [in]
3474  * @param error_return object to be filled in [in|out]
3475  * @return zero on success.
3476  */
3477 static Int2
3478 BlastKarlinReportAllowedValues(const char *matrix_name,
3479  Blast_Message** error_return)
3480 {
3481  array_of_8 *values = NULL;
3482  Boolean found_matrix=FALSE;
3483  Int4 max_number_values=0;
3484  ListNode* vnp,* head;
3485  MatrixInfo* matrix_info;
3486 
3487  vnp = head = BlastLoadMatrixValues(FALSE);
3488  while (vnp)
3489  {
3490  matrix_info = vnp->ptr;
3491  if (strcasecmp(matrix_info->name, matrix_name) == 0)
3492  {
3493  values = matrix_info->values;
3494  max_number_values = matrix_info->max_number_values;
3495  found_matrix = TRUE;
3496  break;
3497  }
3498  vnp = vnp->next;
3499  }
3500 
3501  if (found_matrix)
3502  {
3503  Int4 index;
3504  char buffer[256];
3505  for (index=0; index<max_number_values; index++)
3506  {
3507  if (BLAST_Nint(values[index][2]) == INT2_MAX)
3508  sprintf(buffer, "Gap existence and extension values of %ld and %ld are supported", (long) BLAST_Nint(values[index][0]), (long) BLAST_Nint(values[index][1]));
3509  else
3510  sprintf(buffer, "Gap existence, extension and decline-to-align values of %ld, %ld and %ld are supported", (long) BLAST_Nint(values[index][0]), (long) BLAST_Nint(values[index][1]), (long) BLAST_Nint(values[index][2]));
3512  }
3513  }
3514 
3516 
3517  return 0;
3518 }
3519 
3520 /*
3521  Supplies lambda, H, and K values, as calcualted by Stephen Altschul
3522  in Methods in Enzy. (vol 266, page 474).
3523 
3524  if kbp is NULL, then a validation is perfomed.
3525 */
3526 Int2
3527 Blast_KarlinBlkGappedCalc(Blast_KarlinBlk* kbp, Int4 gap_open, Int4 gap_extend, const char* matrix_name, Blast_Message** error_return)
3528 
3529 {
3530  char buffer[256];
3531  Int2 status =
3533  gap_extend, matrix_name, FALSE);
3534 
3535  if (status && error_return)
3536  {
3537  if (status == 1)
3538  {
3539  MatrixInfo* matrix_info;
3540  ListNode* vnp,* head;
3541 
3542  vnp = head = BlastLoadMatrixValues(FALSE);
3543 
3544  sprintf(buffer, "%s is not a supported matrix", matrix_name);
3546 
3547  while (vnp)
3548  {
3549  matrix_info = vnp->ptr;
3550  sprintf(buffer, "%s is a supported matrix", matrix_info->name);
3552  vnp = vnp->next;
3553  }
3554 
3556  }
3557  else if (status == 2)
3558  {
3559  sprintf(buffer, "Gap existence and extension values of %ld and %ld not supported for %s", (long) gap_open, (long) gap_extend, matrix_name);
3561  BlastKarlinReportAllowedValues(matrix_name, error_return);
3562  }
3563  }
3564 
3565  return status;
3566 }
3567 
3568 /*
3569  Attempts to fill KarlinBlk for given gap opening, extensions etc.
3570  Will return non-zero status if that fails.
3571 
3572  return values: -1 if matrix_name is NULL;
3573  1 if matrix not found
3574  2 if matrix found, but open, extend etc. values not supported.
3575 */
3576 Int2
3578  Int4 gap_extend, const char* matrix_name,
3579  Boolean standard_only)
3580 {
3581  Boolean found_matrix=FALSE;
3582  array_of_8 *values;
3583  Int2 status=0;
3584  Int4 max_number_values=0;
3585  MatrixInfo* matrix_info;
3586  ListNode* vnp,* head;
3587 
3588  if (matrix_name == NULL)
3589  return -1;
3590 
3591  values = NULL;
3592 
3593  vnp = head = BlastLoadMatrixValues(standard_only);
3594  while (vnp)
3595  {
3596  matrix_info = vnp->ptr;
3597  if (strcasecmp(matrix_info->name, matrix_name) == 0)
3598  {
3599  values = matrix_info->values;
3600  max_number_values = matrix_info->max_number_values;
3601  found_matrix = TRUE;
3602  break;
3603  }
3604  vnp = vnp->next;
3605  }
3606 
3607 
3608  if (found_matrix)
3609  {
3610  Boolean found_values=FALSE;
3611  Int4 index;
3612  for (index=0; index<max_number_values; index++)
3613  {
3614  if (BLAST_Nint(values[index][0]) == gap_open &&
3615  BLAST_Nint(values[index][1]) == gap_extend)
3616  {
3617  if (kbp)
3618  {
3619  kbp->Lambda = values[index][3];
3620  kbp->K = values[index][4];
3621  kbp->logK = log(kbp->K);
3622  kbp->H = values[index][5];
3623  }
3624  found_values = TRUE;
3625  break;
3626  }
3627  }
3628 
3629  if (found_values == TRUE)
3630  {
3631  status = 0;
3632  }
3633  else
3634  {
3635  status = 2;
3636  }
3637  }
3638  else
3639  {
3640  status = 1;
3641  }
3642 
3644 
3645  return status;
3646 }
3647 
3648 /*
3649  Supplies Gumbel parameters for p-value estimation with FSC
3650 */
3651 Int2
3653  Int4 gap_extend, const char* matrix_name, Blast_Message** error_return)
3654 {
3655  char buffer[256];
3656  Int2 status =
3657  Blast_GumbelBlkLoadFromTables(gbp, gap_open, gap_extend, matrix_name);
3658 
3659  if (status && error_return) {
3660  if (status == 1) {
3661  MatrixInfo* matrix_info;
3662  ListNode* vnp,* head;
3663 
3664  vnp = head = BlastLoadMatrixValues(FALSE);
3665 
3666  sprintf(buffer, "%s is not a supported matrix", matrix_name);
3668 
3669  while (vnp) {
3670  matrix_info = vnp->ptr;
3671  sprintf(buffer, "%s is a supported matrix", matrix_info->name);
3673  vnp = vnp->next;
3674  }
3675 
3677  } else if (status == 2) {
3678  sprintf(buffer, "Gap existence and extension values of %ld and %ld not supported for %s", (long) gap_open, (long) gap_extend, matrix_name);
3680  BlastKarlinReportAllowedValues(matrix_name, error_return);
3681  }
3682  }
3683 
3684  return status;
3685 }
3686 
3687 /*
3688  Attempts to fill gumbel parameters.
3689  Will return non-zero status if that fails.
3690 
3691  return values: -1 if matrix_name is NULL;
3692  1 if matrix not found
3693  2 if matrix found, but open, extend etc. values not supported.
3694 */
3695 Int2
3697  Int4 gap_extend, const char* matrix_name)
3698 {
3699  Boolean found_matrix=FALSE;
3700  array_of_8 *values;
3701  Int2 status=0;
3702  Int4 max_number_values=0;
3703  MatrixInfo* matrix_info;
3704  ListNode* vnp,* head;
3705 
3706  if (matrix_name == NULL)
3707  return -1;
3708 
3709  values = NULL;
3710 
3711  vnp = head = BlastLoadMatrixValues(FALSE);
3712  while (vnp) {
3713  matrix_info = vnp->ptr;
3714  if (strcasecmp(matrix_info->name, matrix_name) == 0) {
3715  values = matrix_info->values;
3716  max_number_values = matrix_info->max_number_values;
3717  found_matrix = TRUE;
3718  break;
3719  }
3720  vnp = vnp->next;
3721  }
3722 
3723 
3724  if (found_matrix) {
3725  Boolean found_values=FALSE;
3726  Int4 index;
3727  for (index=0; index<max_number_values; index++) {
3728  if (BLAST_Nint(values[index][0]) == gap_open &&
3729  BLAST_Nint(values[index][1]) == gap_extend) {
3730  if (gbp) {
3731  gbp->Lambda = values[index][3];
3732  gbp->C = values[index][8];
3733  gbp->G = gap_open + gap_extend;
3734  gbp->a = values[index][6];
3735  gbp->Alpha = values[index][9];
3736  gbp->Sigma = values[index][10];
3737  gbp->a_un = values[0][6];
3738  gbp->Alpha_un = values[0][9];
3739  gbp->b = 2.0 * gbp->G * (gbp->a_un - gbp->a);
3740  gbp->Beta = 2.0 * gbp->G * (gbp->Alpha_un - gbp->Alpha);
3741  gbp->Tau = 2.0 * gbp->G * (gbp->Alpha_un - gbp->Sigma);
3742  gbp->filled = TRUE;
3743  }
3744  found_values = TRUE;
3745  break;
3746  }
3747  }
3748 
3749  status = found_values ? 0 : 2;
3750  } else {
3751  status = 1;
3752  }
3753 
3755 
3756  return status;
3757 }
3758 
3759 char*
3760 BLAST_PrintMatrixMessage(const char *matrix_name, Boolean standard_only)
3761 {
3762  char* buffer= (char *) calloc(1024, sizeof(char));
3763  char* ptr;
3764  MatrixInfo* matrix_info;
3765  ListNode* vnp,* head;
3766 
3767  ptr = buffer;
3768  sprintf(ptr, "%s is not a supported matrix, supported matrices are:\n", matrix_name);
3769 
3770  ptr += strlen(ptr);
3771 
3772  vnp = head = BlastLoadMatrixValues(standard_only);
3773 
3774  while (vnp)
3775  {
3776  matrix_info = vnp->ptr;
3777  sprintf(ptr, "%s \n", matrix_info->name);
3778  ptr += strlen(ptr);
3779  vnp = vnp->next;
3780  }
3782 
3783  return buffer;
3784 }
3785 
3786 char*
3787 BLAST_PrintAllowedValues(const char *matrix_name,
3788  Int4 gap_open, Int4 gap_extend)
3789 {
3790  array_of_8 *values = NULL;
3791  Boolean found_matrix=FALSE;
3792  char* buffer,* ptr;
3793  Int4 index, max_number_values=0;
3794  MatrixInfo* matrix_info;
3795  ListNode* vnp,* head;
3796 
3797  ptr = buffer = (char *) calloc(2048, sizeof(char));
3798 
3799  sprintf(ptr, "Gap existence and extension values of %ld and %ld not supported for %s\nsupported values are:\n",
3800  (long) gap_open, (long) gap_extend, matrix_name);
3801 
3802  ptr += strlen(ptr);
3803 
3804  vnp = head = BlastLoadMatrixValues(FALSE);
3805  while (vnp)
3806  {
3807  matrix_info = vnp->ptr;
3808  if (strcasecmp(matrix_info->name, matrix_name) == 0)
3809  {
3810  values = matrix_info->values;
3811  max_number_values = matrix_info->max_number_values;
3812  found_matrix = TRUE;
3813  break;
3814  }
3815  vnp = vnp->next;
3816  }
3817 
3818  if (found_matrix)
3819  {
3820  for (index=0; index<max_number_values; index++)
3821  {
3822  if (BLAST_Nint(values[index][2]) == INT2_MAX)
3823  sprintf(ptr, "%ld, %ld\n", (long) BLAST_Nint(values[index][0]), (long) BLAST_Nint(values[index][1]));
3824  else
3825  sprintf(ptr, "%ld, %ld, %ld\n", (long) BLAST_Nint(values[index][0]), (long) BLAST_Nint(values[index][1]), (long) BLAST_Nint(values[index][2]));
3826  ptr += strlen(ptr);
3827  }
3828  }
3829 
3831 
3832  return buffer;
3833 }
3834 
3835 Int2
3837  Int4 gap_extend, Int4 reward, Int4 penalty,
3838  Blast_KarlinBlk* kbp_ungap,
3839  Boolean* round_down,
3840  Blast_Message** error_return)
3841 {
3842  const int kGapOpenIndex = 0;
3843  const int kGapExtIndex = 1;
3844  const int kLambdaIndex = 2;
3845  const int kKIndex = 3;
3846  const int kHIndex = 4;
3847  int num_combinations = 0;
3848  int gap_open_max, gap_extend_max;
3849  array_of_8* normal=NULL;
3850  array_of_8* linear=NULL;
3851  Int2 status = s_GetNuclValuesArray(reward,
3852  penalty,
3853  &num_combinations,
3854  &normal,
3855  &linear,
3856  &gap_open_max,
3857  &gap_extend_max,
3858  round_down,
3859  error_return);
3860 
3861  if (status)
3862  {
3863  sfree(normal);
3864  sfree(linear);
3865  return status;
3866  }
3867 
3868  ASSERT(kbp && kbp_ungap);
3869 
3870 
3871  /* Try to find the table entry corresponding to input gap costs. */
3872  if (gap_open == 0 && gap_extend == 0 && linear)
3873  {
3874  kbp->Lambda = linear[0][kLambdaIndex];
3875  kbp->K = linear[0][kKIndex];
3876  kbp->logK = log(kbp->K);
3877  kbp->H = linear[0][kHIndex];
3878  }
3879  else
3880  {
3881  int index=0;
3882  for (index = 0; index < num_combinations; ++index) {
3883  if (normal[index][kGapOpenIndex] == gap_open &&
3884  normal[index][kGapExtIndex] == gap_extend) {
3885  kbp->Lambda = normal[index][kLambdaIndex];
3886  kbp->K = normal[index][kKIndex];
3887  kbp->logK = log(kbp->K);
3888  kbp->H = normal[index][kHIndex];
3889  break;
3890  }
3891  }
3892 
3893  /* If gap costs are not found in the table, check if they belong to the
3894  infinite domain, where ungapped values of the parameters can be used. */
3895  if (index == num_combinations) {
3896  /* If gap costs are larger than maximal provided in tables, copy
3897  the values from the ungapped Karlin block. */
3898  if (gap_open >= gap_open_max && gap_extend >= gap_extend_max) {
3899  Blast_KarlinBlkCopy(kbp, kbp_ungap);
3900  } else if (error_return) {
3901  char buffer[8192];
3902  int i=0;
3903  size_t len=0;
3904  /* Unsupported gap costs combination. */
3905  sprintf(buffer, "Gap existence and extension values %ld and %ld "
3906  "are not supported for substitution scores %ld and %ld\n",
3907  (long) gap_open, (long) gap_extend, (long) reward, (long) penalty);
3908  for (i = 0; i < num_combinations; ++i)
3909  {
3910  len = strlen(buffer);
3911  sprintf(buffer+len, "%ld and %ld are supported existence and extension values\n",
3912  (long) normal[i][kGapOpenIndex], (long) normal[i][kGapExtIndex]);
3913  }
3914  len = strlen(buffer);
3915  sprintf(buffer+len, "%ld and %ld are supported existence and extension values\n",
3916  (long) gap_open_max, (long) gap_extend_max);
3917  len = strlen(buffer);
3918  sprintf(buffer+len, "Any values more stringent than %ld and %ld are supported\n",
3919  (long) gap_open_max, (long) gap_extend_max);
3921  sfree(normal);
3922  sfree(linear);
3923  return 1;
3924  }
3925  }
3926  }
3927 
3928  sfree(normal);
3929  sfree(linear);
3930  return 0;
3931 }
3932 
3933 /** Returns the beta statistical parameter value, given the nucleotide
3934  * substitution scores.
3935  * @param reward Match reward score [in]
3936  * @param penalty Mismatch penalty score [in]
3937  * @return The value of the beta parameter.
3938  */
3939 static double s_GetUngappedBeta(Int4 reward, Int4 penalty)
3940 {
3941  double beta = 0;
3942  if ((reward == 1 && penalty == -1) ||
3943  (reward == 2 && penalty == -3))
3944  beta = -2;
3945 
3946  return beta;
3947 }
3948 
3949 Int2 Blast_GetNuclAlphaBeta(Int4 reward, Int4 penalty, Int4 gap_open,
3950  Int4 gap_extend, Blast_KarlinBlk* kbp,
3951  Boolean gapped_calculation,
3952  double *alpha, double *beta)
3953 {
3954  const int kGapOpenIndex = 0;
3955  const int kGapExtIndex = 1;
3956  const int kAlphaIndex = 5;
3957  const int kBetaIndex = 6;
3958  Int4 num_combinations = 0;
3959  Int4 gap_open_max = 0, gap_extend_max = 0;
3960  Int4 index = 0;
3961  array_of_8* normal=NULL;
3962  array_of_8* linear=NULL;
3963  Boolean round_down = FALSE;
3964  Boolean found = FALSE;
3965  Int2 status = s_GetNuclValuesArray(reward,
3966  penalty,
3967  &num_combinations,
3968  &normal,
3969  &linear,
3970  &gap_open_max,
3971  &gap_extend_max,
3972  &round_down,
3973  NULL);
3974 
3975  if (status)
3976  return status;
3977 
3978  ASSERT(alpha && beta && kbp);
3979 
3980  /* For ungapped search return ungapped values of alpha and beta. */
3981  if (gapped_calculation && normal) {
3982  if (gap_open == 0 && gap_extend == 0 && linear)
3983  {
3984  *alpha = linear[0][kAlphaIndex];
3985  *beta = linear[0][kBetaIndex];
3986  found = TRUE;
3987  }
3988  else
3989  {
3990 
3991  for (index = 0; index < num_combinations; ++index) {
3992  if (normal[index][kGapOpenIndex] == gap_open &&
3993  normal[index][kGapExtIndex] == gap_extend) {
3994  *alpha = normal[index][kAlphaIndex];
3995  *beta = normal[index][kBetaIndex];
3996  found = TRUE;
3997  break;
3998  }
3999  }
4000  }
4001 
4002  }
4003 
4004  /* If input values not found in tables, or if this is an ungapped search,
4005  return the ungapped values of alpha and beta. */
4006  if (!found)
4007  {
4008  *alpha = kbp->Lambda/kbp->H;
4009  *beta = s_GetUngappedBeta(reward, penalty);
4010  }
4011 
4012  sfree(linear);
4013  sfree(normal);
4014  return 0;
4015 }
4016 
4017 /** Calculates score from expect value and search space.
4018  * @param E expect value [in]
4019  * @param kbp contains Karlin-Altschul parameters [in]
4020  * @param searchsp query times database size [in]
4021  * @return score
4022  */
4023 static Int4
4024 BlastKarlinEtoS_simple(double E, /* Expect value */
4025  const Blast_KarlinBlk* kbp,
4026  Int8 searchsp) /* size of search space */
4027 {
4028 
4029  double Lambda, K, H; /* parameters for Karlin statistics */
4030  Int4 S;
4031 /* Smallest float that might not cause a floating point exception in
4032  S = (Int4) (ceil( log((double)(K * searchsp / E)) / Lambda )); below. */
4033  const double kSmallFloat = 1.0e-297;
4034 
4035  Lambda = kbp->Lambda;
4036  K = kbp->K;
4037  H = kbp->H;
4038  if (Lambda < 0. || K < 0. || H < 0.0)
4039  {
4040  return BLAST_SCORE_MIN;
4041  }
4042 
4043  E = MAX(E, kSmallFloat);
4044 
4045  S = (Int4) (ceil( log((double)(K * searchsp / E)) / Lambda ));
4046  return S;
4047 }
4048 
4049 /* Compute a divisor used to weight the evalue of a collection of
4050  * "nsegs" distinct alignments. These divisors are used to compensate
4051  * for the effect of choosing the best among multiple collections of
4052  * alignments. See
4053  *
4054  * Stephen F. Altschul. Evaluating the statitical significance of
4055  * multiple distinct local alignments. In Suhai, editior, Theoretical
4056  * and Computational Methods in Genome Research, pages 1-14. Plenum
4057  * Press, New York, 1997.
4058  *
4059  * The "decayrate" parameter of this routine is a value in the
4060  * interval (0,1). Typical values of decayrate are .1 and .5. */
4061 
4062 double
4063 BLAST_GapDecayDivisor(double decayrate, unsigned nsegs )
4064 {
4065  return (1. - decayrate) * BLAST_Powi(decayrate, nsegs - 1);
4066 }
4067 
4068 
4069 /*
4070  BlastCutoffs
4071  Calculate the cutoff score, S, and the highest expected score.
4072 */
4073 Int2
4074 BLAST_Cutoffs(Int4 *S, /* cutoff score */
4075  double* E, /* expected no. of HSPs scoring at or above S */
4076  Blast_KarlinBlk* kbp,
4077  Int8 searchsp, /* size of search space. */
4078  Boolean dodecay, /* TRUE ==> use gapdecay feature */
4079  double gap_decay_rate)
4080 {
4081  Int4 s = *S, es;
4082  double e = *E, esave;
4083  Boolean s_changed = FALSE;
4084 
4085  if (kbp->Lambda == -1. || kbp->K == -1. || kbp->H == -1.)
4086  return 1;
4087 
4088  /*
4089  Calculate a cutoff score, S, from the Expected
4090  (or desired) number of reported HSPs, E.
4091  */
4092  es = 1;
4093  esave = e;
4094  if (e > 0.)
4095  {
4096  if (dodecay) {
4097  /* Invert the adjustment to the e-value that will be applied
4098  * to compensate for the effect of choosing the best among
4099  * multiple alignments */
4100  if( gap_decay_rate > 0 && gap_decay_rate < 1 ) {
4101  e *= BLAST_GapDecayDivisor(gap_decay_rate, 1);
4102  }
4103  }
4104  es = BlastKarlinEtoS_simple(e, kbp, searchsp);
4105  }
4106  /*
4107  Pick the larger cutoff score between the user's choice
4108  and that calculated from the value of E.
4109  */
4110  if (es > s) {
4111  s_changed = TRUE;
4112  *S = s = es;
4113  }
4114 
4115  /*
4116  Re-calculate E from the cutoff score, if E going in was too high
4117  */
4118  if (esave <= 0. || !s_changed)
4119  {
4120  e = BLAST_KarlinStoE_simple(s, kbp, searchsp);
4121  if (dodecay) {
4122  /* Weight the e-value to compensate for the effect of
4123  * choosing the best of more than one collection of
4124  * distinct alignments */
4125  if( gap_decay_rate > 0 && gap_decay_rate < 1 ) {
4126  e /= BLAST_GapDecayDivisor(gap_decay_rate, 1);
4127  }
4128  }
4129  *E = e;
4130  }
4131 
4132  return 0;
4133 }
4134 
4135 /*
4136  BlastKarlinStoE() -- given a score, return the associated Expect value
4137 
4138  Error return value is -1.
4139 */
4140 double
4142  Blast_KarlinBlk* kbp,
4143  Int8 searchsp) /* size of search space. */
4144 {
4145  double Lambda, K, H; /* parameters for Karlin statistics */
4146 
4147  Lambda = kbp->Lambda;
4148  K = kbp->K;
4149  H = kbp->H;
4150  if (Lambda < 0. || K < 0. || H < 0.) {
4151  return -1.;
4152  }
4153 
4154  return (double) searchsp * exp((double)(-Lambda * S) + kbp->logK);
4155 }
4156 
4157 /* Convert a P-value to an E-value. */
4158 double
4160 {
4161  if (p < 0.0 || p > 1.0) {
4162  return INT4_MIN;
4163  }
4164  if (p == 1.0)
4165  return INT4_MAX;
4166 
4167  return -BLAST_Log1p(-p);
4168 }
4169 
4170 
4171 /* Convert an E-value to a P-value. */
4172 double
4174 {
4175  return -BLAST_Expm1(-x);
4176 }
4177 
4178 
4179 /** Internal data structure used by Romberg integration callbacks. */
4180 typedef struct SRombergCbackArgs {
4181  int num_hsps; /**< number of HSPs */
4182  int num_hsps_minus_2; /**< number of HSPs minus 2 */
4183  double adj1; /**< Nat log of r**(r-2)/((r-1)! (r-2)!) */
4184  double adj2; /**< adj1 - score */
4185  double sdvir; /**< score divided by number of HSPs. */
4186  double epsilon; /**< convergence criteria for Romberg integration. */
4188 
4189 
4190 /** Callback for the Romberg integration function.
4191  * This is the second of the double integrals that
4192  * BlastSumPCalc calculates This is eqn. 4 of Karlin
4193  * and Altschul, PNAS USA, 90, 5873-5877 (1993).
4194  *
4195  * @param x variable to integrate over [in]
4196  * @param vp pointer to parameters [in]
4197  * @return value of integrand
4198  */
4199 static double
4200 s_OuterIntegralCback(double x, void* vp)
4201 {
4202  SRombergCbackArgs* callback_args = (SRombergCbackArgs*) vp;
4203  double y = exp(x - callback_args->sdvir);
4204 
4205  if (y == HUGE_VAL)
4206  return 0.;
4207 
4208  if (callback_args->num_hsps_minus_2 == 0)
4209  return exp(callback_args->adj2 - y);
4210  if (x == 0.)
4211  return 0.;
4212  return exp((callback_args->num_hsps_minus_2)*log(x) + callback_args->adj2 - y);
4213 }
4214 
4215 /** Callback for the Romberg integration function.
4216  * This is the first of the double integrals that
4217  * BlastSumPCalc calculates. This is the integral
4218  * described in the paragraph after eqn. 4 of Karlin
4219  * and Altschul, PNAS USA, 90, 5873-5877 (1993).
4220  *
4221  * @param s variable to integrate over [in]
4222  * @param vp pointer to parameters [in]
4223  * @return value of integrand
4224  */
4225 static double
4226 s_InnerIntegralCback(double s, void* vp)
4227 {
4228  double mx;
4229  SRombergCbackArgs* callback_args = (SRombergCbackArgs*) vp;
4230 
4231  callback_args->adj2 = callback_args->adj1 - s;
4232  callback_args->sdvir = s / callback_args->num_hsps;
4233  mx = (s > 0. ? callback_args->sdvir + 3. : 3.);
4234  return BLAST_RombergIntegrate(s_OuterIntegralCback, vp, 0., mx, callback_args->epsilon, 0, 1);
4235 }
4236 
4237 /**
4238  *
4239  * Evaluate the following double integral, where r = number of segments
4240  *
4241  * and s = the adjusted score in nats:
4242  *
4243  * (r-2) oo oo
4244  * Prob(r,s) = r - - (r-2)
4245  * ------------- | exp(-y) | x exp(-exp(x - y/r)) dx dy
4246  * (r-1)! (r-2)! U U
4247  * s 0
4248  * @param r number of segments
4249  * @param s adjusted score in nats
4250  * @return P value
4251  */
4252 static double
4253 s_BlastSumPCalc(int r, double s)
4254 {
4255  int r1, itmin;
4256  double t, d;
4257  double mean, stddev, stddev4;
4258  double logr;
4259  SRombergCbackArgs callback_args;
4260  const double kSumpEpsilon = 0.002;
4261 
4262  if (r == 1) {
4263  if (s > 8.)
4264  return exp(-s);
4265  return -BLAST_Expm1(-exp(-s));
4266  }
4267  if (r < 1)
4268  return 0.;
4269 
4270  if (r < 8) {
4271  if (s <= -2.3*r)
4272  return 1.;
4273  }
4274  else if (r < 15) {
4275  if (s <= -2.5*r)
4276  return 1.;
4277  }
4278  else if (r < 27) {
4279  if (s <= -3.0*r)
4280  return 1.;
4281  }
4282  else if (r < 51) {
4283  if (s <= -3.4*r)
4284  return 1.;
4285  }
4286  else if (r < 101) {
4287  if (s <= -4.0*r)
4288  return 1.;
4289  }
4290 
4291  /* stddev in the limit of infinite r, but quite good for even small r */
4292  stddev = sqrt(r);
4293  stddev4 = 4.*stddev;
4294  r1 = r - 1;
4295 
4296  if (r > 100) {
4297  /* Calculate lower bound on the mean using inequality log(r) <= r */
4298  double est_mean = -r * r1;
4299  if (s <= est_mean - stddev4)
4300  return 1.;
4301  }
4302 
4303  /* mean is rather close to the mode, and the mean is readily calculated */
4304  /* mean in the limit of infinite r, but quite good for even small r */
4305  logr = log(r);
4306  mean = r * (1. - logr) - 0.5;
4307  if (s <= mean - stddev4)
4308  return 1.;
4309 
4310  if (s >= mean) {
4311  t = s + 6.*stddev;
4312  itmin = 1;
4313  }
4314  else {
4315  t = mean + 6.*stddev;
4316  itmin = 2;
4317  }
4318 
4319  memset((void *)&callback_args, 0, sizeof(callback_args));
4320  callback_args.num_hsps = r;
4321  callback_args.num_hsps_minus_2 = r - 2;
4322  callback_args.adj1 = callback_args.num_hsps_minus_2*logr - BLAST_LnGammaInt(r1) - BLAST_LnGammaInt(r);
4323  callback_args.epsilon = kSumpEpsilon;
4324 
4325  do {
4326  d = BLAST_RombergIntegrate(s_InnerIntegralCback, &callback_args, s, t, callback_args.epsilon, 0, itmin);
4327  if (d == HUGE_VAL)
4328  return d;
4329  } while (s < mean && d < 0.4 && itmin++ < 4);
4330 
4331  return (d < 1. ? d : 1.);
4332 }
4333 
4334 /** Estimate the Sum P-value by calculation or interpolation, as appropriate.
4335  * Approx. 2-1/2 digits accuracy minimum throughout the range of r, s.
4336  * @param r number of segments [in]
4337  * @param s total score (in nats), adjusted by -r*log(KN) [in]
4338  * @return p-value
4339 */
4340 static double
4341 s_BlastSumP(Int4 r, double s)
4342 {
4343  static const double kTab2[] = { /* table for r == 2 */
4344  0.01669, 0.0249, 0.03683, 0.05390, 0.07794, 0.1111, 0.1559, 0.2146,
4345  0.2890, 0.3794, 0.4836, 0.5965, 0.7092, 0.8114, 0.8931, 0.9490,
4346  0.9806, 0.9944, 0.9989
4347  };
4348  static const double kTab3[] = { /* table for r == 3 */
4349  0.9806, 0.9944, 0.9989, 0.0001682,0.0002542,0.0003829,0.0005745,0.0008587,
4350  0.001278, 0.001893, 0.002789, 0.004088, 0.005958, 0.008627, 0.01240, 0.01770,
4351  0.02505, 0.03514, 0.04880, 0.06704, 0.09103, 0.1220, 0.1612, 0.2097,
4352  0.2682, 0.3368, 0.4145, 0.4994, 0.5881, 0.6765, 0.7596, 0.8326,
4353  0.8922, 0.9367, 0.9667, 0.9846, 0.9939, 0.9980
4354  };
4355  static const double kTab4[] = { /* table for r == 4 */
4356  2.658e-07,4.064e-07,6.203e-07,9.450e-07,1.437e-06,2.181e-06,3.302e-06,4.990e-06,
4357  7.524e-06,1.132e-05,1.698e-05,2.541e-05,3.791e-05,5.641e-05,8.368e-05,0.0001237,
4358  0.0001823,0.0002677,0.0003915,0.0005704,0.0008275,0.001195, 0.001718, 0.002457,
4359  0.003494, 0.004942, 0.006948, 0.009702, 0.01346, 0.01853, 0.02532, 0.03431,
4360  0.04607, 0.06128, 0.08068, 0.1051, 0.1352, 0.1719, 0.2157, 0.2669,
4361  0.3254, 0.3906, 0.4612, 0.5355, 0.6110, 0.6849, 0.7544, 0.8168,
4362  0.8699, 0.9127, 0.9451, 0.9679, 0.9827, 0.9915, 0.9963
4363  };
4364  const double* kTable[] = { kTab2, kTab3, kTab4 };
4365  const int kTabsize[] = { DIM(kTab2)-1, DIM(kTab3)-1, DIM(kTab4)-1 }; /* sizes of tab2, tab3, tab4 */
4366 
4367  if (r == 1)
4368  return -BLAST_Expm1(-exp(-s));
4369 
4370  if (r <= 4) {
4371  Int4 r1;
4372  double a;
4373 
4374  if (r < 1)
4375  return 0.;
4376  r1 = r - 1;
4377  if (s >= r*r+r1) {
4378  a = BLAST_LnGammaInt(r+1);
4379  return r * exp(r1*log(s)-s-a-a);
4380  }
4381  if (s > -2*r) {
4382  Int4 i, r2;
4383  /* interpolate */
4384  i = (Int4) (a = s+s+(4*r));
4385  a -= i;
4386  i = kTabsize[r2 = r - 2] - i;
4387  return a*kTable[r2][i-1] + (1.-a)*kTable[r2][i];
4388  }
4389  return 1.;
4390  }
4391 
4392  return s_BlastSumPCalc(r, s);
4393 }
4394 
4395 //TODO: implement Spouge's FSC?
4396 /*
4397  Calculates the e-value for alignments with "small" gaps (typically
4398  under fifty residues/basepairs) following ideas of Stephen Altschul's.
4399 */
4400 
4401 double
4403  Int4 starting_points, /* the number of starting points
4404  * permitted between adjacent
4405  * alignments;
4406  * max_overlap + max_gap + 1 */
4407  Int2 num, /* the number of distinct alignments in this
4408  * collection */
4409  double xsum, /* the sum of the scores of these
4410  * alignments, each weighted normalized
4411  * using an appropriate value of
4412  * Lambda and logK */
4413  Int4 query_length, /* the effective len of the query seq */
4414  Int4 subject_length, /* the effective len of the database seq */
4415  Int8 searchsp_eff, /* the effective size of the search space */
4416  double weight_divisor) /* a divisor used to weight the e-value
4417  * when multiple collections of alignments
4418  * are being considered by the calling
4419  * routine */
4420 {
4421 
4422  double sum_e; /* The e-value of this set of alignments */
4423 
4424  if(num == 1) {
4425  sum_e = searchsp_eff * exp(-xsum);
4426  } else {
4427  double pair_search_space; /* The effective size of the search
4428  * space, for this query-subject pair */
4429  double sum_p; /* The p-value of this set of alignments */
4430 
4431  pair_search_space = (double)subject_length * (double)query_length;
4432 
4433  xsum -=
4434  log(pair_search_space) + 2 * (num-1)*log((double)starting_points);
4435 
4436  xsum -= BLAST_LnFactorial((double) num);
4437 
4438  sum_p = s_BlastSumP(num, xsum);
4439  sum_e = BLAST_KarlinPtoE(sum_p) *
4440  ((double) searchsp_eff / (double) pair_search_space);
4441  }
4442  if( weight_divisor == 0.0 || (sum_e /= weight_divisor) > INT4_MAX ) {
4443  sum_e = INT4_MAX;
4444  }
4445 
4446  return sum_e;
4447 }
4448 
4449 //TODO: implement Spouge's FSC?
4450 /** Calculates the e-value of a collection multiple distinct
4451  * alignments with asymmetric gaps between the alignments. The gaps
4452  * in one (protein) sequence are typically small (like in
4453  * BLAST_SmallGapSumE) gap an the gaps in the other (translated DNA)
4454  * sequence are possibly large (up to 4000 bp.) This routine is used
4455  * for linking HSPs representing exons in the DNA sequence that are
4456  * separated by introns.
4457  * @param query_start_points the number of starting points in
4458  * the query sequence permitted
4459  * between adjacent alignments [in]
4460  * @param subject_start_points the number of starting points in
4461  * the subject sequence permitted
4462  * between adjacent alignments [in]
4463  * @param num The number of distinct alignments in this collection [in]
4464  * @param xsum The sum of the scores of these alignments, each normalized
4465  * using an appropriate value of Lambda and logK [in]
4466  * @param query_length The effective len of the query seq [in]
4467  * @param subject_length The effective len of the database seq [in]
4468  * @param searchsp_eff effective size of the search space [in]
4469  * @param weight_divisor A divisor used to weight the e-value when multiple
4470  * collections of alignments are being considered by
4471  * the calling routine [in]
4472  * @return Resulting e-value of a combined set.
4473  */
4474 double
4475 BLAST_UnevenGapSumE(Int4 query_start_points, Int4 subject_start_points,
4476  Int2 num, double xsum,
4477  Int4 query_length, Int4 subject_length,
4478  Int8 searchsp_eff,
4479  double weight_divisor)
4480 {
4481  double sum_e; /* The e-value of this set of alignments */
4482 
4483  if(num == 1) {
4484  sum_e = searchsp_eff * exp(-xsum);
4485  } else {
4486  double sum_p; /* The p-value of this set of alignments */
4487 
4488  double pair_search_space; /* The effective size of the search
4489  * space, for this query-subject pair */
4490  pair_search_space = (double)subject_length*(double)query_length;
4491 
4492  xsum -= log(pair_search_space) +
4493  (num-1)*(log((double) query_start_points) +
4494  log((double) subject_start_points));
4495  xsum -= BLAST_LnFactorial((double) num);
4496 
4497  sum_p = s_BlastSumP(num, xsum);
4498  sum_e = BLAST_KarlinPtoE(sum_p) *
4499  ((double) searchsp_eff / (double) pair_search_space);
4500  }
4501  if( weight_divisor == 0.0 || (sum_e /= weight_divisor) > INT4_MAX ) {
4502  sum_e = INT4_MAX;
4503  }
4504 
4505  return sum_e;
4506 }
4507 
4508 
4509 //TODO: implement Spouge's FSC?
4510 /*
4511  Calculates the e-value if a collection of distinct alignments with
4512  arbitrarily large gaps between the alignments
4513 */
4514 
4515 double
4517  Int2 num, /* the number of distinct alignments in this
4518  * collection */
4519  double xsum, /* the sum of the scores of these
4520  * alignments each, normalized using an
4521  * appropriate value of Lambda and
4522  * logK */
4523  Int4 query_length, /* the effective len of the query seq */
4524  Int4 subject_length, /* the effective len of the database seq */
4525  Int8 searchsp_eff, /* the effective size of the search space */
4526  double weight_divisor) /* a divisor used to weight the e-value
4527  * when multiple collections of alignments
4528  * are being considered by the calling
4529  * routine */
4530 {
4531  double sum_p; /* The p-value of this set of alignments */
4532  double sum_e; /* The e-value of this set of alignments */
4533 
4534 /* The next two variables are for compatability with Warren's code. */
4535  double lcl_subject_length; /* query_length as a float */
4536  double lcl_query_length; /* subject_length as a float */
4537 
4538  lcl_query_length = (double) query_length;
4539  lcl_subject_length = (double) subject_length;
4540 
4541  if( num == 1 ) {
4542  sum_e = searchsp_eff * exp(-xsum);
4543  } else {
4544  xsum -= num*log(lcl_subject_length*lcl_query_length)
4545  - BLAST_LnFactorial((double) num);
4546 
4547  sum_p = s_BlastSumP(num, xsum);
4548 
4549  sum_e = BLAST_KarlinPtoE(sum_p) *
4550  ((double) searchsp_eff / (lcl_query_length * lcl_subject_length));
4551  }
4552  if( weight_divisor == 0.0 || (sum_e /= weight_divisor) > INT4_MAX ) {
4553  sum_e = INT4_MAX;
4554  }
4555 
4556  return sum_e;
4557 }
4558 
4559 /* Please see comment in blast_stat.h */
4560 void
4561 Blast_FillResidueProbability(const Uint1* sequence, Int4 length, double * resProb)
4562 {
4563  Int4 frequency[BLASTAA_SIZE]; /*frequency of each letter*/
4564  Int4 i; /*index*/
4565  Int4 denominator; /*length not including X's*/
4566 
4567  denominator = length;
4568  for(i = 0; i < BLASTAA_SIZE; i++)
4569  frequency[i] = 0;
4570 
4571  for(i = 0; i < length; i++) {
4572  if (sequence[i] != AMINOACID_TO_NCBISTDAA['X'])
4573  frequency[sequence[i]]++;
4574  else
4575  denominator--;
4576  }
4577 
4578  for(i = 0; i < BLASTAA_SIZE; i++) {
4579  if (frequency[i] == 0)
4580  resProb[i] = 0.0;
4581  else
4582  resProb[i] = ((double) frequency[i]) /((double) denominator);
4583  }
4584 }
4585 
4586 /*------------------- RPS BLAST functions --------------------*/
4587 
4588 /** Gets the ungapped lambda calculated for the matrix in question given
4589  * standard residue composition for both query and subject sequences.
4590  * @param matrixName name of amino acid substitution matrix [in]
4591  * @return lambda ungapped or 0.0 if matrix is not supported
4592  */
4593 static double
4594 RPSfindUngappedLambda(const char *matrixName)
4595 {
4596  double* lambda_array = NULL;
4597  int num_lambdas = Blast_GetMatrixValues(matrixName, NULL, NULL,
4598  &lambda_array, NULL, NULL, NULL,
4599  NULL, NULL);
4600  if (num_lambdas > 0) {
4601  double retval = lambda_array[0];
4602  sfree(lambda_array);
4603  return retval;
4604  } else {
4605  sfree(lambda_array);
4606  return 0.0;
4607  }
4608 }
4609 
4610 /**
4611  * the routine RPSFillScores computes the probability of each score weighted
4612  * by the probability of each query residue and fills those probabilities
4613  * into scoreArray and puts scoreArray as a field in
4614  * that in the structure that is returned
4615  * for indexing convenience the field storing scoreArray points to the
4616  * entry for score 0, so that referring to the -k index corresponds to
4617  * score -k
4618  *FIXME: This can be replaced by _PSIComputeScoreProbabilities??
4619  * @param matrix a position-specific score matrix with matrixLength positions [in]
4620  * @param matrixLength number of positions in the pssm (arg above) [in]
4621  * @param queryProbArray an array containing the probability of occurrence
4622  * of each residue in the query [in]
4623  * @param scoreArray an array of probabilities for each score that is
4624  * to be used as a field in return_sfp
4625  * @param return_sfp a structure to be filled in and returned [in|out]
4626  * @param range the size of scoreArray and is an upper bound on the
4627  * difference between maximum score and minimum score in the matrix [in]
4628  * @param alphabet_size Number of letters in the alphabet of the input
4629  * score matrix [in]
4630 */
4631 
4632 static void
4633 RPSFillScores(Int4 **matrix, Int4 matrixLength,
4634  double *queryProbArray, double *scoreArray,
4635  Blast_ScoreFreq* return_sfp, Int4 range,
4636  Int4 alphabet_size)
4637 {
4638  Int4 minScore, maxScore; /*observed minimum and maximum scores */
4639  Int4 i,j; /* indices */
4640  double recipLength; /* 1/matrix length as a double */
4641 
4642  minScore = maxScore = 0;
4643  for (i = 0; i < matrixLength; i++) {
4644  for (j = 0 ; j < alphabet_size; j++) {
4645  if (j == AMINOACID_TO_NCBISTDAA['X'])
4646  continue;
4647  if ((matrix[i][j] > BLAST_SCORE_MIN) &&
4648  (matrix[i][j] < minScore))
4649  minScore = matrix[i][j];
4650  if (matrix[i][j] > maxScore)
4651  maxScore = matrix[i][j];
4652  }
4653  }
4654 
4655  return_sfp->obs_min = minScore;
4656  return_sfp->obs_max = maxScore;
4657  memset(scoreArray, 0, (maxScore - minScore + 1) * sizeof(double));
4658 
4659  return_sfp->sprob = &(scoreArray[-minScore]); /*center around 0*/
4660  recipLength = 1.0 / (double) matrixLength;
4661  for(i = 0; i < matrixLength; i++) {
4662  for (j = 0; j < alphabet_size; j++) {
4663  if (j == AMINOACID_TO_NCBISTDAA['X'])
4664  continue;
4665  if(matrix[i][j] >= minScore)
4666  return_sfp->sprob[matrix[i][j]] += recipLength *
4667  queryProbArray[j];
4668  }
4669  }
4670 
4671  return_sfp->score_avg = 0;
4672  for(i = minScore; i <= maxScore; i++)
4673  return_sfp->score_avg += i * return_sfp->sprob[i];
4674 }
4675 
4676 Int4 **
4677 RPSRescalePssm(double scalingFactor, Int4 rps_query_length,
4678  const Uint1* rps_query_seq, Int4 db_seq_length,
4679  Int4 **posMatrix, BlastScoreBlk *sbp)
4680 {
4681  double *scoreArray; /*array of score probabilities*/
4682  double *resProb; /*array of probabilities for each residue*/
4683  Blast_ScoreFreq * return_sfp;/*score frequency pointers to compute lambda*/
4684  Int4* * returnMatrix; /*the PSSM to return */
4685  double initialUngappedLambda;
4686  double scaledInitialUngappedLambda;
4687  double correctUngappedLambda;
4688  double finalLambda;
4689  double temp; /*intermediate variable for adjusting matrix*/
4690  Int4 index, inner_index;
4691  Int4 alphabet_size;
4692 
4693  resProb = (double *)malloc(BLASTAA_SIZE * sizeof(double));
4694  scoreArray = (double *)malloc(BLAST_SCORE_RANGE_MAX * sizeof(double));
4695  return_sfp = (Blast_ScoreFreq *)malloc(sizeof(Blast_ScoreFreq));
4696 
4697  Blast_FillResidueProbability(rps_query_seq, rps_query_length, resProb);
4698 
4699  alphabet_size = (Int4)sbp->psi_matrix->pssm->nrows;
4700  RPSFillScores(posMatrix, db_seq_length, resProb, scoreArray,
4701  return_sfp, BLAST_SCORE_RANGE_MAX, alphabet_size);
4702 
4703  initialUngappedLambda = RPSfindUngappedLambda(sbp->name);
4704  ASSERT(initialUngappedLambda > 0.0);
4705  scaledInitialUngappedLambda = initialUngappedLambda / scalingFactor;
4706  correctUngappedLambda = Blast_KarlinLambdaNR(return_sfp,
4707  scaledInitialUngappedLambda);
4708  sfree(resProb);
4709  sfree(scoreArray);
4710  sfree(return_sfp);
4711  if(correctUngappedLambda == -1.0)
4712  return NULL;
4713 
4714  finalLambda = correctUngappedLambda/scaledInitialUngappedLambda;
4715 
4716  /* note that the final score matrix returned has an
4717  alphabet size of BLASTAA_SIZE, even if the initial
4718  PSSM has a smaller alphabet*/
4719  returnMatrix = (Int4 **)_PSIAllocateMatrix(db_seq_length,
4720  BLASTAA_SIZE,
4721  sizeof(Int4));
4722 
4723  for (index = 0; index < db_seq_length; index++) {
4724  for (inner_index = 0; inner_index < alphabet_size; inner_index++) {
4725  if (posMatrix[index][inner_index] <= BLAST_SCORE_MIN ||
4726  inner_index == AMINOACID_TO_NCBISTDAA['X']) {
4727  returnMatrix[index][inner_index] =
4728  posMatrix[index][inner_index];
4729  }
4730  else {
4731  temp = ((double)(posMatrix[index][inner_index])) * finalLambda;
4732  returnMatrix[index][inner_index] = (Int4)BLAST_Nint(temp);
4733  }
4734  }
4735  for (; inner_index < BLASTAA_SIZE; inner_index++) {
4736  returnMatrix[index][inner_index] = BLAST_SCORE_MIN;
4737  }
4738  }
4739 
4740  return returnMatrix;
4741 }
4742 
4743 /*------------------- compressed alphabet functions --------------------*/
4744 
4745 /** 2-D array mapping compressed letters to sets
4746  * of ordinary protein letters
4747  */
4749  [BLASTAA_SIZE+1];
4750 
4751 /** parse the string defining the conversion between the
4752  * ordinary protein alphabet and a compressed alphabet
4753  * @param trans_string The alphabet mappig [in]
4754  * @param table A map from protein letter to compressed letter.
4755  * Protein letter that have no compressed equivalent
4756  * will translate to value alphabet_size [out]
4757  * @param compressed_alphabet_size The anticipated size of the
4758  * compressed alphabet [in]
4759  * @param rev_table A (one-to-many) mapping from compressed letter to
4760  * protein letter. The list of protein letters in each
4761  * row of the table ends with a negative value [out]
4762  */
4763 static void s_BuildCompressedTranslation(const char *trans_string,
4764  Uint1 *table,
4765  Int4 compressed_alphabet_size,
4766  CompressedReverseLookup rev_table)
4767 {
4768  Int4 i, j;
4769  Int4 compressed_letter;
4770 
4771  for (i = 0; i < BLASTAA_SIZE; i++)
4772  table[i] = compressed_alphabet_size;
4773 
4774  for (i = j = compressed_letter = 0; trans_string[i] != 0; i++) {
4775 
4776  Int4 c = trans_string[i];
4777 
4778  if (isspace(c)) {
4779  compressed_letter++;
4780  j = 0;
4781  }
4782  else if (isalpha(c)) {
4783  Int4 aa_letter = AMINOACID_TO_NCBISTDAA[c];
4784  table[aa_letter] = compressed_letter;
4785  rev_table[compressed_letter][j++] = aa_letter;
4786  rev_table[compressed_letter][j] = -1;
4787  }
4788  }
4789 
4790  ASSERT(compressed_letter == compressed_alphabet_size - 1);
4791 }
4792 
4793 /** Calculate conditional probability of each letter in each group
4794  * @param sbp Structure containing alphabet information [in]
4795  * @param compressed_prob Array containing final probabilities [out]
4796  * @param compressed_alphabet_size size of the alphabet [in]
4797  * @param rev_table A (one-to-many) mapping from compressed letter to
4798  * protein letter. The list of protein letters in each
4799  * row of the table ends with a negative value [in]
4800  */
4802  double* compressed_prob,
4803  Int4 compressed_alphabet_size,
4804  CompressedReverseLookup rev_table)
4805 {
4806  Int4 i, letter;
4807  Blast_ResFreq *rfp;
4808 
4809  rfp = Blast_ResFreqNew(sbp);
4810  if (rfp == NULL)
4811  return -1;
4812 
4813  /* get the normalized background residue frequencies
4814  for the protein alphabet */
4815 
4816  Blast_ResFreqStdComp(sbp, rfp);
4817 
4818  for (i = 0; i < BLASTAA_SIZE; i++)
4819  compressed_prob[i] = 0.0;
4820 
4821  for (letter = 0; letter < compressed_alphabet_size; letter++) {
4822  double prob_sum = 0.;
4823 
4824  /* sum the frequencies of the protein letters making
4825  up this compressed letter */
4826  for (i = 0; i < BLASTAA_SIZE; i++) {
4827  Int4 aa = rev_table[letter][i];
4828 
4829  if (aa < 0)
4830  break;
4831  prob_sum += rfp->prob[aa];
4832  }
4833 
4834  /* compute P(i | letter) */
4835 
4836  for (i = 0; i < BLASTAA_SIZE; i++) {
4837  Int4 aa = rev_table[letter][i];
4838 
4839  if (aa < 0)
4840  break;
4841  compressed_prob[aa] = rfp->prob[aa] / prob_sum;
4842  }
4843  }
4844 
4845  Blast_ResFreqFree(rfp);
4846  return 0;
4847 }
4848 
4849 /** Compute a (non-square) score matrix for a compressed alphabet
4850  * @param sbp Structure containing alphabet and scoring information [in]
4851  * @param new_alphabet Structure defining the new alphabet, including the
4852  * final score matrix [in][out]
4853  * @param matrix_scale_factor Score matrix entries are scaled by this value [in]
4854  * @param rev_table A (one-to-many) mapping from compressed letter to
4855  * protein letter. The list of protein letters in each
4856  * row of the table ends with a negative value [in]
4857  */
4859  SCompressedAlphabet *new_alphabet,
4860  double matrix_scale_factor,
4861  CompressedReverseLookup rev_table)
4862 {
4863  double compressed_prob[BLASTAA_SIZE];
4864  double lambda;
4865  SFreqRatios * std_freqs;
4866  SBlastScoreMatrix *new_matrix;
4867  Int4 compressed_alphabet_size =
4868  new_alphabet->compressed_alphabet_size;
4869 
4870  /* get the ungapped lambda for the protein score matrix */
4871 
4873  if (lambda <= 0)
4874  return -1;
4875 
4876  matrix_scale_factor /= lambda;
4877 
4878  /* get the frequency ratios for the protein score matrix */
4879 
4880  std_freqs = _PSIMatrixFrequencyRatiosNew(sbp->name);
4881  if (std_freqs == NULL)
4882  return -2;
4883 
4884  /* sum the background frequencies of each compressed letter */
4885 
4886  if (s_GetCompressedProbs(sbp, compressed_prob,
4887  compressed_alphabet_size,
4888  rev_table) < 0) {
4889  _PSIMatrixFrequencyRatiosFree(std_freqs);
4890  return -3;
4891  }
4892 
4893  new_matrix = new_alphabet->matrix = SBlastScoreMatrixNew(
4894  BLASTAA_SIZE, compressed_alphabet_size);
4895  if (new_matrix) {
4896  Int4 **scores = new_matrix->data;
4897  Int4 i, q, s;
4898  const double min_freq = BLAST_SCORE_MIN / matrix_scale_factor;
4899 
4900  for (q = 0; q < BLASTAA_SIZE; q++) {
4901  for (s = 0; s < compressed_alphabet_size; s++) {
4902 
4903  double val = 0; /* combined frequency ratio */
4904 
4905  for (i = 0; i < BLASTAA_SIZE; i++) {
4906 
4907  Int4 aa = rev_table[s][i];
4908  if (aa < 0)
4909  break;
4910 
4911  /* find the frequency of the original letter, then
4912  adjust using letter's weight */
4913 
4914  val += std_freqs->data[q][aa] * compressed_prob[aa];
4915  }
4916 
4917  val = (val < 1e-8) ? min_freq : log(val);
4918  scores[q][s] = (Int4)BLAST_Nint(val * matrix_scale_factor);
4919  }
4920  }
4921  }
4922 
4923  _PSIMatrixFrequencyRatiosFree(std_freqs);
4924  return 0;
4925 }
4926 
4927 /* for more information see Edgar RC, "Local homology
4928  recognition and distance measures in linear time using
4929  compressed amino acid alphabets." PMID: 14729922.
4930  The strings below have letter groups sorted in order
4931  of decreasing combined residue frequency */
4932 
4933 /** 23-to-10 letter compressed alphabet. Based on SE-V(10) */
4934 static const char* s_alphabet10 = "IJLMV AST BDENZ KQR G FY P H C W";
4935 /** 23-to-15 letter compressed alphabet. Based on SE_B(14) */
4936 static const char* s_alphabet15 = "ST IJV LM KR EQZ A G BD P N F Y H C W";
4937 
4940  Int4 compressed_alphabet_size,
4941  double matrix_scale_factor)
4942 {
4943  SCompressedAlphabet *new_alphabet;
4944  CompressedReverseLookup rev_table;
4945  const char* alphabet_string = compressed_alphabet_size == 10 ?
4947 
4948  ASSERT(compressed_alphabet_size == 10 ||
4949  compressed_alphabet_size == 15);
4950 
4951  new_alphabet = (SCompressedAlphabet*)calloc(1,
4952  sizeof(SCompressedAlphabet));
4953 
4954  /* parse the compressed alphabet */
4955 
4956  new_alphabet->compressed_alphabet_size = compressed_alphabet_size;
4957  new_alphabet->compress_table = (Uint1*)malloc(BLASTAA_SIZE * sizeof(Uint1));
4958 
4959  s_BuildCompressedTranslation(alphabet_string,
4960  new_alphabet->compress_table,
4961  compressed_alphabet_size,
4962  rev_table);
4963 
4964  /* build the corresponding score matrix */
4965 
4966  if (s_BuildCompressedScoreMatrix(sbp, new_alphabet,
4967  matrix_scale_factor, rev_table) < 0) {
4968  return SCompressedAlphabetFree(new_alphabet);
4969  }
4970 
4971  return new_alphabet;
4972 }
4973 
4976 {
4977  if (alphabet) {
4978  SBlastScoreMatrixFree(alphabet->matrix);
4979  sfree(alphabet->compress_table);
4980  sfree(alphabet);
4981  }
4982  return NULL;
4983 }
4984 
4985 /**
4986  * Computes the adjustment to the lengths of the query and database sequences
4987  * that is used to compensate for edge effects when computing evalues.
4988  *
4989  * The length adjustment is an integer-valued approximation to the fixed
4990  * point of the function
4991  *
4992  * f(ell) = beta +
4993  * (alpha/lambda) * (log K + log((m - ell)*(n - N ell)))
4994  *
4995  * where m is the query length n is the length of the database and N is the
4996  * number of sequences in the database. The values beta, alpha, lambda and
4997  * K are statistical, Karlin-Altschul parameters.
4998  *
4999  * The value of the length adjustment computed by this routine, A,
5000  * will always be an integer smaller than the fixed point of
5001  * f(ell). Usually, it will be the largest such integer. However, the
5002  * computed length adjustment, A, will also be so small that
5003  *
5004  * K * (m - A) * (n - N * A) > MAX(m,n).
5005  *
5006  * Moreover, an iterative method is used to compute A, and under
5007  * unusual circumstances the iterative method may not converge.
5008  *
5009  * @param K the statistical parameter K
5010  * @param logK the natural logarithm of K
5011  * @param alpha_d_lambda the ratio of the statistical parameters
5012  * alpha and lambda (for ungapped alignments, the
5013  * value 1/H should be used)
5014  * @param beta the statistical parameter beta (for ungapped
5015  * alignments, beta == 0)
5016  * @param query_length the length of the query sequence
5017  * @param db_length the length of the database
5018  * @param db_num_seqs the number of sequences in the database
5019  * @param length_adjustment the computed value of the length adjustment [out]
5020  *
5021  * @return 0 if length_adjustment is known to be the largest integer less
5022  * than the fixed point of f(ell); 1 otherwise.
5023  */
5024 Int4
5026  double logK,
5027  double alpha_d_lambda,
5028  double beta,
5029  Int4 query_length,
5030  Int8 db_length,
5031  Int4 db_num_seqs,
5032  Int4 * length_adjustment)
5033 {
5034  Int4 i; /* iteration index */
5035  const Int4 kMaxIterations = 20; /* maximum allowed iterations */
5036  double m = (double) query_length;
5037  double n = (double) db_length;
5038  double N = (double) db_num_seqs;
5039 
5040  double ell; /* A float value of the length adjustment */
5041  double ss; /* effective size of the search space */
5042  double ell_min = 0, ell_max; /* At each iteration i,
5043  * ell_min <= ell <= ell_max. */
5044  Boolean converged = FALSE; /* True if the iteration converged */
5045  double ell_next = 0; /* Value the variable ell takes at iteration
5046  * i + 1 */
5047  /* Choose ell_max to be the largest nonnegative value that satisfies
5048  *
5049  * K * (m - ell) * (n - N * ell) > MAX(m,n)
5050  *
5051  * Use quadratic formula: 2 c /( - b + sqrt( b*b - 4 * a * c )) */
5052  { /* scope of a, mb, and c, the coefficients in the quadratic formula
5053  * (the variable mb is -b) */
5054  double a = N;
5055  double mb = m * N + n;
5056  double c = n * m - MAX(m, n) / K;
5057 
5058  if(c < 0) {
5059  *length_adjustment = 0;
5060  return 1;
5061  } else {
5062  ell_max = 2 * c / (mb + sqrt(mb * mb - 4 * a * c));
5063  }
5064  } /* end scope of a, mb and c */
5065 
5066  for(i = 1; i <= kMaxIterations; i++) { /* for all iteration indices */
5067  double ell_bar; /* proposed next value of ell */
5068  ell = ell_next;
5069  ss = (m - ell) * (n - N * ell);
5070  ell_bar = alpha_d_lambda * (logK + log(ss)) + beta;
5071  if(ell_bar >= ell) { /* ell is no bigger than the true fixed point */
5072  ell_min = ell;
5073  if(ell_bar - ell_min <= 1.0) {
5074  converged = TRUE;
5075  break;
5076  }
5077  if(ell_min == ell_max) { /* There are no more points to check */
5078  break;
5079  }
5080  } else { /* else ell is greater than the true fixed point */
5081  ell_max = ell;
5082  }
5083  if(ell_min <= ell_bar && ell_bar <= ell_max) {
5084  /* ell_bar is in range. Accept it */
5085  ell_next = ell_bar;
5086  } else { /* else ell_bar is not in range. Reject it */
5087  ell_next = (i == 1) ? ell_max : (ell_min + ell_max) / 2;
5088  }
5089  } /* end for all iteration indices */
5090  if(converged) { /* the iteration converged */
5091  /* If ell_fixed is the (unknown) true fixed point, then we
5092  * wish to set (*length_adjustment) to floor(ell_fixed). We
5093  * assume that floor(ell_min) = floor(ell_fixed) */
5094  *length_adjustment = (Int4) ell_min;
5095  /* But verify that ceil(ell_min) != floor(ell_fixed) */
5096  ell = ceil(ell_min);
5097  if( ell <= ell_max ) {
5098  ss = (m - ell) * (n - N * ell);
5099  if(alpha_d_lambda * (logK + log(ss)) + beta >= ell) {
5100  /* ceil(ell_min) == floor(ell_fixed) */
5101  *length_adjustment = (Int4) ell;
5102  }
5103  }
5104  } else { /* else the iteration did not converge. */
5105  /* Use the best value seen so far */
5106  *length_adjustment = (Int4) ell_min;
5107  }
5108 
5109  return converged ? 0 : 1;
5110 }
5111 
5112 /* UNUSED
5113 static double s_CalculateNormalProbability(double x_, double eps_)
5114 {
5115  double pi = 3.1415926535897932384626433832795;
5116  double x_max;
5117  double const_val, h, res;
5118  Int4 i, N;
5119 
5120  if(x_==0) return 0.5;
5121 
5122  eps_ = MIN(1.0,eps_);
5123 
5124  x_max = 10*eps_+sqrt(MAX(0.0,-2*log(eps_)));
5125 
5126  if(x_>=x_max) {
5127  double x=x_/sqrt(2.0);
5128  return 1-0.5*exp(-x*x)/(x*sqrt(pi))*(1-1.0/(2*x*2*x));
5129  };
5130 
5131  if(x_<=-x_max) {
5132  double x=x_/sqrt(2.0);
5133  return 0.5*exp(-x*x)/(-x*sqrt(pi))*(1-1.0/(2*x*2*x));
5134  };
5135 
5136  const_val = 1/sqrt(2.0*pi);
5137  N = (Int4) (fabs(x_)/eps_ + 1.5);
5138  h = x_/(double)N;
5139  res = 0.0;
5140 
5141  for (i=0;i<=N;i++) {
5142  double y=h*i;
5143  double tmp=exp(-0.5*y*y);
5144  res += (i==0||i==N)? 0.5*tmp : tmp;
5145  }
5146 
5147  res *= h;
5148  return 0.5+const_val*(res);
5149 }
5150 UNUSED */
5151 
5152 
5153 /*
5154  BlastSpougeStoE() -- given a score, return the associated Expect value
5155  using Spouge's FSC
5156 
5157  Error return value is -1.
5158 */
5159 double
5161  Blast_KarlinBlk* kbp,
5162  Blast_GumbelBlk* gbp,
5163  Int4 m_, Int4 n_)
5164 {
5165  // the score and lambda may have been rescaled. We will compute the scaling factor
5166  // and use it to scale a, alpha, and Sigma similarly.
5167  double scale_factor = kbp->Lambda / gbp->Lambda;
5168 
5169  // the pair-wise e-value must be scaled back to db-wise e-value
5170  double db_scale_factor = (gbp->db_length) ?
5171  (double)gbp->db_length/(double)n_ : 1.0;
5172 
5173  double lambda_ = kbp->Lambda;
5174  double k_ = kbp->K;
5175  double ai_hat_ = gbp->a * scale_factor;
5176  double bi_hat_ = gbp->b;
5177  double alphai_hat_= gbp->Alpha * scale_factor;
5178  double betai_hat_ = gbp->Beta;
5179  double sigma_hat_ = gbp->Sigma * scale_factor;
5180  double tau_hat_ = gbp->Tau;
5181 
5182  /* here we consider symmetric matrix only */
5183  double aj_hat_ = ai_hat_;
5184  double bj_hat_ = bi_hat_;
5185  double alphaj_hat_= alphai_hat_;
5186  double betaj_hat_ = betai_hat_;
5187 
5188  /* this is 1/sqrt(2.0*PI) */
5189  static double const_val = 0.39894228040143267793994605993438;
5190 
5191  double m_li_y, vi_y, sqrt_vi_y, m_F, P_m_F;
5192  double n_lj_y, vj_y, sqrt_vj_y, n_F, P_n_F;
5193  double c_y, p1, p2, area;
5194  double e_value;
5195 
5196  m_li_y = m_ - (ai_hat_*y_ + bi_hat_);
5197  vi_y = MAX(2.0*alphai_hat_/lambda_, alphai_hat_*y_+betai_hat_);
5198  sqrt_vi_y = sqrt(vi_y);
5199  m_F = m_li_y/sqrt_vi_y;
5200  P_m_F = ErfC(-m_F / sqrt(2.0)) / 2.0;
5201  p1 = m_li_y * P_m_F + sqrt_vi_y * const_val * exp(-0.5*m_F*m_F);
5202 
5203  n_lj_y = n_ - (aj_hat_*y_ + bj_hat_);
5204  vj_y = MAX(2.0*alphaj_hat_/lambda_, alphaj_hat_*y_+betaj_hat_);
5205  sqrt_vj_y = sqrt(vj_y);
5206  n_F = n_lj_y/sqrt_vj_y;
5207  P_n_F = ErfC(-n_F / sqrt(2.0)) / 2.0;
5208  p2 = n_lj_y * P_n_F + sqrt_vj_y * const_val * exp(-0.5*n_F*n_F);
5209 
5210  c_y = MAX(2.0*sigma_hat_/lambda_, sigma_hat_*y_+tau_hat_);
5211  area = p1 * p2 + c_y * P_m_F * P_n_F;
5212 
5213  e_value = area * k_ * exp(-lambda_ * y_) * db_scale_factor;
5214  ASSERT(e_value >= 0.0);
5215 
5216  return e_value;
5217 }
5218 
5219 Int4
5221  Blast_KarlinBlk* kbp,
5222  Blast_GumbelBlk* gbp,
5223  Int4 m, Int4 n)
5224 {
5225  Int4 a=0, b, c;
5226  double e;
5227  double db_scale_factor = (gbp->db_length) ?
5228  (double)gbp->db_length : 1.0;
5229 
5230  b = MAX((int)(log(db_scale_factor/e0) / kbp->Lambda), 2);
5231 
5232  e = BLAST_SpougeStoE(b, kbp, gbp, m, n);
5233 
5234  if (e > e0) {
5235  while (e> e0) {
5236  a = b;
5237  b *= 2;
5238  e = BLAST_SpougeStoE(b, kbp, gbp, m, n);
5239  }
5240  } else {
5241  a = 0;
5242  }
5243  while(b-a > 1) {
5244  c = (a+b)/2;
5245  e = BLAST_SpougeStoE(c, kbp, gbp, m, n);
5246  if (e> e0) {
5247  a = c;
5248  } else {
5249  b = c;
5250  }
5251  }
5252  return a;
5253 }
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
const char * kBlastErrMsg_CantCalculateUngappedKAParams
Definition: blast_message.c:38
@ eBlastSevError
Definition: blast_message.h:58
@ eBlastSevWarning
Definition: blast_message.h:57
Int2 Blast_MessageWrite(Blast_Message **blast_msg, EBlastSeverity severity, int context, const char *message)
Writes a message to a structure.
const int kBlastMessageNoContext
Declared in blast_message.h as extern const.
Definition: blast_message.c:36
Boolean Blast_QueryIsPssm(EBlastProgramType p)
Returns true if the query is PSSM.
Definition: blast_program.c:46
Boolean Blast_QueryIsTranslated(EBlastProgramType p)
Returns true if the query is translated.
Definition: blast_program.c:60
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
@ eBlastTypeBlastx
Definition: blast_program.h:75
@ eBlastTypeRpsTblastn
Definition: blast_program.h:85
@ eBlastTypeTblastx
Definition: blast_program.h:79
void ** _PSIAllocateMatrix(unsigned int ncols, unsigned int nrows, unsigned int data_type_sz)
Generic 2 dimensional matrix allocator.
void ** _PSIDeallocateMatrix(void **matrix, unsigned int ncols)
Generic 2 dimensional matrix deallocator.
Private interface for Position Iterated BLAST API, contains the PSSM generation engine.
static Int4 pam30_prefs[11]
Quality values for PAM30 matrix, each element corresponds to same element number in array pam30_value...
Definition: blast_stat.c:393
static Int4 prot_identity_prefs[2]
Definition: blast_stat.c:583
static Int2 BlastScoreBlkProteinMatrixRead(BlastScoreBlk *sbp, FILE *fp)
Read in the matrix from the FILE *fp.
Definition: blast_stat.c:1351
static Blast_ResComp * BlastResCompDestruct(Blast_ResComp *rcp)
Deallocates Blast_ResComp structure and associated arrays.
Definition: blast_stat.c:1934
BlastScoreBlk * BlastScoreBlkFree(BlastScoreBlk *sbp)
Deallocates BlastScoreBlk as well as all associated structures.
Definition: blast_stat.c:965
static BLAST_LetterProb Robinson_prob[]
amino acid background frequencies from Robinson and Robinson
Definition: blast_stat.c:1795
Int1 CompressedReverseLookup[BLASTAA_SIZE+1][BLASTAA_SIZE+1]
2-D array mapping compressed letters to sets of ordinary protein letters
Definition: blast_stat.c:4749
#define BLOSUM45_VALUES_MAX
Number of different combinations supported for BLOSUM45.
Definition: blast_stat.c:182
static Blast_GumbelBlk * s_BlastGumbelBlkNew()
Definition: blast_stat.c:838
static Int4 blosum45_prefs[14]
Quality values for BLOSUM45 matrix, each element corresponds to same element number in array blosum45...
Definition: blast_stat.c:200
Int2 BLAST_GetProteinGapExistenceExtendParams(const char *matrixName, Int4 *gap_existence, Int4 *gap_extension)
Extract the recommended gap existence and extension values.
Definition: blast_stat.c:3374
double BLAST_GapDecayDivisor(double decayrate, unsigned nsegs)
Compute a divisor used to weight the evalue of a collection of "nsegs" distinct alignments.
Definition: blast_stat.c:4063
static double s_GetUngappedBeta(Int4 reward, Int4 penalty)
Returns the beta statistical parameter value, given the nucleotide substitution scores.
Definition: blast_stat.c:3939
static const array_of_8 blastn_values_5_4[]
Karlin-Altschul parameter values for substitution scores 5 and -4.
Definition: blast_stat.c:722
static double s_BlastSumP(Int4 r, double s)
Estimate the Sum P-value by calculation or interpolation, as appropriate.
Definition: blast_stat.c:4341
#define BLAST_SCORE_RANGE_MAX
maximum allowed range of BLAST scores.
Definition: blast_stat.c:56
double BLAST_LargeGapSumE(Int2 num, double xsum, Int4 query_length, Int4 subject_length, Int8 searchsp_eff, double weight_divisor)
Calculates the e-value if a collection of distinct alignments with arbitrarily large gaps between the...
Definition: blast_stat.c:4516
static array_of_8 blosum80_values[10]
Supported values (gap-existence, extension, etc.) for BLOSUM80.
Definition: blast_stat.c:290
double array_of_8[11]
Holds values (gap-opening, extension, etc.) for a matrix.
Definition: blast_stat.c:78
static const array_of_8 blastn_values_3_4[]
Karlin-Altschul parameter values for substitution scores 3 and -4.
Definition: blast_stat.c:687
static Int4 blosum50_prefs[16]
Quality values for BLOSUM50 matrix, each element corresponds to same element number in array blosum50...
Definition: blast_stat.c:238
#define PAM70_VALUES_MAX
Number of different combinations supported for PAM70.
Definition: blast_stat.c:408
Int2 Blast_GetNuclAlphaBeta(Int4 reward, Int4 penalty, Int4 gap_open, Int4 gap_extend, Blast_KarlinBlk *kbp, Boolean gapped_calculation, double *alpha, double *beta)
Extract the alpha and beta settings for these substitution and gap scores.
Definition: blast_stat.c:3949
void Blast_FillResidueProbability(const Uint1 *sequence, Int4 length, double *resProb)
Given a sequence of 'length' amino acid residues, compute the probability of each residue and put tha...
Definition: blast_stat.c:4561
#define BLOSUM90_VALUES_MAX
Number of different combinations supported for BLOSUM90.
Definition: blast_stat.c:316
static Int4 blosum90_prefs[8]
Quality values for BLOSUM90 matrix, each element corresponds to same element number in array blosum90...
Definition: blast_stat.c:328
static ListNode * BlastMatrixValuesDestruct(ListNode *vnp)
Free linked list of MatrixValues and all associated data.
Definition: blast_stat.c:2929
struct BLAST_LetterProb BLAST_LetterProb
Records probability of letter appearing in sequence.
#define BLAST_KARLIN_LAMBDA0_DEFAULT
Initial guess for the value of Lambda in BlastKarlinLambdaNR.
Definition: blast_stat.c:70
static Int2 s_SplitArrayOf8(const array_of_8 *input, const array_of_8 **normal, const array_of_8 **non_affine, Boolean *split)
Splits an ArrayOf8 into two arrays of supported gap costs.
Definition: blast_stat.c:3152
double Blast_KarlinLambdaNR(Blast_ScoreFreq *sfp, double initialLambdaGuess)
Calculates the parameter Lambda given an initial guess for its value.
Definition: blast_stat.c:2567
Int4 ** RPSRescalePssm(double scalingFactor, Int4 rps_query_length, const Uint1 *rps_query_seq, Int4 db_seq_length, Int4 **posMatrix, BlastScoreBlk *sbp)
Rescale the PSSM, using composition-based statistics, for use with RPS BLAST.
Definition: blast_stat.c:4677
Blast_ResFreq * Blast_ResFreqFree(Blast_ResFreq *rfp)
Deallocates Blast_ResFreq and prob0 element.
Definition: blast_stat.c:1689
SCompressedAlphabet * SCompressedAlphabetFree(SCompressedAlphabet *alphabet)
Free a compressed alphabet and score matrix.
Definition: blast_stat.c:4975
static Int2 BlastScoreBlkProteinMatrixLoad(BlastScoreBlk *sbp)
Sets sbp->matrix->data field using sbp->name field using the matrices in the toolkit (util/tables/raw...
Definition: blast_stat.c:1539
char * BLAST_PrintMatrixMessage(const char *matrix_name, Boolean standard_only)
Prints a messages about the allowed matrices, BlastKarlinBlkGappedFill should return 1 before this is...
Definition: blast_stat.c:3760
double BLAST_SmallGapSumE(Int4 starting_points, Int2 num, double xsum, Int4 query_length, Int4 subject_length, Int8 searchsp_eff, double weight_divisor)
Calculates the e-value for alignments with "small" gaps (typically under fifty residues/basepairs) fo...
Definition: blast_stat.c:4402
static const array_of_8 blastn_values_1_2[]
Karlin-Altschul parameter values for substitution scores 1 and -2.
Definition: blast_stat.c:660
double BLAST_SpougeStoE(Int4 y_, Blast_KarlinBlk *kbp, Blast_GumbelBlk *gbp, Int4 m_, Int4 n_)
Calculates the Expect value based upon the Spouge's FSC method.
Definition: blast_stat.c:5160
static MatrixInfo * MatrixInfoNew(const char *name, array_of_8 *values, Int4 *prefs, Int4 max_number)
Allocates New MatrixInfo*.
Definition: blast_stat.c:2910
#define BLOSUM62_VALUES_MAX
Number of different combinations supported for BLOSUM62.
Definition: blast_stat.c:257
double BLAST_KarlinEtoP(double x)
Convert an E-value to a P-value.
Definition: blast_stat.c:4173
Int2 BlastScoreBlkNuclMatrixCreate(BlastScoreBlk *sbp)
Fill in the matrix for blastn using the penaly and rewards The query sequence alphabet is blastna,...
Definition: blast_stat.c:1060
Int2 Blast_KarlinBlkGappedCalc(Blast_KarlinBlk *kbp, Int4 gap_open, Int4 gap_extend, const char *matrix_name, Blast_Message **error_return)
Fills in lambda, H, and K values, as calculated by Stephen Altschul in Methods in Enzy.
Definition: blast_stat.c:3527
static array_of_8 blosum62_values[12]
Supported values (gap-existence, extension, etc.) for BLOSUM62.
Definition: blast_stat.c:258
static void RPSFillScores(Int4 **matrix, Int4 matrixLength, double *queryProbArray, double *scoreArray, Blast_ScoreFreq *return_sfp, Int4 range, Int4 alphabet_size)
the routine RPSFillScores computes the probability of each score weighted by the probability of each ...
Definition: blast_stat.c:4633
SBlastScoreMatrix * SBlastScoreMatrixFree(SBlastScoreMatrix *matrix)
Deallocates SBlastScoreMatrix structure.
Definition: blast_stat.c:732
static Int2 Blast_ResFreqResComp(const BlastScoreBlk *sbp, Blast_ResFreq *rfp, const Blast_ResComp *rcp)
Calculate the residue frequencies associated with the provided ResComp This function takes into accou...
Definition: blast_stat.c:2044
#define PROT_IDENTITY_VALUES_MAX
Definition: blast_stat.c:577
Blast_KarlinBlk * Blast_KarlinBlkNew(void)
Callocs a Blast_KarlinBlk.
Definition: blast_stat.c:2861
static MatrixInfo * MatrixInfoDestruct(MatrixInfo *matrix_info)
Deallocates MatrixInfo as well as name string.
Definition: blast_stat.c:2890
Blast_KarlinBlk * Blast_KarlinBlkFree(Blast_KarlinBlk *kbp)
Deallocates the KarlinBlk.
Definition: blast_stat.c:956
static Int2 BlastScoreBlkMaxScoreSet(BlastScoreBlk *sbp)
Sets maximum and minimum scores on the BlastScoreBlk for a given matrix.
Definition: blast_stat.c:1500
Int2 BLAST_ScoreSetAmbigRes(BlastScoreBlk *sbp, char ambiguous_res)
Set the ambiguous residue (e.g, 'N', 'X') in the BlastScoreBlk*.
Definition: blast_stat.c:1012
static double BlastKarlinLtoH(Blast_ScoreFreq *sfp, double lambda)
Calculate H, the relative entropy of the p's and q's.
Definition: blast_stat.c:2607
Int2 Blast_KarlinBlkUngappedCalc(Blast_KarlinBlk *kbp, Blast_ScoreFreq *sfp)
Computes the parameters lambda, H K for use in calculating the statistical significance of high-scori...
Definition: blast_stat.c:2699
double BLAST_KarlinPtoE(double p)
Convert a P-value to an E-value.
Definition: blast_stat.c:4159
static const array_of_8 blastn_values_2_7[]
Karlin-Altschul parameter values for substitution scores 2 and -7.
Definition: blast_stat.c:629
struct MatrixInfo MatrixInfo
Used to temporarily store matrix values for retrieval.
static const array_of_8 blastn_values_1_3[]
Karlin-Altschul parameter values for substitution scores 1 and -3.
Definition: blast_stat.c:638
static array_of_8 blosum90_values[8]
Supported values (gap-existence, extension, etc.) for BLOSUM90.
Definition: blast_stat.c:317
#define BLOSUM80_VALUES_MAX
Number of different combinations supported for BLOSUM80.
Definition: blast_stat.c:289
static double BlastKarlinLHtoK(Blast_ScoreFreq *sfp, double lambda, double H)
The following procedure computes K.
Definition: blast_stat.c:2247
static Int4 pam250_prefs[16]
Quality values for PAM250 matrix, each element corresponds to same element number in array pam250_val...
Definition: blast_stat.c:359
Int2 Blast_ScoreBlkKbpUngappedCalc(EBlastProgramType program, BlastScoreBlk *sbp, Uint1 *query, const BlastQueryInfo *query_info, Blast_Message **blast_message)
Calculate and fill the ungapped Karlin-Altschul parameters in the BlastScoreBlk structure (fields kbp...
Definition: blast_stat.c:2737
static const array_of_8 blastn_values_2_5[]
Karlin-Altschul parameter values for substitution scores 2 and -5.
Definition: blast_stat.c:651
struct SRombergCbackArgs SRombergCbackArgs
Internal data structure used by Romberg integration callbacks.
Int2 Blast_ResFreqStdComp(const BlastScoreBlk *sbp, Blast_ResFreq *rfp)
Calculates residues frequencies given a standard distribution.
Definition: blast_stat.c:1887
static const array_of_8 blastn_values_3_2[]
Karlin-Altschul parameter values for substitution scores 3 and -2.
Definition: blast_stat.c:717
static Int2 s_BuildCompressedScoreMatrix(BlastScoreBlk *sbp, SCompressedAlphabet *new_alphabet, double matrix_scale_factor, CompressedReverseLookup rev_table)
Compute a (non-square) score matrix for a compressed alphabet.
Definition: blast_stat.c:4858
char * BLAST_PrintAllowedValues(const char *matrix_name, Int4 gap_open, Int4 gap_extend)
Prints a messages about the allowed open etc values for the given matrix, BlastKarlinBlkGappedFill sh...
Definition: blast_stat.c:3787
Int2 Blast_KarlinBlkGappedLoadFromTables(Blast_KarlinBlk *kbp, Int4 gap_open, Int4 gap_extend, const char *matrix_name, Boolean standard_only)
Attempts to fill KarlinBlk for given gap opening, extensions etc.
Definition: blast_stat.c:3577
static Int2 s_AdjustGapParametersByGcd(array_of_8 *normal, array_of_8 *linear, int size, Int4 *gap_existence_max, Int4 *gap_extend_max, int divisor)
Adjust Lambda and H if reward and penalty have a non-1 gcd.
Definition: blast_stat.c:3186
static Int4 blosum62_prefs[12]
Quality values for BLOSUM62 matrix, each element corresponds to same element number in array blosum62...
Definition: blast_stat.c:273
static Int2 Blast_ResFreqClr(const BlastScoreBlk *sbp, Blast_ResFreq *rfp)
Sets prob elements of Blast_ResFreq to zero.
Definition: blast_stat.c:2024
static const array_of_8 blastn_values_1_4[]
Karlin-Altschul parameter values for substitution scores 1 and -4.
Definition: blast_stat.c:617
Int2 Blast_GetStdAlphabet(Uint1 alphabet_code, Uint1 *residues, Uint4 residues_size)
Fills a buffer with the 'standard' alphabet (given by STD_AMINO_ACID_FREQS[index]....
Definition: blast_stat.c:1863
SCompressedAlphabet * SCompressedAlphabetNew(BlastScoreBlk *sbp, Int4 compressed_alphabet_size, double matrix_scale_factor)
Allocate a new compressed alphabet and score matrix.
Definition: blast_stat.c:4939
Blast_ScoreFreq * Blast_ScoreFreqFree(Blast_ScoreFreq *sfp)
Deallocates the score frequencies structure.
Definition: blast_stat.c:941
static BLAST_LetterProb nt_prob[]
nucleotide probabilities (25% each letter)
Definition: blast_stat.c:1820
static Int2 BlastResCompStr(const BlastScoreBlk *sbp, Blast_ResComp *rcp, char *str, Int4 length)
Store the composition of a (query) string.
Definition: blast_stat.c:1984
Int2 Blast_GumbelBlkCalc(Blast_GumbelBlk *gbp, Int4 gap_open, Int4 gap_extend, const char *matrix_name, Blast_Message **error_return)
Fills in gumbel parameters to estimate p-value using FSC.
Definition: blast_stat.c:3652
static Blast_ResComp * BlastResCompNew(const BlastScoreBlk *sbp)
Allocated the Blast_ResComp* for a given alphabet.
Definition: blast_stat.c:1952
static const char * s_alphabet10
23-to-10 letter compressed alphabet.
Definition: blast_stat.c:4934
static const array_of_8 blastn_values_1_5[]
Supported substitution and gap costs with corresponding quality values for nucleotide sequence compar...
Definition: blast_stat.c:611
static Int4 BlastKarlinEtoS_simple(double E, const Blast_KarlinBlk *kbp, Int8 searchsp)
Calculates score from expect value and search space.
Definition: blast_stat.c:4024
#define BLAST_KARLIN_K_SUMLIMIT_DEFAULT
K_SUMLIMIT_DEFAULT == sumlimit used in BlastKarlinLHtoK()
Definition: blast_stat.c:64
SBlastScoreMatrix * SBlastScoreMatrixNew(size_t ncols, size_t nrows)
Allocates a new SBlastScoreMatrix structure of the specified dimensions.
Definition: blast_stat.c:760
static array_of_8 pam70_values[9]
Supported values (gap-existence, extension, etc.) for PAM70.
Definition: blast_stat.c:409
Int2 Blast_KarlinBlkNuclGappedCalc(Blast_KarlinBlk *kbp, Int4 gap_open, Int4 gap_extend, Int4 reward, Int4 penalty, Blast_KarlinBlk *kbp_ungap, Boolean *round_down, Blast_Message **error_return)
Retrieves Karlin-Altschul parameters from precomputed tables, given the substitution and gap scores.
Definition: blast_stat.c:3836
static double s_BlastSumPCalc(int r, double s)
Evaluate the following double integral, where r = number of segments.
Definition: blast_stat.c:4253
Blast_ResFreq * Blast_ResFreqNew(const BlastScoreBlk *sbp)
Allocates a new Blast_ResFreq structure and fills in the prob element based upon the contents of sbp.
Definition: blast_stat.c:1708
#define STD_AMINO_ACID_FREQS
points to the standard amino acid frequencies to use.
Definition: blast_stat.c:1818
double BLAST_KarlinStoE_simple(Int4 S, Blast_KarlinBlk *kbp, Int8 searchsp)
Calculates the Expect value based upon the search space and some Karlin-Altschul parameters.
Definition: blast_stat.c:4141
static array_of_8 pam30_values[11]
Supported values (gap-existence, extension, etc.) for PAM30.
Definition: blast_stat.c:379
int BlastScoreBlkCheck(BlastScoreBlk *sbp)
Check that score blk is valid, returns zero if it is.
Definition: blast_stat.c:853
static Int2 Blast_ResFreqNormalize(const BlastScoreBlk *sbp, Blast_ResFreq *rfp, double norm)
Normalizes all the residue frequencies and then normalizes them to "norm".
Definition: blast_stat.c:1835
Int2 Blast_ScoreBlkKbpIdealCalc(BlastScoreBlk *sbp)
Calculates the Karlin-Altschul parameters assuming standard residue compositions for the query and su...
Definition: blast_stat.c:2832
static Blast_GumbelBlk * s_BlastGumbelBlkFree(Blast_GumbelBlk *gbp)
Definition: blast_stat.c:846
static array_of_8 blosum45_values[14]
Supported values (gap-existence, extension, etc.) for BLOSUM45.
Definition: blast_stat.c:183
Boolean BLAST_CheckRewardPenaltyScores(Int4 reward, Int4 penalty)
Check the validity of the reward and penalty scores.
Definition: blast_stat.c:3454
double BLAST_UnevenGapSumE(Int4 query_start_points, Int4 subject_start_points, Int2 num, double xsum, Int4 query_length, Int4 subject_length, Int8 searchsp_eff, double weight_divisor)
Calculates the e-value of a collection multiple distinct alignments with asymmetric gaps between the ...
Definition: blast_stat.c:4475
static double s_OuterIntegralCback(double x, void *vp)
Callback for the Romberg integration function.
Definition: blast_stat.c:4200
void BLAST_GetAlphaBeta(const char *matrixName, double *alpha, double *beta, Boolean gapped, Int4 gap_open, Int4 gap_extend, const Blast_KarlinBlk *kbp_ungapped)
Extract the alpha and beta settings for this matrixName, and these gap open and gap extension costs.
Definition: blast_stat.c:3094
static double RPSfindUngappedLambda(const char *matrixName)
Gets the ungapped lambda calculated for the matrix in question given standard residue composition for...
Definition: blast_stat.c:4594
Blast_ScoreFreq * Blast_ScoreFreqNew(Int4 score_min, Int4 score_max)
Creates a new structure to keep track of score frequencies for a scoring system.
Definition: blast_stat.c:2113
Int2 BLAST_Cutoffs(Int4 *S, double *E, Blast_KarlinBlk *kbp, Int8 searchsp, Boolean dodecay, double gap_decay_rate)
Calculate the cutoff score from the expected number of HSPs or vice versa.
Definition: blast_stat.c:4074
static Int2 Blast_ResFreqString(const BlastScoreBlk *sbp, Blast_ResFreq *rfp, char *string, Int4 length)
Fills in residue frequences for a given sequence.
Definition: blast_stat.c:2078
static Int2 BlastKarlinReportAllowedValues(const char *matrix_name, Blast_Message **error_return)
Fills in error_return with strings describing the allowed values.
Definition: blast_stat.c:3478
static Int2 BlastScoreBlkNucleotideMatrixRead(BlastScoreBlk *sbp, FILE *fp)
Read in a custom nucleotide matrix from the FILE *fp.
Definition: blast_stat.c:1151
static const array_of_8 blastn_values_2_3[]
Karlin-Altschul parameter values for substitution scores 2 and -3.
Definition: blast_stat.c:674
static double NlmKarlinLambdaNR(double *probs, Int4 d, Int4 low, Int4 high, double lambda0, double tolx, Int4 itmax, Int4 maxNewton, Int4 *itn)
Find positive solution to.
Definition: blast_stat.c:2491
static void s_BuildCompressedTranslation(const char *trans_string, Uint1 *table, Int4 compressed_alphabet_size, CompressedReverseLookup rev_table)
parse the string defining the conversion between the ordinary protein alphabet and a compressed alpha...
Definition: blast_stat.c:4763
SPsiBlastScoreMatrix * SPsiBlastScoreMatrixNew(size_t ncols)
Allocates a new SPsiBlastScoreMatrix structure of dimensions ncols by BLASTAA_SIZE.
Definition: blast_stat.c:805
Int2 Blast_KarlinBlkCopy(Blast_KarlinBlk *kbp_to, Blast_KarlinBlk *kbp_from)
Copies contents of one Karlin block to another.
Definition: blast_stat.c:2871
Int2 BLAST_GetNucleotideGapExistenceExtendParams(Int4 reward, Int4 penalty, Int4 *gap_existence, Int4 *gap_extension)
Extract the recommended gap existence and extension values.
Definition: blast_stat.c:3402
#define PAM250_VALUES_MAX
Number of different combinations supported for PAM250.
Definition: blast_stat.c:339
#define BLOSUM50_VALUES_MAX
Number of different combinations supported for BLOSUM50.
Definition: blast_stat.c:218
#define PAM30_VALUES_MAX
Number of different combinations supported for PAM30.
Definition: blast_stat.c:378
#define BLAST_KARLIN_K_ITER_MAX
upper limit on iterations for BlastKarlinLHtoK
Definition: blast_stat.c:72
static array_of_8 blosum50_values[16]
Supported values (gap-existence, extension, etc.) for BLOSUM50.
Definition: blast_stat.c:219