NCBI C++ ToolKit
composition_adjustment.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* ===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================*/
24 
25 /** @file composition_adjustment.c
26  * Highest level functions to solve the optimization problem for
27  * compositional score matrix adjustment.
28  *
29  * @author Yi-Kuo Yu, Alejandro Schaffer, E. Michael Gertz
30  */
31 
32 #include <limits.h>
33 #include <assert.h>
41 
42 
43 /**positions of true characters in protein alphabet*/
45 {1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,22};
46 
47 /**
48  * conversion from 26 letter NCBIstdaa alphabet to 20 letter order
49  * for true amino acids: ARNDCQEGHILKMFPSTWYV. This order is
50  * alphabetical in the standard three-letter abbreviation of each
51  * amino acid */
53  {(-1), 0, (-1), 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15,
54  16, 19, 17, (-1), 18, (-1), (-1), (-1), (-1), (-1)};
55 
56 
57 /**
58  * Desired margin between an end of region used for computing a
59  * composition, and the nearest StopChar; the desired margin may
60  * not be attained. */
61 static const int kCompositionMargin = 20;
62 
63 /** iteration limit for Newton's method for computing Lambda */
64 static const int kLambdaIterationLimit = 100;
65 /** bound on error for estimating lambda */
66 static const double kLambdaErrorTolerance = 0.0000001;
67 /** bound on the difference between the expected value of
68  * exp(lambda x S) and 1 */
69 static const double kLambdaFunctionTolerance = 0.00001;
70 
71 
72 /** bound on error for Newton's method */
73 static const double kCompoAdjustErrTolerance = 0.00000001;
74 /** iteration limit for Newton's method */
75 static const int kCompoAdjustIterationLimit = 2000;
76 /** relative entropy of BLOSUM62 */
77 static const double kFixedReBlosum62 = 0.44;
78 /** largest permitted value of score(i,X), when scores are computed using
79  composition adjustment */
80 static const double kMaximumXscore = -1.0;
81 
82 /** The BLOSUM62 matrix for the ARND... alphabet, using standard scale */
84 {{4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0},
85  {-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3},
86  {-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3},
87  {-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3},
88  {0,-3,-3,-3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1},
89  {-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2},
90  {-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2},
91  {0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3},
92  {-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3},
93  {-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3},
94  {-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1},
95  {-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2},
96  {-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1},
97  {-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1},
98  {-1,-2,-2,-1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2},
99  {1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2},
100  {0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0},
101  {-3,-3,-4,-4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3},
102  {-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1},
103  {0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4}};
104 
105 /* Documented in composition_adjustment.h. */
106 void
107 Blast_ApplyPseudocounts(double * probs20,
108  int number_of_observations,
109  const double * background_probs20,
110  int pseudocounts)
111 {
112  int i; /* loop index */
113  double weight; /* weight assigned to pseudocounts */
114  double sum; /* sum of the observed frequencies */
115  /* pseudocounts as a double */
116  double dpseudocounts = pseudocounts;
117 
118  /* Normalize probabilities */
119  sum = 0.0;
120  for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
121  sum += probs20[i];
122  }
123  if (sum == 0.0) { /* Can't normalize a zero vector */
124  sum = 1.0;
125  }
126  weight = dpseudocounts / (number_of_observations + dpseudocounts);
127  for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
128  probs20[i] = (1.0 - weight) * probs20[i] / sum
129  + weight * background_probs20[i];
130  }
131 }
132 
133 
134 /* Documented in composition_adjustment.h. */
135 double
136 Blast_GetRelativeEntropy(const double A[], const double B[])
137 {
138  int i; /* loop index over letters */
139  double temp; /* intermediate term */
140  double value = 0.0; /* square of relative entropy */
141 
142  for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
143  temp = (A[i] + B[i]) / 2;
144  if (temp > 0) {
145  if (A[i] > 0) {
146  value += A[i] * log(A[i] / temp) / 2;
147  }
148  if (B[i] > 0) {
149  value += B[i] * log(B[i] / temp) / 2;
150  }
151  }
152  }
153  if (value < 0) { /* must be numerical rounding error */
154  value = 0;
155  }
156  return sqrt(value);
157 }
158 
159 
160 
161 /* Blast_CalcLambdaFullPrecision -- interface documented in
162  * composition_adjustment.h.
163  *
164  * If the average score for a composition is negative, and the maximum
165  * score that occurs with nonzero probability is positive, then
166  * statistical parameter Lambda exists and is the unique, positive
167  * solution to
168  *
169  * phi(lambda) = sum_{i,j} P_1(i) P_2(j) exp(S_{ij} lambda) - 1 = 0,
170  *
171  * where S_{ij} is the matrix "score" and P_1 and P_2 are row_probs and
172  * col_probs respectively.
173  *
174  * It is simpler to solve this problem in x = exp(-lambda) than it is
175  * to solve it in lambda, because we know that for x, a solution lies
176  * in [0,1]. Furthermore, if M is the largest S_{ij} so that P_1(i)
177  * and P_2(j) are nonzero, then the function
178  *
179  * f(x) = -x^M - sum_{i,j} P_1(i) P_2(j) x^{M - S_{ij}},
180  *
181  * obtained by multiplying phi(lambda) by x^M, is well behaved in
182  * (0,1] -- if the scores are integers, it is a polynomial. Since x = 0
183  * is not a solution, x solves f(x) = 0 in [0,1), if and only if
184  * lambda = -ln(x) is a positive solution to phi(lambda). Therefore,
185  * we may define a safeguarded Newton iteration to find a solution of
186  * f(x) = 0.
187  *
188  * For the most part, this is a standard safeguarded Newton iteration:
189  * define an interval of uncertainty [a,b] with f(a) > 0 and f(b) < 0
190  * (except for the initial value b = 1, where f(b) = 0); evaluate the
191  * function and use the sign of that value to shrink the interval of
192  * uncertainty; compute a Newton step; and if the Newton step suggests
193  * a point outside the interval of uncertainty or fails to decrease
194  * the function sufficiently, then bisect. There are three further
195  * details needed to understand the algorithm:
196  *
197  * 1) If y the unique solution in [0,1], then f is positive to the left of
198  * y, and negative to the right. Therefore, we may determine whether
199  * the Newton step -f(x)/f'(x) is moving toward, or away from, y by
200  * examining the sign of f'(x). If f'(x) >= 0, we bisect, instead
201  * of taking the Newton step.
202  * 2) There is a neighborhood around x = 1 for which f'(x) >= 0,
203  * so (1) prevents convergence to x = 1.
204  * 3) Conditions like fabs(p) < lambda_tolerance * x * (1-x) are used in
205  * convergence criteria because these values translate to a bound
206  * on the relative error in lambda. This is proved in the
207  * "Blast Scoring Parameters" document that accompanies the BLAST
208  * code.
209  *
210  * We have observed that in typical cases the safeguarded Newton
211  * iteration on f(x) requires half the iterations of a Newton
212  * iteration on phi(lambda). More importantly, the iteration on f(x)
213  * is robust and doesn't overflow; defining a robust safeguarded
214  * Newton iteration on phi(lambda) that cannot converge to zero and
215  * that is protected against overflow is more difficult. So (despite
216  * the length of this comment) the Newton iteration on f(x) is the
217  * simpler solution.
218  */
219 void
220 Blast_CalcLambdaFullPrecision(double * plambda, int *piterations,
221  double **score, int alphsize,
222  const double row_prob[],
223  const double col_prob[],
224  double lambda_tolerance,
225  double function_tolerance,
226  int max_iterations)
227 {
228  double f = 4; /* The current function value; initially
229  set to a value greater than any possible
230  real value of f */
231  double left = 0, right = 1; /* (left, right) is an interval containing
232  a solution */
233  double x = 0.367879441171; /* The current iterate; initially exp(-1) */
234  int is_newton = 0; /* true if the last iteration was a Newton
235  step; initially false */
236  int i, j, k; /* iteration indices */
237  /* maximum score that occurs with nonzero probability */
238  double max_score = COMPO_SCORE_MIN;
239  /* average score */
240  double avg_score = 0.0;
241 
242  /* Find the maximum score with nonzero probability */
243  for (i = 0; i < alphsize; i++) {
244  if (row_prob[i] == 0.0) {
245  continue;
246  }
247  for (j = 0; j < alphsize; j++) {
248  if (col_prob[j] == 0.0) {
249  continue;
250  }
251  if (max_score < score[i][j]) {
252  max_score = score[i][j];
253  }
254  avg_score += row_prob[i] * col_prob[j] * score[i][j];
255  }
256  }
257  if (max_score <= 0.0 || avg_score >= 0) {
258  /* The iteration cannot converge if max_score is nonpositive
259  * or the average score is nonnegative; lambda doesn't exist */
260  *piterations = max_iterations;
261  *plambda = -1.0;
262  return;
263  }
264  for (k = 0; k < max_iterations; k++) {
265  double slope; /* slope of f at x */
266  double fold = f; /* previous value of f */
267  double x_pow_max_score; /* x raised to the power max_score */
268  double lambda = -log(x); /* the iterate lambda, see above */
269  int was_newton = is_newton; /* true if the previous iteration
270  was a Newton step; instead of a
271  bisection step */
272  /* Evaluate the function and its derivative */
273  x_pow_max_score = exp(-max_score * lambda);
274  f = -x_pow_max_score;
275  slope = max_score * f / x;
276  for (i = 0; i < alphsize; i++) {
277  if (row_prob[i] == 0.0) {
278  continue;
279  }
280  for (j = 0; j < alphsize; j++) {
281  double ff; /* a term in the sum used to compute f */
282 
283  if (col_prob[j] == 0.0) {
284  continue;
285  }
286  if (max_score != score[i][j]) {
287  double diff_score = max_score - score[i][j];
288 
289  ff = row_prob[i] * col_prob[j] * exp(-lambda * diff_score);
290  slope += diff_score * ff / x;
291  } else {
292  ff = row_prob[i] * col_prob[j];
293  }
294  f += ff;
295  }
296  }
297  /* Finished evaluating the function and its derivative */
298  if (f > 0) {
299  left = x; /* move the left endpoint */
300  } else if (f < 0) {
301  right = x; /* move the right endpoint */
302  } else { /* f == 0 */
303  break; /* x is an exact solution */
304  }
305  if (right - left <= 2 * left * (1 - right) * lambda_tolerance &&
306  fabs(f/x_pow_max_score) <= function_tolerance) {
307  /* The midpoint of the interval converged */
308  x = (left + right) / 2;
309  break;
310  }
311  if ((was_newton && fabs(f) > .9 * fabs(fold))
312  /* if the previous iteration was a Newton step but didn't
313  * decrease f sufficiently; or */
314  || slope >= 0
315  /* if a Newton step will move us away from the desired solution */
316  ) {/* then */
317  x = (left + right)/2; /* bisect */
318  } else {
319  double p = -f/slope; /* The Newton step */
320  double y = x + p; /* The proposed next iterate */
321  if (y <= left || y >= right) { /* The proposed iterate is
322  not in (left,right) */
323  x = (left + right)/2; /* bisect */
324  } else {/* The proposed iterate is in (left,right). Accept it. */
325  is_newton = 1;
326  x = y;
327  if (fabs(p) <= lambda_tolerance * x * (1-x) &&
328  fabs(f/x_pow_max_score) <= function_tolerance) break;
329  }
330  }
331  } /* End for all iterations k */
332  *plambda = (k < max_iterations) ? -log(x) : -1.0;
333  *piterations = k;
334 }
335 
336 
337 /* Documented in composition_adjustment.h. */
338 double
339 Blast_MatrixEntropy(double ** matrix, int alphsize, const double row_prob[],
340  const double col_prob[], double Lambda)
341 {
342  int i, j;
343  double entropy = 0.0;
344  for (i = 0; i < alphsize; i++) {
345  for (j = 0; j < alphsize; j++) {
346  /* the score at (i,j), rescaled to nats */
347  double nat_score = Lambda * matrix[i][j];
348  entropy += nat_score * exp(nat_score) * row_prob[i] * col_prob[j];
349  }
350  }
351  return entropy;
352 }
353 
354 
355 
356 /* Documented in composition_adjustment.h. */
357 double
358 Blast_TargetFreqEntropy(double ** target_freq)
359 {
360  int i, j; /* iteration indices */
361  double entropy; /* the entropy to be returned */
362  /* Row probabilities consistent with the target frequencies */
363  double row_prob[COMPO_NUM_TRUE_AA] = {0,};
364  double col_prob[COMPO_NUM_TRUE_AA] = {0,};
365 
366  for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
367  for (j = 0; j < COMPO_NUM_TRUE_AA; j++) {
368  row_prob[i] += target_freq[i][j];
369  col_prob[j] += target_freq[i][j];
370  }
371  }
372  entropy = 0.0;
373  for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
374  for (j = 0; j < COMPO_NUM_TRUE_AA; j++) {
375  double freq = target_freq[i][j];
376  entropy += freq * log(freq / row_prob[i] / col_prob[j]);
377  }
378  }
379  return entropy;
380 }
381 
382 
383 /* Documented in composition_adjustment.h. */
384 void
385 Blast_CalcFreqRatios(double ** ratios, int alphsize,
386  double row_prob[], double col_prob[])
387 {
388  int i, j;
389  for (i = 0; i < alphsize; i++) {
390  if (row_prob[i] > 0) {
391  for (j = 0; j < alphsize; j++) {
392  if (col_prob[j] > 0) {
393  ratios[i][j] /= (row_prob[i] * col_prob[j]);
394  }
395  }
396  }
397  }
398 }
399 
400 
401 /* Documented in composition_adjustment.h. */
402 void
403 Blast_FreqRatioToScore(double ** matrix, int rows, int cols, double Lambda)
404 {
405  int i;
406  for (i = 0; i < rows; i++) {
407  int j;
408  for (j = 0; j < cols; j++) {
409  if (0.0 == matrix[i][j]) {
410  matrix[i][j] = COMPO_SCORE_MIN;
411  } else {
412  matrix[i][j] = log(matrix[i][j])/Lambda;
413  }
414  }
415  }
416 }
417 
418 
419 /**
420  * Convert letter probabilities from the NCBIstdaa alphabet to
421  * a 20 letter ARND... amino acid alphabet. (@see alphaConvert)
422  *
423  * @param outputLetterProbs the ARND letter probabilities [out]
424  * @param inputLetterProbs the NCBIstdaa letter probabilities [in]
425  * @param alphsize the size of the alphabet
426  */
427 static void
428 s_GatherLetterProbs(double * outputLetterProbs,
429  const double * inputLetterProbs, int alphsize)
430 {
431  int c; /*index over characters*/
432 
433  for (c = 0; c < alphsize; c++) {
434  if ((-1) != alphaConvert[c]) {
435  outputLetterProbs[alphaConvert[c]] = inputLetterProbs[c];
436  }
437  }
438 }
439 
440 /**
441  * Convert letter probabilities from a a 20 letter ARND... amino acid
442  * alphabet to the NCBIstdaa alphabet, (@see alphaConvert) setting
443  * probabilities for nonstandard characters to zero.
444  *
445  * @param std_probs the NCBIstdaa letter probabilities [out]
446  * @param alphsize size of the NCBIstdaa alphabet [in]
447  * @param probs the ARND letter probabilities [in]
448  */
449 static void
450 s_UnpackLetterProbs(double std_probs[], int alphsize, const double probs[])
451 {
452  int c; /*index over characters*/
453 
454  for (c = 0; c < alphsize; c++) {
455  if ((-1) != alphaConvert[c]) {
456  std_probs[c] = probs[alphaConvert[c]];
457  } else {
458  std_probs[c] = 0.0;
459  }
460  }
461 }
462 
463 
464 /**
465  * Set the probabilities for the two-character ambiguity characters.
466  * @param probs the probabilities for the NCBIstdaa alphabet.
467  * On entry, values for the standard amino acids must
468  * be set; on exit, values for the two-character
469  * ambiguities will also be set.
470  * @param alphsize the size of the alphabet
471  */
472 static void
473 s_SetPairAmbigProbsToSum(double probs[], int alphsize)
474 {
475  probs[eBchar] = probs[eDchar] + probs[eNchar];
476  probs[eZchar] = probs[eEchar] + probs[eQchar];
477  if (alphsize > eJchar) {
478  probs[eJchar] = probs[eIchar] + probs[eLchar];
479  }
480 }
481 
482 
483 /* Documented in composition_adjustment.h. */
484 int
486  double * Lambda,
487  int * iter_count,
488  double ** target_freq,
489  const double row_prob[],
490  const double col_prob[])
491 {
492  /* iteration indices */
493  int i, j;
494  /* Status flag; will be set to zero on success */
495  int status = 1;
496  /* A matrix of scores in the context consistent with the target
497  * frequencies */
498  double **scores;
499  /* Row and column probabilities consistent with the target
500  * frequencies; the old context */
501  double old_col_prob[COMPO_NUM_TRUE_AA] = {0.0,};
502  double old_row_prob[COMPO_NUM_TRUE_AA] = {0.0,};
503 
504  *entropy = 0;
505  status = 1;
506 
507  /* Calculate the matrix "scores" from the target frequencies */
509  if (scores == NULL) {
510  return -1;
511  }
512  for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
513  for (j = 0; j < COMPO_NUM_TRUE_AA; j++) {
514  old_row_prob[i] += target_freq[i][j];
515  old_col_prob[j] += target_freq[i][j];
516  }
517  }
518  for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
519  memcpy(scores[i], target_freq[i], COMPO_NUM_TRUE_AA * sizeof(double));
520  }
522  old_row_prob, old_col_prob);
524  /* Finished calculating the matrix "scores" */
525 
526  Blast_CalcLambdaFullPrecision(Lambda, iter_count, scores,
527  COMPO_NUM_TRUE_AA, row_prob,
528  col_prob, kLambdaErrorTolerance,
531  if (*iter_count < kLambdaIterationLimit) {
532  *entropy = Blast_MatrixEntropy(scores, COMPO_NUM_TRUE_AA,
533  row_prob, col_prob, *Lambda);
534  status = 0;
535  }
536  Nlm_DenseMatrixFree(&scores);
537  return status;
538 }
539 
540 
541 /* Documented in composition_adjustment.h. */
542 void
543 Blast_TrueAaToStdTargetFreqs(double ** StdFreq, int StdAlphsize,
544  double ** freq)
545 {
546  /* Note I'm using a rough convention for this routine that uppercase
547  * letters refer to quantities in the standard (larger) alphabet
548  * and lowercase letters refer to the true amino acid (smaller)
549  * alphabet.
550  */
551  /* Shorter names for the sizes of the two alphabets */
552  const int small_alphsize = COMPO_NUM_TRUE_AA;
553  int A, B; /* characters in the std (big) alphabet */
554  int a, b; /* characters in the small alphabet */
555  double sum; /* sum of values in target_freq; used to normalize */
556  sum = 0.0;
557  for (a = 0; a < small_alphsize; a++) {
558  for (b = 0; b < small_alphsize; b++) {
559  sum += freq[a][b];
560  }
561  }
562  for (A = 0; A < StdAlphsize; A++) {
563  /* for all rows */
564  if (alphaConvert[A] < 0) {
565  /* the row corresponds to a nonstandard reside */
566  for (B = 0; B < StdAlphsize; B++) {
567  StdFreq[A][B] = 0.0;
568  }
569  } else {
570  /* the row corresponds to a standard reside */
571  a = alphaConvert[A];
572 
573  for (B = 0; B < StdAlphsize; B++) {
574  /* for all columns */
575  if (alphaConvert[B] < 0) {
576  /* the column corresponds to a nonstandard reside */
577  StdFreq[A][B] = 0.0;
578  } else {
579  /* the column corresponds to a standard reside */
580  b = alphaConvert[B];
581  StdFreq[A][B] = freq[a][b] / sum;
582  }
583  }
584  /* Set values for two-character ambiguities */
585  StdFreq[A][eBchar] = StdFreq[A][eDchar] + StdFreq[A][eNchar];
586  StdFreq[A][eZchar] = StdFreq[A][eEchar] + StdFreq[A][eQchar];
587  if (StdAlphsize > eJchar) {
588  StdFreq[A][eJchar] = StdFreq[A][eIchar] + StdFreq[A][eLchar];
589  }
590  }
591  }
592  /* Add rows to set values for two-character ambiguities */
593  memcpy(StdFreq[eBchar], StdFreq[eDchar], StdAlphsize * sizeof(double));
594  Nlm_AddVectors(StdFreq[eBchar], StdAlphsize, 1.0, StdFreq[eNchar]);
595 
596  memcpy(StdFreq[eZchar], StdFreq[eEchar], StdAlphsize * sizeof(double));
597  Nlm_AddVectors(StdFreq[eZchar], StdAlphsize, 1.0, StdFreq[eQchar]);
598 
599  if (StdAlphsize > eJchar) {
600  memcpy(StdFreq[eJchar],StdFreq[eIchar], StdAlphsize * sizeof(double));
601  Nlm_AddVectors(StdFreq[eJchar], StdAlphsize, 1.0, StdFreq[eLchar]);
602  }
603 }
604 
605 
606 /**
607  * Calculate the average score of a row or column of a matrix.
608  *
609  * @param M a vector of scores
610  * @param alphsize the size of the alphabet
611  * @param incM stride between elements of M; used to pass a column
612  * of a score matrix to this routine by setting incM
613  * to the number of elements in a column.
614  * @param probs background probabilities for a sequence.
615  */
616 static double
617 s_CalcAvgScore(double * M, int alphsize, int incM, const double probs[])
618 {
619  int j; /* iteration index */
620  double score_iX = 0.0; /* score of character i substituted by X */
621 
622  for (j = 0; j < alphsize; j++) {
623  if (alphaConvert[j] >= 0) {
624  /* If the column corresponds to a true amino acid */
625  score_iX += M[j * incM] * probs[j];
626  }
627  }
628  return score_iX;
629 }
630 
631 /**
632  * Calculate values for the X ambiguity score, give an vector of scores and
633  * a set of character probabilities.
634  * @param M a vector of scores
635  * @param alphsize the size of the alphabet
636  * @param incM stride between elements of M; used to pass a column
637  * of a score matrix to this routine by setting incM
638  * to the number of elements in a column.
639  * @param probs background probabilities for a sequence.
640  */
641 static double
642 s_CalcXScore(double * M, int alphsize, int incM, const double probs[])
643 {
644  return MIN( s_CalcAvgScore(M, alphsize, incM, probs), kMaximumXscore);
645 }
646 
647 
648 /**
649  * Given a standard matrix with the scores for true amino acid, and
650  * pairwise ambiguity scores set, calculate and set the scores for the
651  * X, U (Selenocystiene) and O (pyrrolysine) characters.
652  *
653  * @param M a scoring matrix
654  * @param alphsize the size of the alphabet
655  * @param row_probs character frequencies for the sequence corresponding
656  * to the rows of M.
657  * @param col_probs character frequencies for the sequence corresponding
658  * to the columns of M.
659  */
660 static void
661 s_SetXUOScores(double ** M, int alphsize,
662  const double row_probs[], const double col_probs[])
663 {
664  int i; /* iteration index */
665  double score_XX = 0.0; /* score of matching an X to an X */
666  /* the matrix has alphsize colums (this variable exists just to
667  make things easier to read) */
668  const int cols = alphsize;
669 
670  for (i = 0; i < alphsize; i++) {
671  if (alphaConvert[i] >= 0) {
672  double avg_iX = s_CalcAvgScore(M[i], alphsize, 1, col_probs);
673  M[i][eXchar] = MIN(avg_iX, kMaximumXscore);
674  score_XX += avg_iX * row_probs[i];
675 
676  M[eXchar][i] = s_CalcXScore(&M[0][i], alphsize, cols, row_probs);
677  }
678  }
679  M[eXchar][eXchar] = MIN(score_XX, kMaximumXscore);
680 
681  /* Set X scores for pairwise ambiguity characters */
682  M[eBchar][eXchar] = s_CalcXScore(M[eBchar], alphsize, 1, col_probs);
683  M[eXchar][eBchar] = s_CalcXScore(&M[0][eBchar], alphsize, cols, row_probs);
684 
685  M[eZchar][eXchar] = s_CalcXScore(M[eZchar], alphsize, 1, col_probs);
686  M[eXchar][eZchar] = s_CalcXScore(&M[0][eZchar], alphsize, cols, row_probs);
687  if( alphsize > eJchar) {
688  M[eJchar][eXchar] = s_CalcXScore(M[eJchar], alphsize, 1, col_probs);
689  M[eXchar][eJchar] =
690  s_CalcXScore(&M[0][eJchar], alphsize, cols, row_probs);
691  }
692  /* Copy C scores to U */
693  memcpy(M[eSelenocysteine], M[eCchar], alphsize * sizeof(double));
694  for (i = 0; i < alphsize; i++) {
695  M[i][eSelenocysteine] = M[i][eCchar];
696  }
697  /* Copy X scores to O */
698  if (alphsize > eOchar) {
699  memcpy(M[eOchar], M[eXchar], alphsize * sizeof(double));
700  for (i = 0; i < alphsize; i++) {
701  M[i][eOchar] = M[i][eXchar];
702  }
703  }
704 }
705 
706 
707 /** Return the nearest integer to x. */
708 static long Nint(double x)
709 {
710  x += (x >= 0. ? 0.5 : -0.5);
711  return (long)x;
712 }
713 
714 
715 /**
716  * Round a floating point matrix of scores to produce an integer matrix of
717  * scores.
718  *
719  * @param matrix the matrix of integer valued scores [out]
720  * @param floatScoreMatrix the matrix of floating point valued
721  * scores [in]
722  * @param rows the number of rows of the matrices.
723  * @param cols the number of columns in matrix.
724  */
725 static void
726 s_RoundScoreMatrix(int **matrix, int rows, int cols,
727  double **floatScoreMatrix)
728 {
729  int p, c; /*indices over positions and characters*/
730 
731  for (p = 0; p < rows; p++) {
732  for (c = 0; c < cols; c++) {
733  if (floatScoreMatrix[p][c] < INT_MIN) {
734  matrix[p][c] = INT_MIN;
735  } else {
736  matrix[p][c] = Nint(floatScoreMatrix[p][c]);
737  }
738  }
739  }
740 }
741 
742 
743 /**
744  * Given a set of target frequencies and two sets of character
745  * probabilities for the true amino acids in the ARND alphabet,
746  * calculate a scoring matrix that has valid entries for all
747  * characters in the NCBIstdaa amino acid alphabet.
748  *
749  * @param Matrix the newly computed matrix
750  * @param Alphsize the size of the NCBIstdaa alphabet
751  * @param target_freq target frequencies for true amino acids (20x20)
752  * @param StartMatrix a matrix containing values for the stop character
753  * @param row_prob probabilities of true amino acids in the sequence
754  * corresponding to the rows of matrix (length = 20)
755  * @param col_prob probabilities of true amino acids in the sequence
756  * corresponding to the columns of matrix (length = 20)
757  * @param Lambda the desired scale of this matrix
758  */
759 static int
760 s_ScoresStdAlphabet(int ** Matrix, int Alphsize,
761  double ** target_freq, int ** StartMatrix,
762  const double row_prob[], const double col_prob[],
763  double Lambda)
764 {
765  /* Note: I'm using a rough convention for this routine that uppercase
766  * letters refer to quantities in the standard (larger) alphabet
767  * and lowercase letters refer to the true amino acid (smaller)
768  * alphabet.
769  */
770  int i;
771  /* row and column probabilities in the NCBIstdaa alphabet */
772  double RowProb[COMPO_LARGEST_ALPHABET];
773  double ColProb[COMPO_LARGEST_ALPHABET];
774  /* A double precision score matrix */
775  double ** Scores = Nlm_DenseMatrixNew(Alphsize, Alphsize);
776  if (Scores == NULL) {
777  return -1;
778  }
779  s_UnpackLetterProbs(RowProb, Alphsize, row_prob);
780  s_SetPairAmbigProbsToSum(RowProb, Alphsize);
781 
782  s_UnpackLetterProbs(ColProb, Alphsize, col_prob);
783  s_SetPairAmbigProbsToSum(ColProb, Alphsize);
784 
785  Blast_TrueAaToStdTargetFreqs(Scores, Alphsize, target_freq);
786  Blast_CalcFreqRatios(Scores, Alphsize, RowProb, ColProb);
787  Blast_FreqRatioToScore(Scores, Alphsize, Alphsize, Lambda);
788  s_SetXUOScores(Scores, Alphsize, RowProb, ColProb);
789 
790  s_RoundScoreMatrix(Matrix, Alphsize, Alphsize, Scores);
791  Nlm_DenseMatrixFree(&Scores);
792 
793  for (i = 0; i < Alphsize; i++) {
794  Matrix[i][eStopChar] = StartMatrix[i][eStopChar];
795  Matrix[eStopChar][i] = StartMatrix[eStopChar][i];
796  }
797  return 0;
798 }
799 
800 
801 /**
802  * Find the range of scores contained in an scoring matrix.
803  * @param obs_min smallest value in the matrix
804  * @param obs_max largest value in the matrix
805  * @param matrix a scoring matrix for the ncbistdaa alphabet
806  * @param rows number of rows in the matrix
807  */
808 static void s_GetScoreRange(int * obs_min, int * obs_max,
809  int ** matrix, int rows)
810 {
811  int aa; /* index of an amino-acid in the 20
812  letter alphabet */
813  int irow, jcol; /* matrix row and column indices */
814  int minScore, maxScore; /* largest and smallest observed scores */
815 
816  minScore = maxScore = 0;
817  for (irow = 0; irow < rows; irow++) {
818  for (aa = 0; aa < COMPO_NUM_TRUE_AA; aa++) {
819  jcol = trueCharPositions[aa];
820  if (matrix[irow][jcol] < minScore &&
821  matrix[irow][jcol] > COMPO_SCORE_MIN)
822  minScore = matrix[irow][jcol];
823  if (matrix[irow][jcol] > maxScore)
824  maxScore = matrix[irow][jcol];
825  }
826  }
827  *obs_min = minScore;
828  *obs_max = maxScore;
829 }
830 
831 
832 /**
833  * Compute the score probabilities for a given amino acid substitution matrix
834  * in the context of given query and subject amino acid frequencies.
835  *
836  * @param *obs_min the smallest score in the score matrix [out]
837  * @param *obs_max the largest score in the score matrix [out]
838  * @param *scoreProb the new array, of length (*obs_max - *obs_min + 1),
839  * of score probabilities, where (*scoreProb)[0] is
840  * the probability for score *obs_min.
841  * @param matrix a amino-acid substitution matrix (not
842  * position-specific)
843  * @param alphsize the size of the alphabet
844  * @param subjectProbArray is an array containing the probability of
845  * occurrence of each residue in the subject
846  * @param queryProbArray is an array containing the probability of
847  * occurrence of each residue in the query
848  * @param scoreProb is an array of probabilities for each score
849  * that is to be used as a field in return_sfp
850  * @return 0 on success, -1 on out-of-memory
851  */
852 static int
853 s_GetMatrixScoreProbs(double **scoreProb, int * obs_min, int * obs_max,
854  int **matrix, int alphsize,
855  const double *subjectProbArray,
856  const double *queryProbArray)
857 {
858  int aa; /* index of an amino-acid in the 20 letter
859  alphabet */
860  int irow, jcol; /* matrix row and column indices */
861  double * sprob; /* a pointer to the element of the score
862  probabilities array that represents the
863  probability of the score 0*/
864  int minScore; /* smallest score in matrix; the same value as
865  (*obs_min). */
866  int range; /* the range of scores in the matrix */
867 
868  s_GetScoreRange(obs_min, obs_max, matrix, alphsize);
869  minScore = *obs_min;
870  range = *obs_max - *obs_min + 1;
871  *scoreProb = calloc(range, sizeof(double));
872  if (*scoreProb == NULL) {
873  return -1;
874  }
875  sprob = &((*scoreProb)[-(*obs_min)]); /*center around 0*/
876  for (irow = 0; irow < alphsize; irow++) {
877  for (aa = 0; aa < COMPO_NUM_TRUE_AA; aa++) {
878  jcol = trueCharPositions[aa];
879  if (matrix[irow][jcol] >= minScore) {
880  sprob[matrix[irow][jcol]] +=
881  (queryProbArray[irow] * subjectProbArray[jcol]);
882  }
883  }
884  }
885  return 0;
886 }
887 
888 
889 /**
890  * Compute the score probabilities for a given amino acid position-specific
891  * substitution matrix in the context of a given set of subject amino
892  * acid frequencies.
893  *
894  * @param *obs_min the smallest score in the score matrix [out]
895  * @param *obs_max the largest score in the score matrix [out]
896  * @param *scoreProb the new array, of length (*obs_max - *obs_min + 1),
897  * of score probabilities, where (*scoreProb)[0] is
898  * the probability for score *obs_min.
899  * @param matrix a position-specific amino-acid substitution matrix.
900  * @param rows the number of rows in matrix.
901  * @param subjectProbArray is an array containing the probability of
902  * occurrence of each residue in the subject
903  * @return 0 on success, -1 on out-of-memory
904  */
905 static int
906 s_GetPssmScoreProbs(double ** scoreProb, int * obs_min, int * obs_max,
907  int **matrix, int rows,
908  const double *subjectProbArray)
909 {
910  int aa; /* index of an amino-acid in the 20 letter
911  alphabet */
912  int irow, jcol; /* matrix row and column indices */
913  double onePosFrac; /* matrix length as a double*/
914  double * sprob; /* pointer to the element of the score
915  * probabilities array the represents the
916  * probability of zero */
917  int minScore; /* smallest score in matrix; the same value as
918  (*obs_min). */
919  int range; /* the range of scores in the matrix */
920 
921  s_GetScoreRange(obs_min, obs_max, matrix, rows);
922  minScore = *obs_min;
923  range = *obs_max - *obs_min + 1;
924  *scoreProb = calloc(range, sizeof(double));
925  if (*scoreProb == NULL) {
926  return -1;
927  }
928  sprob = &((*scoreProb)[-(*obs_min)]); /*center around 0*/
929  onePosFrac = 1.0/ ((double) rows);
930  for (irow = 0; irow < rows; irow++) {
931  for (aa = 0; aa < COMPO_NUM_TRUE_AA; aa++) {
932  jcol = trueCharPositions[aa];
933  if (matrix[irow][jcol] >= minScore) {
934  sprob[matrix[irow][jcol]] +=
935  onePosFrac * subjectProbArray[jcol];
936  }
937  }
938  }
939  return 0;
940 }
941 
942 
943 /* Documented in composition_adjustment.h. */
944 void
945 Blast_Int4MatrixFromFreq(int **matrix, int alphsize,
946  double ** freq, double Lambda)
947 {
948  /* A row of the matrix in double precision */
949  double dMatrixStore[COMPO_LARGEST_ALPHABET];
950  double * dMatrix[1];
951  int i;
952 
953  dMatrix[0] = dMatrixStore;
954 
955  for (i = 0; i < alphsize; i++) {
956  memcpy(dMatrix[0], freq[i], alphsize * sizeof(double));
957  Blast_FreqRatioToScore(dMatrix, 1, alphsize, Lambda);
958  s_RoundScoreMatrix(&matrix[i], 1, alphsize, dMatrix);
959  }
960 }
961 
962 
963 /* Documented in composition_adjustment.h. */
965 {
966  if (*ss != NULL) {
967  free((*ss)->matrixName);
968  Nlm_Int4MatrixFree(&(*ss)->startMatrix);
969  Nlm_DenseMatrixFree(&(*ss)->startFreqRatios);
970  free(*ss);
971  *ss = NULL;
972  }
973 }
974 
975 
976 /* Documented in composition_adjustment.h. */
978 Blast_MatrixInfoNew(int rows, int cols, int positionBased)
979 {
980  int i; /* loop index */
981  Blast_MatrixInfo * ss = malloc(sizeof(Blast_MatrixInfo));
982  if (ss != NULL) {
983  ss->rows = rows;
984  ss->cols = cols;
985  ss->positionBased = positionBased;
986 
987  ss->matrixName = NULL;
988  ss->startMatrix = NULL;
989  ss->startFreqRatios = NULL;
990 
991  ss->startMatrix = Nlm_Int4MatrixNew(rows + 1, cols);
992  if (ss->startMatrix == NULL)
993  goto error_return;
994  ss->startFreqRatios = Nlm_DenseMatrixNew(rows + 1, cols);
995 
996  if (ss->startFreqRatios == NULL)
997  goto error_return;
998  for (i = 0; i < cols; i++) {
999  ss->startMatrix[rows][i] = COMPO_SCORE_MIN;
1000  ss->startFreqRatios[rows][i] = (double) COMPO_SCORE_MIN;
1001  }
1002 
1003  }
1004  goto normal_return;
1005 error_return:
1006  Blast_MatrixInfoFree(&ss);
1007 normal_return:
1008  return ss;
1009 }
1010 
1011 
1012 /**
1013  * Fill in all scores for a PSSM at a given scale Lambda.
1014  *
1015  * @param matrix the newly computed matrix [output]
1016  * @param rows number of positions (rows) in the PSSM
1017  * @param cols the number of columns in the PSSM; the alphabet size
1018  * @param freq_ratios frequency ratios defining the PSSM
1019  * @param start_matrix an existing PSSM; used to set values for the
1020  * stop character.
1021  * @param col_prob letter probabilities
1022  * @param Lambda scale of the new matrix
1023  */
1024 static void
1025 s_ScalePSSM(int **matrix, int rows, int cols, double ** freq_ratios,
1026  int ** start_matrix, const double col_prob[], double Lambda)
1027 {
1028  int p; /* index over matrix rows */
1029  /* A row of scores corresponding to one position in the PSSM */
1030  double row[COMPO_LARGEST_ALPHABET];
1031  /* A matrix with one row */
1032  double * row_matrix[1];
1033 
1034  row_matrix[0] = row;
1035 
1036  for (p = 0; p < rows; p++) {
1037  double Xscore;
1038  memcpy(row, freq_ratios[p], cols * sizeof(double));
1039 
1040  Blast_FreqRatioToScore(row_matrix, 1, cols, Lambda);
1041  row[eXchar] = Xscore = s_CalcXScore(row, cols, 1, col_prob);
1042  /* use Cysteine score for Selenocysteine */
1044  if (cols > eOchar) {
1045  row[eOchar] = Xscore;
1046  }
1047  s_RoundScoreMatrix(&matrix[p], 1, cols, row_matrix);
1048 
1049  matrix[p][eStopChar] = start_matrix[p][eStopChar];
1050  }
1051 }
1052 
1053 
1054 /**
1055  * Fill in all scores for a scoring matrix for the NCBIstdaa alphabet
1056  * at a given scale Lambda.
1057  *
1058  * @param matrix the newly computed matrix [output]
1059  * @param alphsize the alphabet size
1060  * @param freq_ratios frequency ratios defining the PSSM
1061  * @param start_matrix an existing matrix; used to set values for the
1062  * stop character.
1063  * @param row_prob letter probabilities for the sequence
1064  * corresponding to the rows of the matrix.
1065  * @param col_prob letter probabilities for the sequence
1066  * corresponding to the columns of the matrix.
1067  * @param Lambda scale of the new matrix
1068  */
1069 static int
1070 s_ScaleSquareMatrix(int **matrix, int alphsize,
1071  double ** freq_ratios, int ** start_matrix,
1072  const double row_prob[], const double col_prob[],
1073  double Lambda)
1074 {
1075  double ** scores; /* a double precision matrix of scores */
1076  int i; /* iteration index */
1077 
1078  scores = Nlm_DenseMatrixNew(alphsize, alphsize);
1079  if (scores == 0) return -1;
1080 
1081  for (i = 0; i < alphsize; i++) {
1082  memcpy(scores[i], freq_ratios[i], alphsize * sizeof(double));
1083  }
1084  Blast_FreqRatioToScore(scores, alphsize, alphsize, Lambda);
1085  s_SetXUOScores(scores, alphsize, row_prob, col_prob);
1086  s_RoundScoreMatrix(matrix, alphsize, alphsize, scores);
1087  for (i = 0; i < alphsize; i++) {
1088  matrix[i][eStopChar] = start_matrix[i][eStopChar];
1089  matrix[eStopChar][i] = start_matrix[eStopChar][i];
1090  }
1091  Nlm_DenseMatrixFree(&scores);
1092 
1093  return 0;
1094 }
1095 
1096 
1097 /** LambdaRatioLowerBound is used when the expected score is too large
1098  * causing impalaKarlinLambdaNR to give a Lambda estimate that
1099  * is too small, or to fail entirely returning -1*/
1100 #define LambdaRatioLowerBound 0.5
1101 
1102 
1103 /* Documented in composition_adjustment.h. */
1104 int
1105 Blast_CompositionBasedStats(int ** matrix, double * LambdaRatio,
1106  const Blast_MatrixInfo * ss,
1107  const double queryProb[], const double resProb[],
1108  double (*calc_lambda)(double*,int,int,double),
1109  int pValueAdjustment)
1110 {
1111  double correctUngappedLambda; /* new value of ungapped lambda */
1112  int obs_min, obs_max; /* smallest and largest score in the
1113  unscaled matrix */
1114  double *scoreArray; /* an array of score probabilities */
1115  int out_of_memory; /* status flag to indicate out of memory */
1116 
1117  if (ss->positionBased) {
1118  out_of_memory =
1119  s_GetPssmScoreProbs(&scoreArray, &obs_min, &obs_max,
1120  ss->startMatrix, ss->rows, resProb);
1121  } else {
1122  out_of_memory =
1123  s_GetMatrixScoreProbs(&scoreArray, &obs_min, &obs_max,
1124  ss->startMatrix, ss->cols,
1125  resProb, queryProb);
1126  }
1127  if (out_of_memory)
1128  return -1;
1129  correctUngappedLambda =
1130  calc_lambda(scoreArray, obs_min, obs_max, ss->ungappedLambda);
1131 
1132  /* calc_lambda will return -1 in the case where the
1133  * expected score is >=0; however, because of the MAX statement 3
1134  * lines below, LambdaRatio should always be > 0; the succeeding
1135  * test is retained as a vestige, in case one wishes to remove the
1136  * MAX statement and allow LambdaRatio to take on the error value
1137  * -1 */
1138  *LambdaRatio = correctUngappedLambda / ss->ungappedLambda;
1139  if (0 == pValueAdjustment)
1140  *LambdaRatio = MIN(1, *LambdaRatio);
1141  *LambdaRatio = MAX(*LambdaRatio, LambdaRatioLowerBound);
1142 
1143  if (*LambdaRatio > 0) {
1144  double scaledLambda = ss->ungappedLambda/(*LambdaRatio);
1145  if (ss->positionBased) {
1146  s_ScalePSSM(matrix, ss->rows, ss->cols, ss->startFreqRatios,
1147  ss->startMatrix, resProb, scaledLambda);
1148  } else {
1149  s_ScaleSquareMatrix(matrix, ss->cols,
1150  ss->startFreqRatios, ss->startMatrix,
1151  queryProb, resProb, scaledLambda);
1152  }
1153  }
1154  free(scoreArray);
1155 
1156  return 0;
1157 }
1158 
1159 
1160 /* Documented in composition_adjustment.h. */
1161 void
1163  int alphsize,
1164  const Uint1 * sequence, int length)
1165 {
1166  int i; /* iteration index */
1167 
1168  /* fields of composition as local variables */
1169  int numTrueAminoAcids = 0;
1170  double * prob = composition->prob;
1171 
1172  for (i = 0; i < alphsize; i++) {
1173  prob[i] = 0.0;
1174  }
1175  for (i = 0; i < length; i++) {
1176  if (alphaConvert[sequence[i]] >= 0 || sequence[i] == eSelenocysteine) {
1177  prob[sequence[i]]++;
1178  numTrueAminoAcids++;
1179  }
1180  }
1181 
1182  /* Count Selenocysteine (U) as if it was Cysteine (C) to avoid a result
1183  where C aligned to U is scored higher than C aligned to C. Otherwise,
1184  because U is not counted as a true amino acid, sequences with U would
1185  appear more complex than those composed only of the 20 "true" amino
1186  acids. Therefore computationally adjusted alignment score would be
1187  larger for sequences containing U than for sequences with C. */
1188  if (prob[eSelenocysteine] > 0.0) {
1190  prob[eSelenocysteine] = 0.0;
1191  }
1192 
1193  composition->numTrueAminoAcids = numTrueAminoAcids;
1194  if (numTrueAminoAcids > 0) {
1195  for (i = 0; i < alphsize; i++) {
1196  prob[i] /= numTrueAminoAcids;
1197  }
1198  }
1199 }
1200 
1201 
1202 /* Documented in composition_adjustment.h. */
1203 void
1204 Blast_GetCompositionRange(int * pleft, int * pright,
1205  const Uint1 * subject_data, int length,
1206  int start, int finish)
1207 {
1208  int i; /* iteration index */
1209  int left, right;
1210 
1211  left = start;
1212  /* Search leftward for a StopChar */
1213  for (i = left; i > 0; i--) {
1214  if (subject_data[i - 1] == eStopChar) {
1215  /* We have found a StopChar. Unless the StopChar is
1216  * too close to the start of the subject region of the
1217  * HSP, */
1218  if (i + kCompositionMargin < left) {
1219  /* reset the left endpoint. */
1220  left = i + kCompositionMargin;
1221  }
1222  break;
1223  }
1224  }
1225  if (i == 0) {
1226  /* No stop codon was found to the left. */
1227  left = 0;
1228  }
1229  right = finish;
1230  /* Search rightward for a StopChar */
1231  for (i = right; i < length; i++) {
1232  if (subject_data[i] == eStopChar) {
1233  /* We have found a StopChar. Unless the StopChar is
1234  * too close to the end of the subject region of the
1235  * HSP, */
1236  if (i - kCompositionMargin > right) {
1237  /* reset the right endpoint */
1238  right = i - kCompositionMargin;
1239  }
1240  break;
1241  }
1242  }
1243  if (i == length) {
1244  /* No stop codon was found to the right. */
1245  right = length;
1246  }
1247  *pleft = left; *pright = right;
1248 }
1249 
1250 
1251 /* Documented in composition_adjustment.h. */
1252 void
1254 {
1255  Blast_CompositionWorkspace * NRrecord = *pNRrecord;
1256 
1257  if (NRrecord != NULL) {
1258  free(NRrecord->first_standard_freq);
1259  free(NRrecord->second_standard_freq);
1260 
1261  Nlm_DenseMatrixFree(&NRrecord->mat_final);
1262  Nlm_DenseMatrixFree(&NRrecord->mat_b);
1263 
1264  free(NRrecord);
1265  }
1266  *pNRrecord = NULL;
1267 }
1268 
1269 
1270 /* Documented in composition_adjustment.h. */
1272 {
1273  Blast_CompositionWorkspace * NRrecord; /* record to allocate
1274  and return */
1275  int i; /* loop index */
1276 
1277  NRrecord = (Blast_CompositionWorkspace *)
1279  if (NRrecord == NULL) goto error_return;
1280 
1281  NRrecord->first_standard_freq = NULL;
1282  NRrecord->second_standard_freq = NULL;
1283  NRrecord->mat_final = NULL;
1284  NRrecord->mat_b = NULL;
1285 
1286  NRrecord->first_standard_freq =
1287  (double *) malloc(COMPO_NUM_TRUE_AA * sizeof(double));
1288  if (NRrecord->first_standard_freq == NULL) goto error_return;
1289 
1290  NRrecord->second_standard_freq =
1291  (double *) malloc(COMPO_NUM_TRUE_AA * sizeof(double));
1292  if (NRrecord->second_standard_freq == NULL) goto error_return;
1293 
1296  if (NRrecord->mat_final == NULL) goto error_return;
1297 
1300  if (NRrecord->mat_b == NULL) goto error_return;
1301 
1302  for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
1303  NRrecord->first_standard_freq[i] =
1304  NRrecord->second_standard_freq[i] = 0.0;
1305  }
1306 
1307  goto normal_return;
1308 error_return:
1309  Blast_CompositionWorkspaceFree(&NRrecord);
1310 normal_return:
1311  return NRrecord;
1312 }
1313 
1314 
1315 /* Documented in composition_adjustment.h. */
1316 int
1318  const char *matrixName)
1319 {
1320  if (0 == Blast_GetJointProbsForMatrix(NRrecord->mat_b,
1321  NRrecord->first_standard_freq,
1322  NRrecord->second_standard_freq,
1323  matrixName)) {
1324  return 0;
1325  } else {
1326  fprintf(stderr,
1327  "Matrix %s not currently supported for RE based adjustment\n",
1328  matrixName);
1329  return -1;
1330  }
1331 }
1332 
1333 
1334 /* Documented in composition_adjustment.h. */
1335 int
1337  int alphsize,
1338  EMatrixAdjustRule matrix_adjust_rule,
1339  int length1,
1340  int length2,
1341  const double * stdaa_row_probs,
1342  const double * stdaa_col_probs,
1343  int pseudocounts,
1344  double specifiedRE,
1345  Blast_CompositionWorkspace * NRrecord,
1346  const Blast_MatrixInfo * matrixInfo)
1347 {
1348  int iteration_count, status;
1349  double row_probs[COMPO_NUM_TRUE_AA], col_probs[COMPO_NUM_TRUE_AA];
1350  /* Target RE when optimizing the matrix; zero if the relative
1351  entropy should not be constrained. */
1352  double dummy, desired_re = 0.0;
1353 
1354  s_GatherLetterProbs(row_probs, stdaa_row_probs, alphsize);
1355  s_GatherLetterProbs(col_probs, stdaa_col_probs, alphsize);
1356 
1357  switch (matrix_adjust_rule) {
1359  desired_re = 0.0;
1360  break;
1362  /* Calculate the desired re using the new marginal probs with
1363  the old matrix */
1364  status = Blast_EntropyOldFreqNewContext(&desired_re, &dummy,
1365  &iteration_count,
1366  NRrecord->mat_b,
1367  row_probs, col_probs);
1368  if (status < 0) /* Error, e.g. memory */
1369  return status;
1370  else if (status > 0) /* we could not calculate the desired re */
1371  desired_re = 0.0; /* so, leave the re unconstrained */
1372 
1373  break;
1375  desired_re = Blast_TargetFreqEntropy(NRrecord->mat_b);
1376  break;
1378  desired_re = specifiedRE;
1379  break;
1380  default: /* I assert that we can't get here */
1381  fprintf(stderr, "Unknown flag for setting relative entropy"
1382  "in composition matrix adjustment");
1383  exit(1);
1384  }
1385  Blast_ApplyPseudocounts(row_probs, length1,
1386  NRrecord->first_standard_freq, pseudocounts);
1387  Blast_ApplyPseudocounts(col_probs, length2,
1388  NRrecord->second_standard_freq, pseudocounts);
1389 
1390  status =
1391  Blast_OptimizeTargetFrequencies(&NRrecord->mat_final[0][0],
1393  &iteration_count,
1394  &NRrecord->mat_b[0][0],
1395  row_probs, col_probs,
1396  (desired_re > 0.0),
1397  desired_re,
1400 
1401  if (status != 0) /* Did not compute the target freqs */
1402  return status;
1403 
1404  return
1405  s_ScoresStdAlphabet(matrix, alphsize, NRrecord->mat_final,
1406  matrixInfo->startMatrix,
1407  row_probs, col_probs,
1408  matrixInfo->ungappedLambda);
1409 }
1410 
1411 
1412 /* Documented in composition_adjustment.h. */
1413 int
1414 Blast_AdjustScores(int ** matrix,
1415  const Blast_AminoAcidComposition * query_composition,
1416  int queryLength,
1417  const Blast_AminoAcidComposition * subject_composition,
1418  int subjectLength,
1419  const Blast_MatrixInfo * matrixInfo,
1420  ECompoAdjustModes composition_adjust_mode,
1421  int RE_pseudocounts,
1422  Blast_CompositionWorkspace *NRrecord,
1423  EMatrixAdjustRule *matrix_adjust_rule,
1424  double calc_lambda(double *,int,int,double),
1425  double *pvalueForThisPair,
1426  int compositionTestIndex,
1427  double *ratioToPassBack)
1428 {
1429  const int alphsize = matrixInfo->cols;
1430 
1431  double lambdaForPair; /*lambda for this pair of compositions*/
1432  int iter_count; /*used as argument to Blast_CalcLambdaFullPrecision*/
1433 
1434  /* The next two arrays are letter probabilities of query and
1435  * match in 20 letter ARND... alphabet. */
1436  double permutedQueryProbs[COMPO_NUM_TRUE_AA];
1437  double permutedMatchProbs[COMPO_NUM_TRUE_AA];
1438 
1439  if (query_composition->numTrueAminoAcids == 0 ||
1440  subject_composition->numTrueAminoAcids == 0) {
1441  /* Either the query or subject contains only ambiguity
1442  characters, most likely because the entire subject has been
1443  SEGed. Compositional adjustment is meaningless. */
1444  return 1;
1445  }
1446 
1447  if ((compositionTestIndex > 0) ||
1448  ((!(matrixInfo->positionBased)) &&
1449  (composition_adjust_mode != eCompositionBasedStats))) {
1450  s_GatherLetterProbs(permutedQueryProbs,
1451  query_composition->prob, alphsize);
1452  s_GatherLetterProbs(permutedMatchProbs,
1453  subject_composition->prob, alphsize);
1454  }
1455 
1456  if (compositionTestIndex > 0) {
1457  int i,j; /*loop indices*/
1458  /* a score matrix to pass*/
1459  double **scores = Nlm_DenseMatrixNew(alphsize, alphsize);
1460 
1461  if (scores == NULL) {
1462  return -1;
1463  }
1464  for (i = 0; i < COMPO_NUM_TRUE_AA; i++)
1465  for (j = 0; j < COMPO_NUM_TRUE_AA; j++)
1466  scores[i][j] = BLOS62[i][j];
1467  Blast_CalcLambdaFullPrecision(&lambdaForPair, &iter_count,
1468  scores,
1470  permutedQueryProbs,
1471  permutedMatchProbs,
1475  if (iter_count >= kLambdaIterationLimit) {
1476  /* The algorithm for lambda didn't converge, likely
1477  * because the matrix has positive expected score;
1478  * set lambda to the smallest value in the table. */
1479  lambdaForPair = COMPO_MIN_LAMBDA;
1480  }
1481  /*use lengths of query and subject not counting X's */
1482  *pvalueForThisPair = Blast_CompositionPvalue(lambdaForPair);
1483 
1484  Nlm_DenseMatrixFree(&scores);
1485  }
1486 
1487  if (matrixInfo->positionBased ||
1488  composition_adjust_mode == eCompositionBasedStats) {
1489  /* Use old-style composition-based statistics unconditionally. */
1490  *matrix_adjust_rule = eCompoScaleOldMatrix;
1491  } else {
1492  /* else call Yi-Kuo's code to choose mode for matrix adjustment. */
1493  *matrix_adjust_rule =
1494  Blast_ChooseMatrixAdjustRule(queryLength, subjectLength,
1495  permutedQueryProbs,
1496  permutedMatchProbs,
1497  matrixInfo->matrixName,
1498  composition_adjust_mode);
1499  } /* end else call Yi-Kuo's code to choose mode for matrix adjustment. */
1500 
1501  if (eCompoScaleOldMatrix != *matrix_adjust_rule) {
1502  /* Try matrix optimization, if it fails to converge, we
1503  fall back to traditional scaling below */
1504  int status =
1506  alphsize,
1507  *matrix_adjust_rule,
1508  query_composition->
1509  numTrueAminoAcids,
1510  subject_composition->
1511  numTrueAminoAcids,
1512  query_composition->prob,
1513  subject_composition->prob,
1514  RE_pseudocounts,
1516  NRrecord,
1517  matrixInfo);
1518 
1519  *ratioToPassBack = 1.0; /* meaningless for this mode */
1520  if (status <= 0)
1521  return status; /* Success (=0) or fatal error (<0)*/
1522 
1523  } /* End try matrix optimization */
1524 
1525  *matrix_adjust_rule = eCompoScaleOldMatrix;
1526  return Blast_CompositionBasedStats(matrix, ratioToPassBack, matrixInfo,
1527  query_composition->prob,
1528  subject_composition->prob,
1529  calc_lambda,
1530  (compositionTestIndex > 0));
1531 }
static CBioSource dummy
EMatrixAdjustRule Blast_ChooseMatrixAdjustRule(int length1, int length2, const double *probArray1, const double *probArray2, const char *matrixName, ECompoAdjustModes composition_adjust_mode)
Choose how the relative entropy should be constrained based on properties of the two sequences to be ...
Blast_CompositionWorkspace * Blast_CompositionWorkspaceNew(void)
Create a new Blast_CompositionWorkspace object, allocating memory for all its component arrays.
void Blast_ApplyPseudocounts(double *probs20, int number_of_observations, const double *background_probs20, int pseudocounts)
Find the weighted average of a set of observed probabilities with a set of "background" probabilities...
static double s_CalcXScore(double *M, int alphsize, int incM, const double probs[])
Calculate values for the X ambiguity score, give an vector of scores and a set of character probabili...
static void s_UnpackLetterProbs(double std_probs[], int alphsize, const double probs[])
Convert letter probabilities from a a 20 letter ARND...
int Blast_AdjustScores(int **matrix, const Blast_AminoAcidComposition *query_composition, int queryLength, const Blast_AminoAcidComposition *subject_composition, int subjectLength, const Blast_MatrixInfo *matrixInfo, ECompoAdjustModes composition_adjust_mode, int RE_pseudocounts, Blast_CompositionWorkspace *NRrecord, EMatrixAdjustRule *matrix_adjust_rule, double calc_lambda(double *, int, int, double), double *pvalueForThisPair, int compositionTestIndex, double *ratioToPassBack)
Compute a compositionally adjusted scoring matrix.
static int alphaConvert[COMPO_LARGEST_ALPHABET]
conversion from 26 letter NCBIstdaa alphabet to 20 letter order for true amino acids: ARNDCQEGHILKMFP...
static void s_ScalePSSM(int **matrix, int rows, int cols, double **freq_ratios, int **start_matrix, const double col_prob[], double Lambda)
Fill in all scores for a PSSM at a given scale Lambda.
static void s_SetXUOScores(double **M, int alphsize, const double row_probs[], const double col_probs[])
Given a standard matrix with the scores for true amino acid, and pairwise ambiguity scores set,...
static double s_CalcAvgScore(double *M, int alphsize, int incM, const double probs[])
Calculate the average score of a row or column of a matrix.
int Blast_CompositionWorkspaceInit(Blast_CompositionWorkspace *NRrecord, const char *matrixName)
Initialize the fields of a Blast_CompositionWorkspace for a specific underlying scoring matrix.
static const double kCompoAdjustErrTolerance
bound on error for Newton's method
static const int kCompositionMargin
Desired margin between an end of region used for computing a composition, and the nearest StopChar; t...
int Blast_EntropyOldFreqNewContext(double *entropy, double *Lambda, int *iter_count, double **target_freq, const double row_prob[], const double col_prob[])
Compute the relative entropy of the scoring matrix that is consistent with a set of target frequencie...
static int trueCharPositions[COMPO_NUM_TRUE_AA]
positions of true characters in protein alphabet
static void s_GetScoreRange(int *obs_min, int *obs_max, int **matrix, int rows)
Find the range of scores contained in an scoring matrix.
void Blast_Int4MatrixFromFreq(int **matrix, int alphsize, double **freq, double Lambda)
Compute an integer-valued amino-acid score matrix from a set of score frequencies.
static const double kLambdaFunctionTolerance
bound on the difference between the expected value of exp(lambda x S) and 1
void Blast_FreqRatioToScore(double **matrix, int rows, int cols, double Lambda)
Convert a matrix of frequency ratios to a matrix of scores.
static void s_SetPairAmbigProbsToSum(double probs[], int alphsize)
Set the probabilities for the two-character ambiguity characters.
static int s_GetMatrixScoreProbs(double **scoreProb, int *obs_min, int *obs_max, int **matrix, int alphsize, const double *subjectProbArray, const double *queryProbArray)
Compute the score probabilities for a given amino acid substitution matrix in the context of given qu...
static const double kFixedReBlosum62
relative entropy of BLOSUM62
static void s_RoundScoreMatrix(int **matrix, int rows, int cols, double **floatScoreMatrix)
Round a floating point matrix of scores to produce an integer matrix of scores.
static int s_GetPssmScoreProbs(double **scoreProb, int *obs_min, int *obs_max, int **matrix, int rows, const double *subjectProbArray)
Compute the score probabilities for a given amino acid position-specific substitution matrix in the c...
Blast_MatrixInfo * Blast_MatrixInfoNew(int rows, int cols, int positionBased)
Create a Blast_MatrixInfo object.
double Blast_GetRelativeEntropy(const double A[], const double B[])
Compute the symmetric form of the relative entropy of two probability vectors.
double Blast_TargetFreqEntropy(double **target_freq)
Compute the entropy of the scoring matrix implicit in a set of target substitution frequencies.
static void s_GatherLetterProbs(double *outputLetterProbs, const double *inputLetterProbs, int alphsize)
Convert letter probabilities from the NCBIstdaa alphabet to a 20 letter ARND...
static const double kLambdaErrorTolerance
bound on error for estimating lambda
int Blast_CompositionMatrixAdj(int **matrix, int alphsize, EMatrixAdjustRule matrix_adjust_rule, int length1, int length2, const double *stdaa_row_probs, const double *stdaa_col_probs, int pseudocounts, double specifiedRE, Blast_CompositionWorkspace *NRrecord, const Blast_MatrixInfo *matrixInfo)
Use compositional score matrix adjustment, as described in.
void Blast_MatrixInfoFree(Blast_MatrixInfo **ss)
Free memory associated with a Blast_MatrixInfo object.
int Blast_CompositionBasedStats(int **matrix, double *LambdaRatio, const Blast_MatrixInfo *ss, const double queryProb[], const double resProb[], double(*calc_lambda)(double *, int, int, double), int pValueAdjustment)
Use composition-based statistics to adjust the scoring matrix, as described in.
static const int kLambdaIterationLimit
iteration limit for Newton's method for computing Lambda
static int s_ScaleSquareMatrix(int **matrix, int alphsize, double **freq_ratios, int **start_matrix, const double row_prob[], const double col_prob[], double Lambda)
Fill in all scores for a scoring matrix for the NCBIstdaa alphabet at a given scale Lambda.
static int s_ScoresStdAlphabet(int **Matrix, int Alphsize, double **target_freq, int **StartMatrix, const double row_prob[], const double col_prob[], double Lambda)
Given a set of target frequencies and two sets of character probabilities for the true amino acids in...
#define LambdaRatioLowerBound
LambdaRatioLowerBound is used when the expected score is too large causing impalaKarlinLambdaNR to gi...
void Blast_TrueAaToStdTargetFreqs(double **StdFreq, int StdAlphsize, double **freq)
Convert a matrix of target frequencies for the ARND alphabet of true amino acids to a set of target f...
void Blast_GetCompositionRange(int *pleft, int *pright, const Uint1 *subject_data, int length, int start, int finish)
Get the range of a sequence to be included when computing a composition.
static double BLOS62[COMPO_NUM_TRUE_AA][COMPO_NUM_TRUE_AA]
The BLOSUM62 matrix for the ARND...
void Blast_CalcLambdaFullPrecision(double *plambda, int *piterations, double **score, int alphsize, const double row_prob[], const double col_prob[], double lambda_tolerance, double function_tolerance, int max_iterations)
Given a score matrix the character frequencies in two sequences, compute the ungapped statistical par...
double Blast_MatrixEntropy(double **matrix, int alphsize, const double row_prob[], const double col_prob[], double Lambda)
Calculate the entropy of a matrix relative to background probabilities for two sequences.
void Blast_CalcFreqRatios(double **ratios, int alphsize, double row_prob[], double col_prob[])
Given a matrix of target frequencies, divide all elements by the character probabilities to get a mat...
static const int kCompoAdjustIterationLimit
iteration limit for Newton's method
static long Nint(double x)
Return the nearest integer to x.
static const double kMaximumXscore
largest permitted value of score(i,X), when scores are computed using composition adjustment
void Blast_CompositionWorkspaceFree(Blast_CompositionWorkspace **pNRrecord)
Free memory associated with a record of type Blast_CompositionWorkspace.
void Blast_ReadAaComposition(Blast_AminoAcidComposition *composition, int alphsize, const Uint1 *sequence, int length)
Compute the true amino acid composition of a sequence, ignoring ambiguity characters and other nonsta...
Definitions used in compositional score matrix adjustment.
@ eSelenocysteine
Constants used in compositional score matrix adjustment.
ECompoAdjustModes
An collection of constants that specify all permissible modes of composition adjustment.
@ eCompositionBasedStats
Composition-based statistics as in NAR 29:2994-3005, 2001.
#define COMPO_NUM_TRUE_AA
Number of standard amino acids.
EMatrixAdjustRule
An collection of constants that specify all rules that may be used to generate a compositionally adju...
@ eCompoScaleOldMatrix
@ eRelEntropyOldMatrixOldContext
@ eRelEntropyOldMatrixNewContext
@ eUserSpecifiedRelEntropy
@ eUnconstrainedRelEntropy
#define COMPO_LARGEST_ALPHABET
The largest alphabet supported by this code (the code supports 26 or 28 character amino acid alphabet...
#define COMPO_SCORE_MIN
Minimum score in a matrix.
#define A(i)
Definition: ecp_curves.c:948
#define NULL
Definition: ncbistd.hpp:225
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
n font weight
exit(2)
int i
Definitions used to get joint probabilities for a scoring matrix.
int Blast_GetJointProbsForMatrix(double **probs, double row_sums[], double col_sums[], const char *matrixName)
Get joint probabilities for the named matrix.
range(_Ty, _Ty) -> range< _Ty >
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
#define fabs(v)
Definition: ncbi_dispd.c:46
unsigned int a
Definition: ncbi_localip.c:102
Type and macro definitions from C toolkit that are not defined in C++ toolkit.
#define MIN(a, b)
returns smaller of a and b.
Definition: ncbi_std.h:112
#define MAX(a, b)
returns larger of a and b.
Definition: ncbi_std.h:117
T prob(T x_)
double lambda(size_t dimMatrix_, const Int4 *const *scoreMatrix_, const double *q_)
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
Declarations for several linear algebra routines.
double ** Nlm_DenseMatrixNew(int nrows, int ncols)
Create and return a new, dense matrix.
void Nlm_DenseMatrixFree(double ***mat)
Free a matrix created by Nlm_DenseMatrixNew or Nlm_LtriangMatrixNew.
void Nlm_AddVectors(double y[], int n, double alpha, const double x[])
Let y = y + alpha * x.
void Nlm_Int4MatrixFree(int ***mat)
Free a matrix created by Nlm_DenseMatrixNew or Nlm_LtriangMatrixNew.
int ** Nlm_Int4MatrixNew(int nrows, int ncols)
Create and return a new Int4 matrix.
Routines for finding an optimal set of target frequencies for the purpose of generating a composition...
int Blast_OptimizeTargetFrequencies(double x[], int alphsize, int *iterations, const double q[], const double row_sums[], const double col_sums[], int constrain_rel_entropy, double relative_entropy, double tol, int maxits)
Find an optimal set of target frequencies for the purpose of generating a compositionally adjusted sc...
#define row(bind, expected)
Definition: string_bind.c:73
Represents the composition of an amino-acid sequence, in the ncbistdaa alphabet.
int numTrueAminoAcids
number of true amino acids in the sequence, omitting nonstandard amino acids
double prob[COMPO_LARGEST_ALPHABET]
probabilities of each amino acid
Work arrays used to perform composition-based matrix adjustment.
double * first_standard_freq
background frequency vector of the first sequence
double ** mat_final
optimized target frequencies
double * second_standard_freq
background frequency vector of the second sequence
double ** mat_b
joint probabilities for the matrix in standard context
Information about a amino-acid substitution matrix.
double ** startFreqRatios
frequency ratios used to calculate matrix scores
int cols
the number of columns in the scoring matrix, i.e.
char * matrixName
name of the matrix
int positionBased
is the matrix position-based
int rows
the number of rows in the scoring matrix.
double ungappedLambda
ungapped Lambda value for this matrix in standard context
int ** startMatrix
Rescaled values of the original matrix.
Headers for computing a "composition" p-value of a match, and for computing a unified p-value combini...
#define COMPO_MIN_LAMBDA
the smallest value of lambda in the table of lambda's empirical distribution function.
double Blast_CompositionPvalue(double lambda)
Conditionally compute a compositional p-value.
void free(voidpf ptr)
voidp malloc(uInt size)
voidp calloc(uInt items, uInt size)
Modified on Sun Apr 14 05:28:04 2024 by modify_doxy.py rev. 669887