NCBI C++ ToolKit
compo_mode_condition.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* ===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================*/
24 
25 /**
26  * @file compo_mode_condition.c
27  * Functions to test whether conditional score matrix adjustment
28  * should be applied for a pair of matching sequences.
29  *
30  * Authors: Alejandro Schaffer, Yi-Kuo Yu
31  */
32 
37 
/** 180 degrees in half a circle */
#define HALF_CIRCLE_DEGREES 180
/** pi to double precision; used to convert radians to degrees.
 * (The earlier literal 3.1415926543 deviated from pi after the eighth
 * decimal place.) */
#define PI 3.14159265358979323846
/** @{ thresholds used to determine which composition mode to use */
/* lower bound on the query-to-match relative entropy */
#define QUERY_MATCH_DISTANCE_THRESHOLD 0.16
/* lower bound on the ratio (longer length) / (shorter length) */
#define LENGTH_RATIO_THRESHOLD 3.0
/* lower bound, in degrees, on the angle between query and match */
#define ANGLE_DEGREE_THRESHOLD 70.0
/* lower bound on the summed probability of the two most frequent letters */
#define HIGH_PAIR_THRESHOLD 0.4
/* sequences of at most this length never pass the high-pair test */
#define LENGTH_LOWER_THRESHOLD 50
/** @} */
49 
50 /** type of function used to choose a mode for composition-based
51  * statistics. The variables are Queryseq_length, Matchseq_length,
52  * query_amino_count, match_amino_account and matrix_name.*/
54 (*Condition) (int, int, const double *, const double *,
55  const char *);
56 
57 
58 /** Return true if length > 50 and the two most frequent letters
59  * occur a total of more that 40% of the time. */
60 static int
61 s_HighPairFrequencies(const double * letterProbs, int length)
62 {
63  int i; /*index*/
64  double max, second; /*two highest letter probabilities*/
65 
66  if (length <= LENGTH_LOWER_THRESHOLD) {
67  return FALSE;
68  }
69  max = 0;
70  second = 0;
71  for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
72  if (letterProbs[i] > second) {
73  second = letterProbs[i];
74  if (letterProbs[i] > max) {
75  second = max;
76  max = letterProbs[i];
77  }
78  }
79  }
80  return (max + second) > HIGH_PAIR_THRESHOLD;
81 }
82 
/**
 * Apply the s_HighPairFrequencies test to both sequences of a pair.
 *
 * @param P_query  letter probabilities of the query
 * @param length1  length of the query
 * @param P_match  letter probabilities of the matching sequence
 * @param length2  length of the matching sequence
 * @return 1 if at least one of the two sequences passes the test,
 *         0 otherwise */
static int
s_HighPairEitherSeq(const double * P_query, int length1,
                    const double * P_match, int length2)
{
    if (s_HighPairFrequencies(P_query, length1)) {
        return 1;
    }
    return s_HighPairFrequencies(P_match, length2) ? 1 : 0;
}
97 
98 
99 /** Return eDontAdjustMatrix unconditionally */
100 static EMatrixAdjustRule
101 s_NeverAdjustMatrix(int Len_query, int Len_match,
102  const double * P_query, const double * P_match,
103  const char *matrix_name)
104 {
105  /* Suppress unused variable warnings */
106  (void) Len_query;
107  (void) Len_match;
108  (void) P_query;
109  (void) P_match;
110  (void) matrix_name;
111 
112  return eDontAdjustMatrix;
113 }
114 
115 
116 /** Return eCompoScaleOldMatrix unconditionally */
117 static EMatrixAdjustRule
118 s_JustScaleOldMatrix(int Len_query, int Len_match,
119  const double * P_query, const double * P_match,
120  const char *matrix_name)
121 {
122  /* Suppress unused variable warnings */
123  (void) Len_query;
124  (void) Len_match;
125  (void) P_query;
126  (void) P_match;
127  (void) matrix_name;
128 
129  return eCompoScaleOldMatrix;
130 }
131 
132 
133 /** A function used to choose a mode for composition-based statistics.
134  * If this function is used relative-entropy score adjustment is
135  * always applied, with a fixed value as the target relative entropy*/
136 static EMatrixAdjustRule
138  int Len_match,
139  const double * P_query,
140  const double * P_match,
141  const char *matrix_name)
142 {
143  /* Suppress unused variable warnings */
144  (void) Len_query;
145  (void) Len_match;
146  (void) P_query;
147  (void) P_match;
148  (void) matrix_name;
149 
151 }
152 
153 
154 /**
155  * A function used to choose a mode for composition-based statistics.
156  * Decide whether a relative-entropy score adjustment should be used
157  * based on lengths and letter counts of the two matched sequences;
158  * matrix_name is the underlying score matrix */
159 static EMatrixAdjustRule
161  int Len_match,
162  const double * P_query,
163  const double * P_match,
164  const char *matrix_name)
165 {
166  EMatrixAdjustRule which_rule; /* which relative entropy mode to
167  return */
168  int i; /* loop indices */
169  double p_query[COMPO_NUM_TRUE_AA];
170  double p_match[COMPO_NUM_TRUE_AA]; /*letter probabilities
171  for query and match*/
172  const double *p_matrix; /* letter probabilities used in
173  constructing matrix name*/
174  double D_m_mat, D_q_mat, D_m_q; /* distances between match and
175  original between query and
176  original between match and
177  query*/
178  double corr_factor = 0.0; /* correlation between how p_query
179  and p_match deviate from p_matrix
180  */
181  double len_q, len_m; /* lengths of query and matching
182  sequence in floating point */
183  double len_large, len_small; /* store the larger and smaller of
184  len_q and len_m */
185  double angle; /* angle between query and match
186  probabilities */
187 
188  p_matrix = Blast_GetMatrixBackgroundFreq(matrix_name);
189 
190  for (i = 0; i < COMPO_NUM_TRUE_AA; i++) {
191  p_query[i] = P_query[i];
192  p_match[i] = P_match[i];
193  corr_factor +=
194  (p_query[i] - p_matrix[i]) * (p_match[i] - p_matrix[i]);
195  }
196  D_m_mat = Blast_GetRelativeEntropy(p_match, p_matrix);
197  D_q_mat = Blast_GetRelativeEntropy(p_query, p_matrix);
198  D_m_q = Blast_GetRelativeEntropy(p_match, p_query);
199 
200  angle =
201  acos((D_m_mat * D_m_mat + D_q_mat * D_q_mat -
202  D_m_q * D_m_q) / 2.0 / D_m_mat / D_q_mat);
203  /* convert from radians to degrees */
204  angle = angle * HALF_CIRCLE_DEGREES / PI;
205 
206  len_q = 1.0 * Len_query;
207  len_m = 1.0 * Len_match;
208  if (len_q > len_m) {
209  len_large = len_q;
210  len_small = len_m;
211  } else {
212  len_large = len_m;
213  len_small = len_q;
214  }
215  if (s_HighPairEitherSeq(P_query, Len_query, P_match, Len_match)) {
216  which_rule = eUserSpecifiedRelEntropy;
217  } else {
218  if ((D_m_q > QUERY_MATCH_DISTANCE_THRESHOLD) &&
219  (len_large / len_small > LENGTH_RATIO_THRESHOLD) &&
220  (angle > ANGLE_DEGREE_THRESHOLD)) {
221  which_rule = eCompoScaleOldMatrix;
222  } else {
223  which_rule = eUserSpecifiedRelEntropy;
224  }
225  }
226  return which_rule;
227 }
228 
229 
230 /**
231  * An array of functions that can be used to decide which optimization
232  * formulation should be used for score adjustment */
233 static Condition Cond_func[] = {
238  NULL
239 };
240 
241 
242 /* Documented in compo_mode_condition.h. */
245  int length2,
246  const double * probArray1,
247  const double * probArray2,
248  const char *matrixName,
249  ECompoAdjustModes composition_adjust_mode)
250 {
251  int testFunctionIndex = (int) composition_adjust_mode;
252 
253  return
254  Cond_func[testFunctionIndex] (length1, length2,
255  probArray1, probArray2, matrixName);
256 }
EMatrixAdjustRule(* Condition)(int, int, const double *, const double *, const char *)
type of function used to choose a mode for composition-based statistics.
#define ANGLE_DEGREE_THRESHOLD
static EMatrixAdjustRule s_JustScaleOldMatrix(int Len_query, int Len_match, const double *P_query, const double *P_match, const char *matrix_name)
Return eCompoScaleOldMatrix unconditionally.
#define QUERY_MATCH_DISTANCE_THRESHOLD
#define PI
some digits of PI
static Condition Cond_func[]
An array of functions that can be used to decide which optimization formulation should be used for sc...
#define HALF_CIRCLE_DEGREES
180 degrees in half a circle
static int s_HighPairFrequencies(const double *letterProbs, int length)
Return true if length > 50 and the two most frequent letters occur a total of more than 40% of the ti...
static EMatrixAdjustRule s_TestToApplyREAdjustmentUnconditional(int Len_query, int Len_match, const double *P_query, const double *P_match, const char *matrix_name)
A function used to choose a mode for composition-based statistics.
EMatrixAdjustRule Blast_ChooseMatrixAdjustRule(int length1, int length2, const double *probArray1, const double *probArray2, const char *matrixName, ECompoAdjustModes composition_adjust_mode)
Choose how the relative entropy should be constrained based on properties of the two sequences to be ...
static EMatrixAdjustRule s_NeverAdjustMatrix(int Len_query, int Len_match, const double *P_query, const double *P_match, const char *matrix_name)
Return eDontAdjustMatrix unconditionally.
#define HIGH_PAIR_THRESHOLD
static EMatrixAdjustRule s_TestToApplyREAdjustmentConditional(int Len_query, int Len_match, const double *P_query, const double *P_match, const char *matrix_name)
A function used to choose a mode for composition-based statistics.
static int s_HighPairEitherSeq(const double *P_query, int length1, const double *P_match, int length2)
Return true if either the query or the matching sequences passes the test in s_HighPairFrequencies.
#define LENGTH_LOWER_THRESHOLD
#define LENGTH_RATIO_THRESHOLD
Declarations of functions used to choose the mode for composition-based statistics.
Definitions used in compositional score matrix adjustment.
double Blast_GetRelativeEntropy(const double A[], const double B[])
Compute the symmetric form of the relative entropy of two probability vectors.
ECompoAdjustModes
A collection of constants that specify all permissible modes of composition adjustment.
#define COMPO_NUM_TRUE_AA
Number of standard amino acids.
EMatrixAdjustRule
A collection of constants that specify all rules that may be used to generate a compositionally adju...
@ eDontAdjustMatrix
@ eCompoScaleOldMatrix
@ eUserSpecifiedRelEntropy
#define NULL
Definition: ncbistd.hpp:225
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
Definitions used to get joint probabilities for a scoring matrix.
const double * Blast_GetMatrixBackgroundFreq(const char *matrix_name)
Return true if frequency data is available for the given matrix name.
Type and macro definitions from C toolkit that are not defined in C++ toolkit.
#define FALSE
bool replacement for C indicating false.
Definition: ncbi_std.h:101
T max(T x_, T y_)
Modified on Thu Apr 11 15:07:14 2024 by modify_doxy.py rev. 669887