NCBI C++ ToolKit
signal_seq.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: signal_seq.cpp 46773 2010-08-09 18:35:30Z dicuccio $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Josh Cherry
27  *
28  * File Description: Prediction of signal sequences from protein sequence
29  * according to von Heijne, 1986 and 1987
30  *
31  */
32 
33 
34 #include <ncbi_pch.hpp>
36 
39 
40 // Scoring matrices for eukaryotic and bacterial signal sequences (both are defined later in this file).
41 // we have to declare these extern to be accessible from template on ForteCC.
// Both are 26 x 15 tables of double; x_PredictSignalSeq indexes them as
// Mat[residue code][offset within the 15-residue window].
42 extern const double const_EukMat[26][15];
43 extern const double const_BacMat[26][15];
44 
// Weight matrix selected by x_PredictSignalSeq for every domain other than
// CSignalSeq::eBacterial (i.e. the eukaryotic matrix).
//
// Layout: const_EukMat[residue][offset]
//   * 26 rows, one per residue code; the trailing comment on each row names
//     the residue letter.  Row 0 is unlabelled and all zero.  The row order
//     looks like the NCBIstdaa alphabet (0 = gap) -- TODO confirm.
//   * 15 columns, one per offset inside the 15-residue scoring window.
//     x_PredictSignalSeq reports the cut as following window offset 12, so
//     columns 0-12 weigh residues up to the cut and columns 13-14 weigh the
//     two residues after it.
//
// The rows for B, X, Z, U and * are all zero, so those residues never change
// a window's score.  Columns 10 and 12 hold entries near -25 for many
// residues, which effectively rules out windows that place those residues at
// those offsets.
//
// NOTE(review): the values look like natural-log weights (0.693147 = ln 2),
// presumably derived from the von Heijne papers cited in the file header --
// confirm before editing any entry.
45 const double const_EukMat[26][15] = {
46  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
47  {0.0984401, -0.109199, -0.0350913, 0.0339016, 0.321584,
48  0.216223, 0.216223, 0.159065, 0.544727, 0.0339016,
49  1.176, -0.882389, 1.70788, 0.216223, -0.882389}, // A
50  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // B (?)
51  {-0.405465, 0.287682, 0.693147, 0.441833, 0.693147,
52  1.13498, 0.287682, 0.575364, 0.105361, 0.287682,
53  1.44036, -0.405465, 0.693147, 0.575364, -0.405465}, // C
54  {-2.18605, -2.18605, -2.18605, -2.18605, -2.18605,
55  -2.18605, -2.18605, -2.18605, -0.576613, -1.08744,
56  -25.2119, -0.576613, -25.2119, 0.116534, 0.211844}, // D
57  {-2.30259, -2.30259, -2.30259, -2.30259, -2.30259,
58  -2.30259, -2.30259, -2.30259, -1.20397, -0.356675,
59  -25.3284, -0.356675, -25.3284, 0.262364, 0.336472}, // E
60  {0.842183, 0.474458, 0.675129, 0.675129, 0.0689929,
61  0.223144, 1.16761, 0.842183, -0.336472, -0.113329,
62  -24.7486, 0.842183, -24.7486, 0.0689929, -0.336472}, // F
63  {-1.10691, -1.10691, -1.39459, -0.701446, -1.39459,
64  0.0717439, -1.39459, -1.80006, 0.451234, 1.03316,
65  -0.883768, -0.547295, 1.17036, -0.19062, -0.547295}, // G
66  {-1.22378, -1.22378, -1.22378, -1.22378, -1.22378,
67  -1.22378, -1.22378, -1.22378, 0.385662, -1.22378,
68  -24.2496, 0.567984, -24.2496, 0.162519, -0.530628}, // H
69  {0.70657, 0.70657, 0.0779615, -0.209721, 0.396415,
70  -0.392042, -0.615186, 0.0779615, -0.392042, -2.00148,
71  0.301105, -0.392042, -25.0273, 0.0779615, -0.0555699}, // I
72  {-2.4248, -2.4248, -2.4248, -2.4248, -2.4248,
73  -2.4248, -2.4248, -2.4248, -2.4248, -1.03851,
74  -25.4507, -1.73166, -25.4507, -0.0269075, -0.227578}, // K
75  {1.76947, 1.7263, 1.78346, 1.87624, 1.8635,
76  1.31346, 1.66568, 1.39861, -0.19062, 0.642289,
77  -0.413764, 0.502527, -2.49321, -0.413764, -1.10691}, // L
78  {-0.993252, 0.105361, 0.952658, 0.393043, -0.993252,
79  0.798508, -0.300105, -0.300105, -0.993252, -0.993252,
80  -24.0191, -0.993252, -24.0191, -0.993252, -0.300105}, // M
81  {-1.96009, -1.96009, -1.96009, -1.96009, -1.96009,
82  -1.96009, -1.96009, -1.96009, -0.861482, -0.861482,
83  -24.9859, 0.34249, -24.9859, -0.5738, -0.0141846}, // N
84  {-1.30833, -2.00148, -1.30833, -2.00148, -2.00148,
85  -0.615186, -2.00148, 0.0779615, 0.994252, 0.637577,
86  -25.0273, -2.00148, -0.902868, -2.00148, 1.08956}, // P
87  {-1.84055, -1.84055, -1.84055, -1.84055, -1.84055,
88  -0.0487902, -1.84055, -1.84055, 0.462035, 0.238892,
89  -24.8664, 1.04982, -0.741937, 1.10389, 0.462035}, // Q
90  {-1.335, -2.02815, -2.02815, -2.02815, -2.02815,
91  -2.02815, -2.02815, -2.02815, -0.0822381, -0.641854,
92  -25.054, 0.679902, -25.054, 0.456758, 0.169076}, // R
93  {-0.236389, -1.335, -0.354172, -0.641854, 0.131336,
94  -0.131028, 0.274437, 0.338975, 0.824483, -0.0357181,
95  0.701881, 0.3996, 0.562119, 0.274437, -0.131028}, // S
96  {-1.57898, 0.0304592, -0.662688, -0.885832, -0.662688,
97  0.292823, -0.326216, -0.326216, 0.212781, -0.480366,
98  0.561087, -0.192684, -0.480366, -1.17351, 0.0304592}, // T
99  {0.588787, 0.811931, 0.301105, 0.483427, 0.158004,
100  0.301105, -0.00904984, 0.888892, -2.40695, 0.0779615,
101  1.05879, -1.30833, -25.4328, -0.327504, 0.426268}, // V
102  {0.798508, 0.510826, 0.510826, -0.587787, -0.587787,
103  0.105361, 1.20397, 0.510826, -0.587787, 0.510826,
104  -23.6136, 1.60944, -23.6136, 0.105361, -0.587787}, // W
105  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // X (?)
106  {-1.72277, -1.72277, -0.336472, -1.72277, -1.72277,
107  -1.72277, -0.624154, -1.72277, -1.72277, -1.02962,
108  -24.7486, -0.113329, -24.7486, -1.72277, 0.223144}, // Y
109  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Z (?)
110  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // U (?)
111  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} // * (?)
112 };
113 
114 
115 // Scoring matrix for bacterial signal sequences
// Selected by x_PredictSignalSeq when the domain is CSignalSeq::eBacterial.
//
// Layout: const_BacMat[residue][offset], the same shape as const_EukMat.
//   * 26 rows, one per residue code; the trailing comment on each row names
//     the residue letter.  Row 0 is unlabelled and all zero.  The row order
//     looks like the NCBIstdaa alphabet (0 = gap) -- TODO confirm.
//   * 15 columns, one per offset inside the 15-residue scoring window; the
//     predicted cut follows window offset 12.
//
// The rows for B, X, Z, U and * are all zero.  Columns 10 and 12 hold
// entries between roughly -22 and -24 for most residues, which effectively
// rules out windows that place those residues at those offsets.
116 const double const_BacMat[26][15] = {
117  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
118  {1.13943, 0.916291, 0.916291, 1.03407, 0.628609,
119  0.782759, 0.446287, 0.628609, 0.782759, 0.782759,
120  2.0149, -0.470004, 2.27084, 1.72722, 0.223144}, // A
121  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // B (?)
122  {0, 0, 0, 0, 0,
123  0, 0, 0, 0, 0,
124  -23.0259, 0, -23.0259, 0, 0}, // C
125  {-0.693147, -0.693147, -0.693147, -0.693147, -0.693147,
126  -0.693147, -0.693147, -0.693147, -0.693147, -0.693147,
127  -23.719, -0.693147, -23.719, 0, 1.38629}, // D
128  {-0.788457, -0.788457, -0.788457, -0.788457, -0.788457,
129  -0.788457, -0.788457, -0.788457, -0.788457, -0.788457,
130  -23.8143, -0.788457, -23.8143, 0.597837, 1.29098}, // E
131  {0.430783, 1.12393, 0.836248, 1.12393, -0.262364,
132  -0.262364, 1.81708, -0.262364, 1.12393, -0.262364,
133  -23.2882, 1.68355, -23.2882, -0.262364, -0.262364}, // F
134  {0.393043, -0.300105, -0.300105, -0.300105, 0.105361,
135  0.616186, -0.300105, 0.393043, -0.300105, -0.300105,
136  -24.0191, -0.300105, -0.300105, -0.993252, -0.993252}, // G
137  {0.223144, 0.223144, 0.223144, 0.223144, 0.223144,
138  0.223144, 0.223144, 0.223144, 0.223144, 0.223144,
139  -22.8027, 2.16905, -22.8027, 0.223144, 0.223144}, // H
140  {0.567984, -0.530628, 1.07881, -0.530628, 1.07881,
141  -0.530628, -0.530628, 0.567984, -0.530628, -0.530628,
142  -23.5565, -0.530628, -23.5565, -0.530628, 0.162519}, // I
143  {-0.916291, -0.916291, -0.916291, -0.916291, -0.916291,
144  -0.916291, -0.916291, -0.916291, -0.916291, -0.916291,
145  -23.9421, -0.223144, -23.9421, 0.182322, -0.916291}, // K
146  {1.08619, 1.40464, 1.20397, 1.08619, 1.20397,
147  1.5717, -0.993252, -0.993252, -0.300105, -0.300105,
148  -0.993252, -0.300105, -24.0191, -0.993252, -0.993252}, // L
149  {0.510826, 1.20397, 0.510826, 0.510826, 1.60944,
150  1.20397, 1.60944, 0.510826, 0.510826, 1.20397,
151  -22.515, 1.89712, -22.515, 0.510826, 0.510826}, // M
152  {-0.470004, -0.470004, -0.470004, -0.470004, -0.470004,
153  -0.470004, -0.470004, -0.470004, -0.470004, -0.470004,
154  -23.4959, 0.628609, -23.4959, -0.470004, 0.916291}, // N
155  {-0.530628, -0.530628, -0.530628, -0.530628, -0.530628,
156  -0.530628, 0.162519, 0.567984, 1.07881, 0.162519,
157  -23.5565, -0.530628, -23.5565, -0.530628, 1.07881}, // P
158  {-0.336472, -0.336472, -0.336472, -0.336472, -0.336472,
159  -0.336472, -0.336472, -0.336472, 0.356675, 0.356675,
160  -23.3623, 0.76214, -23.3623, -0.336472, -0.336472}, // Q
161  {-0.530628, -0.530628, -0.530628, -0.530628, -0.530628,
162  -0.530628, -0.530628, -0.530628, -0.530628, -0.530628,
163  -23.5565, -0.530628, -23.5565, -0.530628, -0.530628}, // R
164  {-0.955511, -0.955511, -0.955511, 0.430783, 0.430783,
165  -0.955511, 0.653926, 1.75254, 0.653926, 1.12393,
166  0.653926, -0.262364, -0.262364, -0.955511, -0.955511}, // S
167  {-0.0953102, -0.788457, 0.597837, -0.0953102, -0.0953102,
168  -0.0953102, -0.0953102, -0.0953102, 0.820981, -0.788457,
169  0.310155, -0.788457, -0.788457, -0.788457, -0.0953102}, // T
170  {0.693147, 1.02962, -0.916291, 0.182322, -0.916291,
171  0.470004, 1.02962, -0.916291, -0.916291, 0.470004,
172  0.182322, -0.916291, -23.9421, -0.223144, -0.916291}, // V
173  {0.916291, 0.916291, 0.916291, 0.916291, 0.916291,
174  0.916291, 0.916291, 0.916291, 0.916291, 0.916291,
175  -22.1096, 0.916291, -22.1096, 0.916291, 0.916291}, // W
176  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // X (?)
177  {-0.262364, -0.262364, -0.262364, -0.262364, -0.262364,
178  -0.262364, -0.262364, -0.262364, -0.262364, 0.836248,
179  -23.2882, -0.262364, -23.2882, -0.262364, -0.262364}, // Y
180  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Z (?)
181  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // U (?)
182  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} // * (?)
183 };
184 
185 
186 template<class Seq>
187 void x_PredictSignalSeq(const Seq& seq, CSignalSeq::EDomain domain,
188  TSeqPos max_pos, TSeqPos& pos, double& score)
189 {
190 
191  if (seq.size() < 15) {
192  throw runtime_error("Sequence length must be at least 15 "
193  "to predict signal sequence");
194  }
195  if (max_pos < 12) {
196  throw runtime_error("max_pos must be at least 12 "
197  "to predict signal sequence");
198  }
199 
200  const double (*Mat)[15];
201  if (domain == CSignalSeq::eBacterial) {
202  Mat = const_BacMat;
203  } else {
204  Mat = const_EukMat;
205  }
206 
207  TSeqPos max_index = min((TSeqPos)seq.size() - 15, max_pos - 12);
208 
209  double max_score = -1e6;
210  TSeqPos max_loc = 0; // initialize to avoid compiler warning
211  for (unsigned int i = 0; i <= max_index; i++) {
212  double sum = 0;
213  for (unsigned int j = 0; j < 15; j++) {
214  int sub = (unsigned char)seq[i + j];
215  sum += Mat[sub][j];
216  }
217  if (sum > max_score) {
218  max_score = sum;
219  max_loc = i;
220  }
221  }
222  score = max_score;
223  pos = max_loc + 12; // position before cut
224 }
225 
226 
227 void CSignalSeq::Predict(const string& seq, EDomain domain,
228  TSeqPos max_pos, TSeqPos& pos, double& score)
229 {
230  x_PredictSignalSeq(seq, domain, max_pos, pos, score);
231 }
232 
233 
234 void CSignalSeq::Predict(const vector<char>& seq, EDomain domain,
235  TSeqPos max_pos, TSeqPos& pos, double& score)
236 {
237  x_PredictSignalSeq(seq, domain, max_pos, pos, score);
238 }
239 
240 
241 void CSignalSeq::Predict(const CSeqVector& seq, EDomain domain,
242  TSeqPos max_pos, TSeqPos& pos, double& score)
243 {
244  string seq_ncbistdaa;
245  CSeqVector vec(seq);
246  vec.SetNcbiCoding();
247  vec.GetSeqData(0, vec.size(), seq_ncbistdaa);
248  x_PredictSignalSeq(seq_ncbistdaa, domain, max_pos, pos, score);
249 }
250 
251 
CSeqVector –.
Definition: seq_vector.hpp:65
friend void x_PredictSignalSeq(const Seq &seq, CSignalSeq::EDomain domain, TSeqPos max_pos, TSeqPos &pos, double &score)
Definition: signal_seq.cpp:187
static void Predict(const string &seq, EDomain domain, TSeqPos max_pos, TSeqPos &pos, double &score)
Find the most likely predicted signal sequence cleavage site (pos) and the associated score (>3....
Definition: signal_seq.cpp:227
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
int i
T min(T x_, T y_)
USING_SCOPE(objects)
void x_PredictSignalSeq(const Seq &seq, CSignalSeq::EDomain domain, TSeqPos max_pos, TSeqPos &pos, double &score)
Definition: signal_seq.cpp:187
const double const_BacMat[26][15]
Definition: signal_seq.cpp:116
const double const_EukMat[26][15]
Definition: signal_seq.cpp:45
Modified on Fri Sep 20 14:57:07 2024 by modify_doxy.py rev. 669887