NCBI C++ ToolKit
make_score_method.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: make_score_method.cpp 46050 2021-01-21 18:06:42Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Robert Smith
27  *
28  * File Description:
29  * Create Alignment score method files from:
30  * 1. Builtin scoring tables (-sm)
31  * 2. aaindex tables and matrixes from
32  * http://www.genome.ad.jp/dbget/aaindex.html
33  *
34  */
35 
36 #include <ncbi_pch.hpp>
37 
38 #include <corelib/ncbiapp.hpp>
39 #include <corelib/ncbireg.hpp>
40 #include <corelib/ncbi_limits.hpp>
41 
43 
44 #include <ctype.h>
45 #include <iostream>
46 #include <iomanip>
47 #include <algorithm>
48 
49 
51 
// Body of the application class declaration: it declares two private member
// functions, Init() and Run(), and nothing else.
// NOTE(review): the line carrying the class name and its base class is missing
// from this listing. NcbiSys_main() instantiates CMakeScoreMethodApp and calls
// AppMain() on it, so this is presumably that class -- confirm against the
// repository copy of the file.
53 {
54 private:
55  void Init(void);
56  int Run(void);
57 };
58 
59 
// Body of the command-line setup routine. It builds a CArgDescriptions object,
// registers the keys "out", "sm", "aa" and "in", restricts "sm" to six matrix
// names, and hands the finished object to SetupArgDescriptions().
// NOTE(review): the function header line is missing from this listing;
// presumably this is the definition of the Init() member declared in the
// class above -- confirm against the repository copy of the file.
61 {
62  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
63  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
64  "Make alignment scoring method files");
65 
// "out": key registered with a default value.
// NOTE(review): the rest of this call (everything after the synopsis string)
// is missing from this listing.
66  arg_desc->AddDefaultKey("out", "output", "File name for scoring method",
68 
// "sm": optional key.
// NOTE(review): the closing argument(s) of this call are missing from this
// listing.
69  arg_desc->AddOptionalKey("sm", "matrix", "name of score matrix to use",
// Only these six names are accepted for "sm"; s_BuiltInSM() has one branch
// for each of them.
71  arg_desc->SetConstraint
72  ("sm", &(*new CArgAllow_Strings,
73  "blosum45", "blosum62", "blosum80", "pam30", "pam70", "pam250"));
74 
// "aa": optional key.
// NOTE(review): the closing argument(s) of this call are missing from this
// listing.
75  arg_desc->AddOptionalKey("aa", "accession",
76  "Amino Acid Index Database accession number",
// "in": key registered with a default value.
// NOTE(review): the closing argument(s) of this call are missing from this
// listing.
78  arg_desc->AddDefaultKey("in", "aaindex_file",
79  "Amino Acid Index Database input file.",
81 
// release() gives up the unique_ptr's ownership; the raw pointer is passed on
// to SetupArgDescriptions().
82  SetupArgDescriptions(arg_desc.release());
83 }
84 
85 
86 inline static string s_FormatAA(int aa) {
87  return isprint((unsigned char) aa) ? string(1, (char) aa) : NStr::IntToString(aa);
88 }
89 
90 
91 static void s_BuiltInSM(const string& sm, map<string, string>& out_map)
92 {
93  const SNCBIPackedScoreMatrix* psm = NULL;
94  string desc;
95 
96  if (sm == "blosum45") {
97  psm = &NCBISM_Blosum45;
98  desc =
99  " Matrix made by matblas from blosum45.iij\\\n"
100  " BLOSUM Clustered Scoring Matrix in 1/3 Bit Units\\\n"
101  " Blocks Database = /data/blocks_5.0/blocks.dat\\\n"
102  " Cluster Percentage: >= 45\\\n"
103  " Entropy = 0.3795, Expected = -0.2789";
104  } else if (sm == "blosum62") {
105  psm = &NCBISM_Blosum62;
106  desc =
107  " Matrix made by matblas from blosum62.iij\\\n"
108  " BLOSUM Clustered Scoring Matrix in 1/2 Bit Units\\\n"
109  " Blocks Database = /data/blocks_5.0/blocks.dat\\\n"
110  " Cluster Percentage: >= 62\\\n"
111  " Entropy = 0.6979, Expected = -0.5209";
112  } else if (sm == "blosum80") {
113  psm = &NCBISM_Blosum80;
114  desc =
115  " Matrix made by matblas from blosum80.iij\\\n"
116  " BLOSUM Clustered Scoring Matrix in 1/2 Bit Units\\\n"
117  " Blocks Database = /data/blocks_5.0/blocks.dat\\\n"
118  " Cluster Percentage: >= 80\\\n"
119  " Entropy = 0.9868, Expected = -0.7442";
120  } else if (sm == "pam30") {
121  psm = &NCBISM_Pam30;
122  desc =
123  " This matrix was produced by \\\"pam\\\" Version 1.0.6 [28-Jul-93]\\\n"
124  " PAM 30 substitution matrix, scale = ln(2)/2 = 0.346574\\\n"
125  " Expected score = -5.06, Entropy = 2.57 bits\\\n"
126  " Lowest score = -17, Highest score = 13";
127  } else if (sm == "pam70") {
128  psm = &NCBISM_Pam70;
129  desc =
130  " This matrix was produced by \\\"pam\\\" Version 1.0.6 [28-Jul-93]\\\n"
131  " PAM 70 substitution matrix, scale = ln(2)/2 = 0.346574\\\n"
132  " Expected score = -2.77, Entropy = 1.60 bits\\\n"
133  " Lowest score = -11, Highest score = 13";
134  } else if (sm == "pam250") {
135  psm = &NCBISM_Pam250;
136  desc =
137  " This matrix was produced by \\\"pam\\\" Version 1.0.7 [01-Feb-98]\\\n"
138  " using Dayhoff et al. (1978) mutability data.\\\n"
139  " PAM 250 substitution matrix, scale = ln(2)/3 = 0.231049\\\n"
140  " Expected score = -0.844, Entropy = 0.354 bits\\\n"
141  " Lowest score = -8, Highest score = 17";
142  } else {
143  _TROUBLE;
144  }
145 
146  out_map["Name"] = sm;
147  out_map["Builtin"] = sm;
148  out_map["Description"] = desc.empty() ? sm : desc;
149 
150 
151  int maxscore = numeric_limits<int>::min();
152  int minscore = numeric_limits<int>::max();
153 
154  int l = strlen(psm->symbols);
155 
156  // find max and min.
157  for (int i = 0; i < l; ++i) {
158  if (isupper((unsigned char) psm->symbols[i])) {
159  for (int j = 0; j < l; ++j) {
160  int value = psm->scores[i * l + j];
161  maxscore = max(value, maxscore);
162  minscore = min(value, minscore);
163  }
164  }
165  }
166  out_map["MinimumValue"] = NStr::IntToString(minscore);
167  out_map["MaximumValue"] = NStr::IntToString(maxscore);
168 }
169 
170 
171 static void sWriteLine(CNcbiOstream& out, map<string, string>& out_map, const string& name)
172 {
173  if ( ! out_map[name].empty()) {
174  out << name << " = " << out_map[name] << endl;
175  }
176 }
177 
178 
179 static bool sReadAA_M(
180  CNcbiIstream& in,
181  const string& m_line,
182  map<string, string>& out_map )
183 {
184  out_map["Method"] = "MatrixScore";
185 
186  // parse the rest of the M line.
187  // look for '='. The token after the first is the rows.
188  // the token after the second '=' is the columns.
189  list<string> toks;
190  NStr::Split(m_line, " ,", toks, NStr::fSplit_Tokenize);
191  list<string>::iterator tok_it;
192  tok_it = find(toks.begin(), toks.end(), "=");
193  if (tok_it == toks.end())
194  return false;
195  ++tok_it;
196  string row_bases(*tok_it);
197 
198  tok_it = find(++tok_it, toks.end(), "=");
199  if (tok_it == toks.end())
200  return false;
201  ++tok_it;
202  string col_bases(*tok_it);
203 
204  // make the Columns line.
205  int cols = col_bases.size();
206  string symbols(1, col_bases[0]);
207  for (int i = 1; i < cols; ++i) {
208  char c = col_bases[i];
209  if (isupper((unsigned char) c) || c == '-') {
210  symbols += " ";
211  symbols += c;
212  }
213  }
214  out_map["Columns"] = symbols;
215 
216  // Make all the TableRows lines.
217  string line;
218  string out_rows;
219  int rows = row_bases.size();
220  for (int r = 0; r < rows; ++r) {
221  if (! getline(in, line))
222  break;
223  out_rows += row_bases[r];
224  out_rows += " =";
225  out_rows += line;
226  out_rows += '\n';
227  }
228  out_map["TableRows"] = out_rows;
229  return true;
230 }
231 
// The aaindex1 file stores one score per amino acid and always lists the
// amino acids in the same order.  This is that order.
static const string kAAIndexOrder = "ARNDCQEGHILKMFPSTWYV";
235 
237 {
238  out_map["Method"] = "ColumnScore";
239 
240  string line1, line2;
241  string out_rows;
242  getline(in, line1);
243  getline(in, line2);
244  list<string> scores;
245  NStr::Split(line1 + line2, " ", scores, NStr::fSplit_Tokenize);
246 
247  int r = 0;
248  ITERATE(list<string>, score_it, scores) {
249  out_rows += kAAIndexOrder[r];
250  out_rows += " = ";
251  out_rows += *score_it;
252  out_rows += '\n';
253  ++r;
254  }
255  out_map["TableRows"] = out_rows;
256  return true;
257 }
258 
259 
260 bool s_ReadAAIndex(const string& accession, CNcbiIstream& in, map<string, string>& out_map)
261 {
262  string line;
263  // skip till we find the accession.
264  while (getline(in, line)) {
265  if (line == "H " + accession)
266  break;
267  }
268  if ( ! in.good()) {
269  cerr << "Accession \"" << accession << "\" not found." << endl;
270  return false;
271  }
272 
273  out_map["Name"] = NStr::TruncateSpaces(line.substr(2));
274 
275  string current_key;
276  while (getline(in, line)) {
277  if (line.empty())
278  continue;
279  char command = line[0];
280  line = NStr::TruncateSpaces(line.substr(2));
281  switch(command) {
282  case 'D': // Data description
283  // assume that 'D' will come before the following.
284  current_key = "Description";
285  out_map[current_key] = line;
286  break;
287  case 'R': // LITDB entry number
288  case 'A': // Author(s)
289  case 'T': // Title of the article
290  case 'J': // Journal reference
291  // tack all of these on to the description.
292  current_key = "Description";
293  out_map[current_key] += "\\\n " + line;
294  break;
295  case ' ': // continuation lines.
296  if ( ! current_key.empty()) {
297  out_map[current_key] += "\\\n " +line;
298  }
299  break;
300  case '*': // Comment or missing
301  case 'C': // Accession numbers of similar entries
302  // ignore these.
303  current_key.erase();
304  break;
305  case 'M': // Matrix data
306  if (! sReadAA_M(in, line, out_map)) {
307  cerr << "Bad format in M section at accession \""
308  << accession << "\"" << endl;
309  return false;
310  }
311  return true;
312  case 'I': // Amino acid index data
313  if (! sReadAA_I(in, out_map)) {
314  cerr << "Bad format in I section at accession \""
315  << accession << "\"" << endl;
316  return false;
317  }
318  return true;
319  case '/':
320  return false;
321  }
322  }
323  cerr << "No I or M section at accession \""
324  << accession << "\"" << endl;
325  return false;
326 }
327 
328 
330 {
331  CArgs args = GetArgs();
332  CNcbiOstream& out = args["out"].AsOutputFile();
333 
334  if (args["sm"] && args["aa"] ) {
335  string msg = "Options -sm and -aa are mutually exclusive.\n";
336  cerr << GetArgDescriptions()->PrintUsage(msg);
337  return 1;
338  }
339 
340  // We could use a CNcbiRegistry to store and write out the lines
341  // instead of this map.
342  // But then we couldn't specify the order the lines in the file
343  // and basically the resulting files would work just as well
344  // but would look a lot worse.
345 
346  map<string, string> out_line;
347 
348  out_line["Method"] = "MatrixScore";
349  out_line["Type"] = "Protein";
350  out_line["MinimumColor"] = "yellow3";
351  out_line["MaximumColor"] = "royal blue";
352 
353  if (args["sm"]) {
354  s_BuiltInSM(args["sm"].AsString(), out_line);
355  } else if (args["aa"]) {
356  if (! s_ReadAAIndex(args["aa"].AsString(), args["in"].AsInputFile(), out_line)) {
357  return 1;
358  }
359  }
360 
361  // specify order of the output lines here.
362  out << "[Info]" << endl;
363  sWriteLine(out, out_line, "Name");
364  sWriteLine(out, out_line, "Description");
365  sWriteLine(out, out_line, "Method");
366  sWriteLine(out, out_line, "Type");
367 
368  out << "[Table]" << endl;
369  sWriteLine(out, out_line, "MinimumValue");
370  sWriteLine(out, out_line, "MinimumColor");
371  sWriteLine(out, out_line, "MaximumValue");
372  sWriteLine(out, out_line, "MaximumColor");
373  if (args["sm"]) {
374  sWriteLine(out, out_line, "Builtin");
375  } else {
376  sWriteLine(out, out_line, "Columns");
377 
378  out << "[TableRows]" << endl;
379  out << out_line["TableRows"];
380  }
381 
382  return 0;
383 }
384 
385 
387 
389 
390 int NcbiSys_main(int argc, ncbi::TXChar* argv[])
391 {
392  // Execute main application function
393  return CMakeScoreMethodApp().AppMain(argc, argv);
394 }
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
int Run(void)
Run the application.
void Init(void)
Initialize the application.
void erase(iterator pos)
Definition: map.hpp:167
size_type size() const
Definition: map.hpp:148
bool empty() const
Definition: map.hpp:149
std::ofstream out("events_result.xml")
main entry point for tests
static char line1[1024 *16]
Definition: t0016.c:98
static char line2[1024 *16]
Definition: t0016.c:99
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:819
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1195
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
const CArgDescriptions * GetArgDescriptions(void) const
Get argument descriptions (set by SetupArgDescriptions)
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
virtual string & PrintUsage(string &str, bool detailed=false) const
Print usage message to end of specified string.
Definition: ncbiargs.cpp:3815
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
char TXChar
Definition: ncbistr.hpp:172
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3186
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
int i
static bool sReadAA_I(CNcbiIstream &in, map< string, string > &out_map)
static const string kAAIndexOrder("ARNDCQEGHILKMFPSTWYV")
int NcbiSys_main(int argc, ncbi::TXChar *argv[])
static void s_BuiltInSM(const string &sm, map< string, string > &out_map)
static void sWriteLine(CNcbiOstream &out, map< string, string > &out_map, const string &name)
static bool sReadAA_M(CNcbiIstream &in, const string &m_line, map< string, string > &out_map)
bool s_ReadAAIndex(const string &accession, CNcbiIstream &in, map< string, string > &out_map)
static string s_FormatAA(int aa)
USING_NCBI_SCOPE
constexpr bool empty(list< Ts... >) noexcept
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
int isprint(Uchar c)
Definition: ncbictype.hpp:67
int isupper(Uchar c)
Definition: ncbictype.hpp:70
Process information in the NCBI Registry, including working with configuration files.
const char * command
T max(T x_, T y_)
T min(T x_, T y_)
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
const SNCBIPackedScoreMatrix NCBISM_Pam30
Definition: sm_pam30.c:92
const SNCBIPackedScoreMatrix NCBISM_Blosum62
Definition: sm_blosum62.c:92
const SNCBIPackedScoreMatrix NCBISM_Pam250
Definition: sm_pam250.c:92
const SNCBIPackedScoreMatrix NCBISM_Blosum80
Definition: sm_blosum80.c:92
const SNCBIPackedScoreMatrix NCBISM_Pam70
Definition: sm_pam70.c:92
const SNCBIPackedScoreMatrix NCBISM_Blosum45
The standard matrices.
Definition: sm_blosum45.c:92
const TNCBIScore * scores
strlen(symbols) x strlen(symbols)
Definition: raw_scoremat.h:48
const char * symbols
order of residues
Definition: raw_scoremat.h:47
#define _TROUBLE
Modified on Wed Jun 19 17:03:50 2024 by modify_doxy.py rev. 669887