NCBI C++ ToolKit
nwa.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: nwa.cpp 92154 2020-12-22 17:11:30Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Yuri Kapustin
27  *
28  * File Description: Pairwise global alignment utility.
29  *
30 */
31 
32 #include <ncbi_pch.hpp>
33 
34 #include "nwa.hpp"
35 
39 
42 
45 
47 {
49 
50  unique_ptr<CArgDescriptions> argdescr(new CArgDescriptions);
51  argdescr->SetUsageContext(GetArguments().GetProgramName(),
52  "Demo application using xalgoalign library");
53 
54  argdescr->AddDefaultKey
55  ("matrix", "matrix", "scoring matrix",
57 
58  argdescr->AddKey
59  ("seq1", "seq1",
60  "the first input sequence in fasta file",
62  argdescr->AddKey
63  ("seq2", "seq2",
64  "the second input sequence in fasta file",
66 
67  argdescr->AddDefaultKey
68  ("esf", "esf",
69  "End-space free alignment. Format: lrLR where each character "
70  "can be z (free end) or x (regular end) representing "
71  "left and right ends. First sequence's ends are specified first.",
73  "xxxx");
74 
75  argdescr->AddDefaultKey
76  ("gp", "gp",
77  "gap preference: earlier or later",
79  "later");
80 
81  argdescr->AddDefaultKey
82  ("Wm", "match", "match bonus (nucleotide sequences)",
85 
86  argdescr->AddDefaultKey
87  ("Wms", "mismatch", "mismatch penalty (nucleotide sequences)",
90 
91  argdescr->AddDefaultKey
92  ("Wg", "gap", "gap opening penalty",
95 
96  argdescr->AddDefaultKey
97  ("Ws", "space", "gap extension (space) penalty",
100 
101  argdescr->AddDefaultKey
102  ("band", "band", "Band width in banded alignment",
104 
105  argdescr->AddDefaultKey
106  ("shift", "shift",
107  "Band shift in banded alignment "
108  "(specify negative value to indicate second sequence)",
110 
111  argdescr->AddFlag("sw",
112  "run local alignment (Smith-Waterman)");
113 
114  argdescr->AddFlag("mm",
115  "Use linear-memory alignment algorithm (Myers & Miller)");
116 
117  argdescr->AddFlag("mt", "Use multiple threads");
118 
119  // output formats
120  argdescr->AddOptionalKey
121  ("o1", "o1", "Filename for type 1 output", CArgDescriptions::eString);
122 
123  argdescr->AddOptionalKey
124  ("o2", "o2", "Filename for type 2 output", CArgDescriptions::eString);
125 
126  argdescr->AddOptionalKey
127  ("ofasta", "ofasta",
128  "Generate gapped FastA output for the aligner sequences",
130 
131  argdescr->AddOptionalKey
132  ("oasn", "oasn", "ASN.1 output filename", CArgDescriptions::eString);
133 
134  CArgAllow_Strings* paa_st = new CArgAllow_Strings;
135  paa_st->Allow("nucl")->Allow("blosum62");
136  argdescr->SetConstraint("matrix", paa_st);
137 
138  CArgAllow_Strings* paa_esf = new CArgAllow_Strings;
139  paa_esf->Allow("xxxx")->Allow("xxxz")->Allow("xxzx")->Allow("xxzz");
140  paa_esf->Allow("xzxx")->Allow("xzxz")->Allow("xzzx")->Allow("xzzz");
141  paa_esf->Allow("zxxx")->Allow("zxxz")->Allow("zxzx")->Allow("zxzz");
142  paa_esf->Allow("zzxx")->Allow("zzxz")->Allow("zzzx")->Allow("zzzz");
143  argdescr->SetConstraint("esf", paa_esf);
144 
145  CArgAllow_Strings* paa_gp = new CArgAllow_Strings;
146  paa_gp->Allow("earlier")->Allow("later");
147  argdescr->SetConstraint("gp", paa_gp);
148 
149  SetupArgDescriptions(argdescr.release());
150 }
151 
152 
154 {
155  x_RunOnPair();
156  return 0;
157 }
158 
159 
160 unique_ptr<ofstream> open_ofstream (const string& filename) {
161 
162  unique_ptr<ofstream> pofs0 ( new ofstream (filename.c_str()) );
163  if(*pofs0) {
164  return pofs0;
165  }
166  else {
168  eCannotWriteFile,
169  "Cannot write to file" + filename);
170  }
171 }
172 
173 
175 {
176  const CArgs& args = GetArgs();
177 
178  // analyze parameters
179  const bool bMM = args["mm"];
180  const bool bMT = args["mt"];
181 
182  bool output_type1 ( args["o1"] );
183  bool output_type2 ( args["o2"] );
184  bool output_asn ( args["oasn"] );
185  bool output_fasta ( args["ofasta"] );
186 
187  int band (args["band"].AsInteger());
188  int shift(args["shift"].AsInteger());
189 
190  if(bMT && !bMM) {
192  eInconsistentParameters,
193  "Mutliple thread mode supported "
194  "for Myers-Miller method only (invoke with -mm)");
195  }
196 
197  if(bMM && band >= 0) {
199  eInconsistentParameters,
200  "-mm and -band are inconsistent with each other");
201  }
202 
203 #ifndef NCBI_THREADS
204  if(bMT) {
206  eNotSupported,
207  "This application was built without multithreading support. "
208  "To run in multiple threads, please re-configure and rebuild"
209  " with proper options.");
210  }
211 
212 #endif
213 
214  // read input sequences
215  vector<char> v1, v2;
216 
217  CRef<CSeq_id> seqid1 = x_ReadFastaFile(args["seq1"].AsString(), &v1);
218  CRef<CSeq_id> seqid2 = x_ReadFastaFile(args["seq2"].AsString(), &v2);
219 
220  // determine sequence/score matrix type
221  const SNCBIPackedScoreMatrix* psm =
222  (args["matrix"].AsString() == "blosum62")? &NCBISM_Blosum62: 0;
223 
224  CNWAligner* pnwaligner = 0;
225  if(bMM) {
226  pnwaligner = new CMMAligner(&v1[0], v1.size(), &v2[0], v2.size(), psm);
227  }
228  else if (band < 0) {
229  pnwaligner = new CNWAligner(&v1[0], v1.size(), &v2[0], v2.size(), psm);
230  }
231  else {
232  CBandAligner * ba = new CBandAligner(&v1[0], v1.size(), &v2[0], v2.size(),
233  psm, band);
234  Uint1 where = shift >= 0? 0: 1;
235  ba->SetShift(where,abs(shift));
236  pnwaligner = ba;
237  }
238 
239  unique_ptr<CNWAligner> aligner (pnwaligner);
240 
241  if(psm == NULL) {
242  aligner->SetWm (args["Wm"]. AsInteger());
243  aligner->SetWms (args["Wms"].AsInteger());
244  aligner->SetScoreMatrix(NULL);
245  }
246  aligner->SetWg (args["Wg"]. AsInteger());
247  aligner->SetWs (args["Ws"]. AsInteger());
248 
249  aligner->SetScoreMatrix(psm); // re-set score matrix to handle
250  // possible ambiguity chars
251 
252  if(bMT && bMM) {
253  CMMAligner* pmma = static_cast<CMMAligner*> (aligner.get());
254  pmma -> EnableMultipleThreads();
255  }
256 
257  unique_ptr<ofstream> pofs1;
258  unique_ptr<ofstream> pofs2;
259  unique_ptr<ofstream> pofsAsn;
260  unique_ptr<ofstream> pofsFastA;
261 
262  if(output_type1) {
263  pofs1.reset(open_ofstream (args["o1"].AsString()).release());
264  }
265 
266  if(output_type2) {
267  pofs2.reset(open_ofstream (args["o2"].AsString()).release());
268  }
269 
270  if(output_asn) {
271  pofsAsn.reset(open_ofstream (args["oasn"].AsString()).release());
272  }
273 
274  if(output_fasta) {
275  pofsFastA.reset(open_ofstream (args["ofasta"].AsString()).release());
276  }
277 
278  {{ // setup end penalties
279  string ends = args["esf"].AsString();
280  bool L1 = ends[0] == 'z';
281  bool R1 = ends[1] == 'z';
282  bool L2 = ends[2] == 'z';
283  bool R2 = ends[3] == 'z';
284  aligner->SetEndSpaceFree(L1, R1, L2, R2);
285  }}
286 
287  aligner->SetSmithWaterman(args["sw"]);
288 
289  if( args["gp"].AsString() == "earlier" ) {
290  aligner->SetGapPreference(CNWAligner::eEarlier);
291  } else if (args["gp"].AsString() == "later") {
292  aligner->SetGapPreference(CNWAligner::eLater);
293  } else {
294  NCBI_THROW(CException, eUnknown, "unknown value for \"gp\"");
295  }
296 
297  int score = aligner->Run();
298  cerr << "Score = " << score << endl;
299 
300  CNWFormatter formatter (*aligner);
301  formatter.SetSeqIds(seqid1, seqid2);
302 
303  const size_t line_width = 100;
304  string s;
305  if(pofs1.get()) {
306  formatter.AsText(&s, CNWFormatter::eFormatType1, line_width);
307  *pofs1 << s;
308  }
309 
310  if(pofs2.get()) {
311  formatter.AsText(&s, CNWFormatter::eFormatType2, line_width);
312  *pofs2 << s;
313  }
314 
315  if(pofsAsn.get()) {
316  formatter.AsText(&s, CNWFormatter::eFormatAsn, line_width);
317  *pofsAsn << s;
318  }
319 
320  if(pofsFastA.get()) {
321  formatter.AsText(&s, CNWFormatter::eFormatFastA, line_width);
322  *pofsFastA << s;
323  }
324 
325  if(!output_type1 && !output_type2
326  && !output_asn && !output_fasta)
327  {
328  formatter.AsText(&s, CNWFormatter::eFormatType2, line_width);
329  cout << s;
330  }
331 }
332 
333 
335 {
336  return;
337 }
338 
339 
340 CRef<CSeq_id> CAppNWA::x_ReadFastaFile (const string& filename,
341  vector<char>* sequence) const
342 {
343  vector<char>& vOut = *sequence;
344  vOut.clear();
345 
346  ifstream ifs(filename.c_str());
347 
348  // read the defline
349  string str;
350  getline(ifs, str);
351 
352  if( str[0] == '>' ) { //cut leading '>'
353  str = str.substr(1);
354  }
355 
356  CRef<CSeq_id> seqid;
357  try {
358  seqid.Reset(new CSeq_id(str));
359  } catch (CSeqIdException&) {
360  seqid.Reset(new CSeq_id(CSeq_id::e_Local, str));
361  }
362 
363  // read the sequence
364  while ( ifs ) {
365  string s;
366  ifs >> s;
367  NStr::ToUpper(s);
368  copy(s.begin(), s.end(), back_inserter(vOut));
369  }
370 
371  return seqid;
372 }
373 
374 
376 
377 
379 
380 int main(int argc, const char* argv[])
381 {
382  return CAppNWA().AppMain(argc, argv, 0, eDS_Default, 0);
383 }
Definition: nwa.hpp:86
virtual int Run()
Run the application.
Definition: nwa.cpp:153
CRef< objects::CSeq_id > x_ReadFastaFile(const string &filename, vector< char > *sequence) const
Definition: nwa.cpp:340
virtual void Exit()
Cleanup on application exit.
Definition: nwa.cpp:334
void x_RunOnPair() const
Definition: nwa.cpp:174
virtual void Init()
Initialize the application.
Definition: nwa.cpp:46
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
CSeqIdException –.
Definition: Seq_id.hpp:1001
static const char * str(char *buf, int n)
Definition: stats.c:84
void AsText(string *output, ETextFormatType type, size_t line_width=100) const
void SetSeqIds(CConstRef< objects::CSeq_id > id1, CConstRef< objects::CSeq_id > id2)
static TScore GetDefaultWg(void)
Definition: nw_aligner.hpp:161
static TScore GetDefaultWms(void)
Definition: nw_aligner.hpp:160
static TScore GetDefaultWs(void)
Definition: nw_aligner.hpp:162
void SetShift(Uint1 where, size_t offset)
static TScore GetDefaultWm(void)
Definition: nw_aligner.hpp:159
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
Definition: ncbiapp.cpp:1325
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:832
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1208
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ fHideLogfile
Hide log file description.
@ fHideConffile
Hide configuration file description.
@ fHideVersion
Hide version description.
CArgAllow_Strings * Allow(const string &value)
Add allowed string values.
Definition: ncbiargs.cpp:4598
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
#define NULL
Definition: ncbistd.hpp:225
@ eDS_Default
Try standard log file (app.name + ".log") in /log/, use stderr on failure.
Definition: ncbidiag.hpp:1790
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const CVect2< U > & v2
Definition: globals.hpp:440
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5086
static string & ToUpper(string &str)
Convert string to upper case – string& version.
Definition: ncbistr.cpp:424
@ e_Local
local use
Definition: Seq_id_.hpp:95
#define abs(a)
Definition: ncbi_heapmgr.c:130
#define GetProgramName
Avoid name clash with the NCBI C Toolkit.
Definition: ncbienv.hpp:49
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
USING_SCOPE(objects)
unique_ptr< ofstream > open_ofstream(const string &filename)
Definition: nwa.cpp:160
int main(int argc, const char *argv[])
Definition: nwa.cpp:380
USING_NCBI_SCOPE
Definition: nwa.cpp:378
const SNCBIPackedScoreMatrix NCBISM_Blosum62
Definition: sm_blosum62.c:92
Modified on Wed Sep 04 15:05:46 2024 by modify_doxy.py rev. 669887