1 /*
2 * ===========================================================================
3 *
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's offical duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
26 /*****************************************************************************
28 File name: cobalt_app.cpp
30 Author: Jason Papadopoulos
32 Contents: C++ driver for COBALT multiple alignment algorithm
34 ******************************************************************************/
36 #include <ncbi_pch.hpp>
37 #include <corelib/ncbiapp.hpp>
38 #include <corelib/ncbifile.hpp>
39 #include <corelib/ncbitime.hpp>
43 #include <serial/iterator.hpp>
52 #include <algo/cobalt/cobalt.hpp>
53 #include <algo/cobalt/version.hpp>
55 #include "cobalt_app_util.hpp"
59 USING_SCOPE(align_format);
60 USING_SCOPE(cobalt);
63 {
64 public:
    // NOTE(review): the class header, the constructor signature and the
    // declaration of `version` are not visible in this listing.
    // The visible part of the constructor installs a CMultiAlignerVersion
    // object as the version info and, when usage reporting is enabled,
    // records the printed version and the program name "cobalt".
67  version->SetVersionInfo(new CMultiAlignerVersion());
71  if (m_UsageReport.IsEnabled()) {
72  m_UsageReport.AddParam(blast::CBlastUsageReport::eVersion,
73  GetVersion().Print());
74  m_UsageReport.AddParam(blast::CBlastUsageReport::eProgram,
75  (string)"cobalt");
76  }
77  }
    // NOTE(review): the header of the next member is elided (presumably the
    // destructor -- confirm). It adds the elapsed time of m_StopWatch to the
    // usage report as the run time.
80  m_UsageReport.AddParam(blast::CBlastUsageReport::eRunTime, m_StopWatch.Elapsed());
81  }
83 private:
    // Virtual member functions implemented outside the class body
    // (presumably the application hooks reached through AppMain() -- confirm
    // against the base class, which is not visible here).
84  virtual void Init(void);
85  virtual int Run(void);
86  virtual void Exit(void);
    // Collector for usage-report parameters (version, program, run time,
    // number of queries, exit status are added in this file).
89  blast::CBlastUsageReport m_UsageReport;
91 };
93 // Get the tree computation method as a string that can be used to
94 // initialize the default value of a command line option.
// Mapping: eFastME -> "fastme", eNJ -> "nj", eClusters -> "clust";
// any other value yields an empty string.
// NOTE(review): the signature line of this helper is not visible in this
// listing; `method` is presumably its only parameter.
96 {
97  switch (method) {
98  case CMultiAlignerOptions::eFastME : return "fastme";
99  case CMultiAlignerOptions::eNJ : return "nj";
100  case CMultiAlignerOptions::eClusters : return "clust";
101  default: return "";
102  }
103 }
105 // Get the k-mer alphabet as a string that can be used to initialize the
106 // default value of a command line option.
// Mapping: eRegular -> "regular", eSE_V10 -> "se-v10", eSE_B15 -> "se-b15";
// any other value yields an empty string.
// NOTE(review): the signature line of this helper is not visible in this
// listing; `alph` is presumably its only parameter.
109 {
110  switch (alph) {
111  case CMultiAligner::TKMethods::eRegular : return "regular";
112  case CMultiAligner::TKMethods::eSE_V10 : return "se-v10";
113  case CMultiAligner::TKMethods::eSE_B15 : return "se-b15";
114  default : return "";
115  }
116 }
119 {
    // Declares every command line argument of the application, grouped by
    // topic, and hands the finished description to SetupArgDescriptions().
    // NOTE(review): the function signature (presumably
    // CMultiApplication::Init) and several continuation lines carrying
    // argument types and default values are not visible in this listing.
121  | fHideDryRun);
123  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
125  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
126  "COBALT multiple sequence alignment utility");
128  // Input sequences
    // Either a single sequence file (-i) or a pair of alignments
    // (-in_msa1 / -in_msa2, optionally with -ind1 / -ind2) is accepted;
    // the dependencies set below enforce this.
129  arg_desc->SetCurrentGroup("Input");
130  arg_desc->AddOptionalKey("i", "infile", "File containing input sequences "
131  "in FASTA format", CArgDescriptions::eInputFile);
133  arg_desc->AddOptionalKey("in_msa1", "infile", "File containing input "
134  "alignment in FASTA format",
137  arg_desc->AddOptionalKey("in_msa2", "infile", "File containing input "
138  "alignment in FASTA format",
141  arg_desc->AddOptionalKey("ind1", "numbers", "Coma separated list of "
142  "sequence indices in MSA1 to be used for "
143  "constraints generation",
146  arg_desc->AddOptionalKey("ind2", "numbers", "Coma separated list of "
147  "sequence indices in MSA2 to be used for "
148  "constraints generation",
    // -i excludes all of the MSA-related arguments
151  arg_desc->SetDependency("i", CArgDescriptions::eExcludes, "in_msa1");
152  arg_desc->SetDependency("i", CArgDescriptions::eExcludes, "in_msa2");
153  arg_desc->SetDependency("i", CArgDescriptions::eExcludes, "ind1");
154  arg_desc->SetDependency("i", CArgDescriptions::eExcludes, "ind2");
    // the two input alignments require each other
156  arg_desc->SetDependency("in_msa1", CArgDescriptions::eRequires, "in_msa2");
157  arg_desc->SetDependency("in_msa2", CArgDescriptions::eRequires, "in_msa1");
    // each index list requires the alignment it refers to
159  arg_desc->SetDependency("ind1", CArgDescriptions::eRequires, "in_msa1");
160  arg_desc->SetDependency("ind2", CArgDescriptions::eRequires, "in_msa2");
162  arg_desc->AddFlag("parse_deflines", "Should the sequence deflines be "
163  "parsed?");
166  // Conserved domain options
167  arg_desc->SetCurrentGroup("Conserved domain options");
168  arg_desc->AddOptionalKey("rpsdb", "database", "Conserved domain database "
169  "name\nEither database or -norps option must be "
170  "specified", CArgDescriptions::eString);
171  arg_desc->AddDefaultKey("norps", "norps", "Do not perform initial "
172  "RPS-BLAST search",
174  arg_desc->AddDefaultKey("rps_evalue", "evalue",
175  "E-value threshold for selecting conserved domains"
176  " from results of RPS-BLAST search",
179  arg_desc->AddDefaultKey("num_domain_hits", "number", "Maximum number of "
180  "of domain hits for each sequence",
183  arg_desc->AddOptionalKey("p", "patternfile",
184  "Filename containing regular expression patterns "
185  "for conserved domains",
187  arg_desc->AddDefaultKey("dfb", "domain_res_boost",
188  "When assigning domain residue frequencies, the amount of "
189  "extra weight (0..1) to give to the actual sequence letter "
190  "at that position",
194  arg_desc->AddOptionalKey("domain_hits", "infile", "Results of pre-computed"
195  " domain search in BLAST archive format",
    // pre-computed domain hits need both a domain database and parsed deflines
198  arg_desc->SetDependency("domain_hits", CArgDescriptions::eRequires, "rpsdb");
199  arg_desc->SetDependency("domain_hits", CArgDescriptions::eRequires,
200  "parse_deflines");
203  // User constraints options
204  arg_desc->SetCurrentGroup("Constraints options");
205  arg_desc->AddOptionalKey("c", "constraintfile",
206  "Filename containing pairwise alignment constraints, "
207  "one per line, each represented by 6 integers:\n"
208  " -zero-based index of sequence 1 in the input file\n"
209  " -zero-based start position in sequence 1\n"
210  " -zero-based stop position in sequence 1\n"
211  " -zero-based index of sequence 2 in the input file\n"
212  " -zero-based start position in sequence 2\n"
213  " -zero-based stop position in sequence 2\n",
217  // Multiple alignment options
218  arg_desc->SetCurrentGroup("Multiple alignment options");
219  arg_desc->AddDefaultKey("treemethod", "method",
220  "Method for generating progressive alignment guide tree",
    // allowed values match the strings compared against in Run()
223  arg_desc->SetConstraint("treemethod", &(*new CArgAllow_Strings,
224  "clust", "nj", "fastme"));
225  arg_desc->AddDefaultKey("iter", "iterate",
226  "After the first iteration search for conserved columns "
227  "and realign if any are found",
229  arg_desc->AddDefaultKey("ccc", "conserved_cutoff",
230  "Minimum average score needed for a multiple alignment "
231  "column to be considered as conserved",
234  arg_desc->AddDefaultKey("pseudo", "pseudocount",
235  "Pseudocount constant",
238  arg_desc->AddDefaultKey("ffb", "filler_res_boost",
239  "When assigning filler residue frequencies, the amount of "
240  "extra weight (0..1) to give to the actual sequence letter "
241  "at that position",
246  // Pairwise alignment options
247  arg_desc->SetCurrentGroup("Pairwise alignment options");
248  arg_desc->AddDefaultKey("matrix", "matrix",
249  "Score matrix to use",
251  arg_desc->AddDefaultKey("end_gapopen", "penalty",
252  "Gap open penalty for terminal gaps",
255  arg_desc->AddDefaultKey("end_gapextend", "penalty",
256  "Gap extend penalty for terminal gaps",
259  arg_desc->AddDefaultKey("gapopen", "penalty",
260  "Gap open penalty for internal gaps",
263  arg_desc->AddDefaultKey("gapextend", "penalty",
264  "Gap extend penalty for internal gaps",
267  arg_desc->AddDefaultKey("blast_evalue", "evalue",
268  "E-value threshold for selecting segments matched "
269  "by BLASTP",
274  // Query clustering options
275  arg_desc->SetCurrentGroup("Query clustering options");
276  arg_desc->AddDefaultKey("clusters", "clusters",
277  "Use query clustering for faster alignment",
279  arg_desc->AddDefaultKey("k", "length",
280  "K-mer length for query clustering",
283  arg_desc->AddDefaultKey("max_dist", "distance",
284  "Maximum allowed distance between sequences in a cluster"
285  " (0..1)",
288  arg_desc->AddDefaultKey("alph", "name",
289  "Alphabet for used k-mer counting",
    // allowed values match the strings compared against in Run()
292  arg_desc->SetConstraint("alph", &(*new CArgAllow_Strings, "regular",
293  "se-v10", "se-b15"));
296  // Output options
297  arg_desc->SetCurrentGroup("Output options");
298  arg_desc->AddOptionalKey("seqalign", "file",
299  "Output text seqalign to specified file",
301  arg_desc->AddOptionalKey("outfmt", "format", "Output format for multiple "
302  "alignment", CArgDescriptions::eString);
303  arg_desc->SetConstraint("outfmt", &(*new CArgAllow_Strings, "mfasta",
304  "clustalw", "phylip", "nexus"));
305  arg_desc->AddFlag("v", "Verbose output");
    // ownership of the description object is released to the callee
308  SetupArgDescriptions(arg_desc.release());
309 }
// Read pairwise alignment constraints from a text file.
//
// Each record consists of six integers read with stream extraction:
// sequence 1 index, start, end, then sequence 2 index, start, end.
//
// constraintfile  path of the file to read
// constr          output list; cleared first, then one SConstraint is
//                 appended per record
//
// Throws CMultiAlignerException (eInvalidInput) when the open check fails.
// NOTE(review): part of the open-check condition and the declaration of `c`
// are not visible in this listing.
312 static void
313 x_LoadConstraints(string constraintfile,
314  vector<CMultiAlignerOptions::SConstraint>& constr)
315 {
316  CNcbiIfstream f(constraintfile.c_str());
317  if (f.bad() ||
318  NCBI_THROW(CMultiAlignerException, eInvalidInput,
319  "Cannot open file with pairwise constraints");
321  int seq1, seq1_start, seq1_end;
322  int seq2, seq2_start, seq2_end;
324  constr.clear();
    // the first record is read and stored unconditionally
326  f >> seq1 >> seq1_start >> seq1_end;
327  f >> seq2 >> seq2_start >> seq2_end;
329  c(seq1, seq1_start, seq1_end, seq2, seq2_start, seq2_end);
330  constr.push_back(c);
    // NOTE(review): the loop tests only eof(); a non-numeric token would set
    // the fail state without reaching end of file -- verify that the loop
    // terminates for malformed input.
332  while (!f.eof()) {
    // negative sentinel: a record is kept only if seq1 is non-negative
    // after the read attempt
333  seq1 = -1;
335  f >> seq1 >> seq1_start >> seq1_end;
336  f >> seq2 >> seq2_start >> seq2_end;
337  if (seq1 >= 0) {
338  constr.push_back(CMultiAlignerOptions::SConstraint(seq1,
339  seq1_start, seq1_end, seq2, seq2_start, seq2_end));
340  }
341  }
342 }
// Read conserved-domain patterns from a text file.
//
// The file is read as whitespace-delimited tokens; every non-empty token is
// appended to the output list as one pattern.
//
// patternsfile  path of the file to read
// patterns      output list; cleared before reading
//
// Throws CMultiAlignerException (eInvalidInput) when the open check fails.
// NOTE(review): part of the open-check condition is not visible in this
// listing.
345 static void
346 x_LoadPatterns(string patternsfile,
347  vector<CMultiAlignerOptions::CPattern>& patterns)
348 {
349  CNcbiIfstream f(patternsfile.c_str());
350  if (f.bad() ||
351  NCBI_THROW(CMultiAlignerException, eInvalidInput,
352  "Cannot open patterns file");
354  patterns.clear();
356  while (!f.eof()) {
357  string single_pattern;
359  f >> single_pattern;
    // a failed extraction leaves the string empty, so nothing is stored
361  if (!single_pattern.empty()) {
362  patterns.push_back(single_pattern);
363  }
364  }
365 }
369 {
    // Main body of the application: validates the command line, fills the
    // aligner options, loads either a set of sequences or two input
    // alignments, runs the aligner and prints the result.
    //
    // Return value as visible here: 0 on success, 1 for missing RPS database
    // files, an unknown tree method or non-unique representative indices,
    // and the aligner status when the aligner does not report success.
    // Conflicting or missing -rpsdb / -norps settings raise
    // CMultiAlignerException (eInvalidInput).
    //
    // NOTE(review): the function signature (presumably
    // CMultiApplication::Run) and the declarations of `opts`, `scope`,
    // `flags`, `tree_method`, `alph`, `archive`, `msa1` and `msa2` are not
    // visible in this listing.
370  // Allow the fasta reader to complain on
371  // invalid sequence input
374  // Process command line args
375  const CArgs& args = GetArgs();
    // exactly one of "-rpsdb <name>" and "-norps T" has to be in effect
378  if (args["rpsdb"] && args["norps"].AsBoolean()) {
379  NCBI_THROW(CMultiAlignerException, eInvalidInput,
380  "The options -rpsdb and -norps T are mutually exclusive");
381  }
383  if (!args["rpsdb"] && !args["norps"].AsBoolean()) {
384  NCBI_THROW(CMultiAlignerException, eInvalidInput,
385  "RPS dababase not specified");
386  }
389  // Set up data loaders
396  // PSSM aligner parameters
    // penalties from the command line are passed on with their sign flipped
397  opts->SetGapOpenPenalty(-args["gapopen"].AsInteger());
398  opts->SetGapExtendPenalty(-args["gapextend"].AsInteger());
399  opts->SetEndGapOpenPenalty(-args["end_gapopen"].AsInteger());
400  opts->SetEndGapExtendPenalty(-args["end_gapextend"].AsInteger());
401  opts->SetScoreMatrixName(args["matrix"].AsString());
403  // RPS Blast parameters
404  if (args["rpsdb"]) {
405  opts->SetRpsDb(args["rpsdb"].AsString());
407  // Check whether RPS database and auxiliary files exist
    // (<dbname>.rps, <dbname>.blocks and <dbname>.freq)
408  const string dbname = args["rpsdb"].AsString();
409  CFile rps(dbname + ".rps");
410  if (!rps.Exists()) {
411  NcbiCerr << "Error: RPS database file: " << dbname + ".rps"
412  << " is missing" << NcbiEndl;
413  return 1;
414  }
416  CFile blocks(dbname + ".blocks");
417  if (!blocks.Exists()) {
418  NcbiCerr << "Error: RPS block file: " << dbname + ".blocks"
419  << " is missing" << NcbiEndl;
420  return 1;
421  }
423  CFile freq(dbname + ".freq");
424  if (!freq.Exists()) {
425  NcbiCerr << "Error: RPS frequencies file: " << dbname + ".freq"
426  << " is missing" << NcbiEndl;
427  return 1;
428  }
429  }
430  opts->SetRpsEvalue(args["rps_evalue"].AsDouble());
431  opts->SetDomainResFreqBoost(args["dfb"].AsDouble());
432  opts->SetDomainHitlistSize(args["num_domain_hits"].AsInteger());
434  // Blastp parameters
435  opts->SetBlastpEvalue(args["blast_evalue"].AsDouble());
436  opts->SetLocalResFreqBoost(args["ffb"].AsDouble());
438  // Patterns
439  if (args["p"]) {
440  x_LoadPatterns(args["p"].AsString(), opts->SetCddPatterns());
441  }
443  // User constraints
444  if (args["c"]) {
445  x_LoadConstraints(args["c"].AsString(), opts->SetUserConstraints());
446  }
448  // Progressive alignment params
    // translate the -treemethod string into the corresponding enum value
450  if (args["treemethod"].AsString() == "clust") {
451  tree_method = CMultiAlignerOptions::eClusters;
452  }
453  else if (args["treemethod"].AsString() == "nj") {
454  tree_method = CMultiAlignerOptions::eNJ;
455  }
456  else if (args["treemethod"].AsString() == "fastme") {
457  tree_method = CMultiAlignerOptions::eFastME;
458  }
459  else {
460  NcbiCerr << "Error: Incorrect tree method";
461  return 1;
462  }
463  opts->SetTreeMethod(tree_method);
465  // Iterative alignment params
466  opts->SetIterate(args["iter"].AsBoolean());
467  opts->SetConservedCutoffScore(args["ccc"].AsDouble());
468  opts->SetPseudocount(args["pseudo"].AsDouble());
470  // Query clustering params
471  opts->SetUseQueryClusters(args["clusters"].AsBoolean());
472  opts->SetKmerLength(args["k"].AsInteger());
473  opts->SetMaxInClusterDist(args["max_dist"].AsDouble());
    // NOTE(review): the assignments to `alph` inside the branches below are
    // not visible in this listing.
477  if (args["alph"]) {
478  if (args["alph"].AsString() == "regular") {
480  }
481  else if (args["alph"].AsString() == "se-v10") {
483  }
484  else if (args["alph"].AsString() == "se-b15") {
486  }
487  }
488  opts->SetKmerAlphabet(alph);
490  // not an option of the application
491  opts->SetInClustAlnMethod(args["clusters"].AsBoolean()
496  // set pre-computed domain hits
    // the archive is read from the -domain_hits file as ASN.1 text
497  if (args["domain_hits"]) {
499  args["domain_hits"].AsInputFile() >> MSerial_AsnText >> *archive;
500  opts->SetDomainHits(archive);
501  }
503  // Verbose level
504  opts->SetVerbose(args["v"]);
506  // Validate options and print warning messages if any
    // (a failed validation only prints warnings here; execution continues)
507  if (!opts->Validate()) {
508  ITERATE(vector<string>, it, opts->GetMessages()) {
509  NcbiCerr << "Warning: " << *it << NcbiEndl;
510  }
511  }
513  CMultiAligner aligner(opts);
515  vector< CRef<objects::CSeq_loc> > queries;
517  scope->AddDefaults();
    // NOTE(review): the body of the following branch is not visible in this
    // listing.
521  if (!args["parse_deflines"]) {
523  }
525  // if aligning a set of sequences
526  if (args["i"]) {
527  GetSeqLocFromStream(args["i"].AsInputFile(), queries, scope, flags);
529  _ASSERT(!scope.Empty());
530  aligner.SetQueries(queries, scope);
532  m_UsageReport.AddParam(blast::CBlastUsageReport::eNumQueries,
533  (int)queries.size());
534  }
535  else {
537  // aligning two MSAs
539  objects::CSeqIdGenerator id_generator;
540  // this flag sets validation of the read MSA
544  args["in_msa1"].AsInputFile(),
545  scope,
546  flags,
547  id_generator);
550  args["in_msa2"].AsInputFile(),
551  scope,
552  flags,
553  id_generator);
555  _ASSERT(!scope.Empty());
    // parse the comma-separated representative indices; num1 / num2 count
    // the tokens so that duplicates can be detected against the set sizes
557  set<int> repr1, repr2;
558  size_t num1 = 0, num2 = 0;
559  CTempString delim(",");
560  if (args["ind1"]) {
561  list<string> tokens;
562  NStr::Split((CTempString)args["ind1"].AsString(), delim, tokens,
564  ITERATE (list<string>, it, tokens) {
565  repr1.insert(NStr::StringToInt(*it));
566  num1++;
567  }
568  }
569  if (args["ind2"]) {
570  list<string> tokens;
571  NStr::Split((CTempString)args["ind2"].AsString(), delim, tokens,
573  ITERATE (list<string>, it, tokens) {
574  repr2.insert(NStr::StringToInt(*it));
575  num2++;
576  }
577  }
579  // indices of sequence representatives in MSAs must be unique
580  if (num1 != repr1.size() || num2 != repr2.size()) {
581  NcbiCerr << "Error: Non-unique indeces of input sequence "
582  << "representatives"
583  << NcbiEndl;
585  return 1;
586  }
588  aligner.SetInputMSAs(*msa1, *msa2, repr1, repr2, scope);
589  }
591  // write error and/or warning messages
    // all aligner messages get the prefix "Error: " when the run failed and
    // "Warning: " otherwise
592  CMultiAligner::TStatus status = aligner.Run();
593  string msg = status != CMultiAligner::eSuccess ? "Error: " : "Warning: ";
594  ITERATE(vector<string>, it, aligner.GetMessages()) {
595  NcbiCerr << msg << *it << NcbiEndl;
596  }
598  // If aligner returns with error status then exit
599  if (status != CMultiAligner::eSuccess) {
600  return status;
601  }
603  sequence::CDeflineGenerator defline_gen;
    // with -outfmt the alignment is written through CMultiAlnPrinter
    // NOTE(review): the format-selecting statements inside the branches
    // below are not visible in this listing.
605  if (args["outfmt"]) {
606  CMultiAlnPrinter printer(*aligner.GetResults(), *aligner.GetScope(),
608  printer.SetWidth(80);
609  printer.SetGapChar('-');
610  printer.SetEndGapChar('-');
611  if (args["outfmt"].AsString() == "mfasta") {
613  }
614  else if (args["outfmt"].AsString() == "clustalw") {
616  }
617  else if (args["outfmt"].AsString() == "phylip") {
619  }
620  else if (args["outfmt"].AsString() == "nexus") {
622  }
624  printer.Print(NcbiCout);
625  }
626  else {
627  // default format is fasta with one sequence per line
628  const vector<CSequence>& results(aligner.GetSeqResults());
629  CRef<CSeq_align> align = aligner.GetResults();
630  for (int i = 0; i < (int)results.size(); i++) {
632  CBioseq_Handle bhandle = scope->GetBioseqHandle(
633  align->GetSeq_id(i),
636  // try to recreate the defline for parsed Seq-ids
637  if (args["parse_deflines"]) {
638  // if Seq-id is local then, do not print Seq-id type
639  const CSeq_id& id = align->GetSeq_id(i);
640  if (id.IsLocal()) {
641  string label;
642  id.GetLabel(&label, CSeq_id::eContent);
643  printf(">%s", label.c_str());
644  }
645  else {
646  // for non-local Seq-ids print all ids
    // ids are joined with '|' (no separator after the last one)
647  const vector<CSeq_id_Handle>& ids = bhandle.GetId();
648  printf(">");
649  ITERATE (vector<CSeq_id_Handle>, it, ids) {
650  const string id_str = it->GetSeqId()->AsFastaString();
651  printf("%s", id_str.c_str());
652  if (it + 1 != ids.end()) {
653  printf("|");
654  }
655  }
657  }
658  // do not print 'unnamed protein product' for empty title
659  string title = defline_gen.GenerateDefline(bhandle);
660  if (title != "unnamed protein product") {
661  printf(" %s", title.c_str());
662  }
663  printf("\n");
664  }
665  else {
666  printf(">%s\n", defline_gen.GenerateDefline(bhandle).c_str());
667  }
    // the aligned sequence itself, one printable letter per position
669  for (int j = 0; j < results[i].GetLength(); j++) {
670  printf("%c", results[i].GetPrintableLetter(j));
671  }
672  printf("\n");
673  }
674  }
    // optionally also write the resulting Seq-align as ASN.1 text
676  if (args["seqalign"]) {
677  CRef<CSeq_align> sa = aligner.GetResults();
678  CNcbiOstream& out = args["seqalign"].AsOutputFile();
679  out << MSerial_AsnText << *sa;
680  }
    // only the successful path records an exit status in the usage report
682  m_UsageReport.AddParam(blast::CBlastUsageReport::eExitStatus, 0);
683  return 0;
684 }
687 {
    // NOTE(review): the signature line is not visible in this listing
    // (presumably CMultiApplication::Exit).
    // Passes 0 to SetDiagStream(), presumably detaching the diagnostic
    // output stream on shutdown -- confirm against the toolkit documentation.
688  SetDiagStream(0);
689 }
// Program entry point: constructs a temporary CMultiApplication and returns
// the value produced by its AppMain() call. Besides argc/argv the call passes
// 0, eDS_Default and an empty string (presumably the environment pointer,
// the diagnostic stream selector and the configuration file name -- confirm
// against the AppMain() declaration).
691 int main(int argc, const char* argv[])
692 {
693  return CMultiApplication().AppMain(argc, argv, 0, eDS_Default, "");
694 }
