NCBI C++ ToolKit
cobalt_app.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's offical duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25 
26 /*****************************************************************************
27 
28 File name: cobalt_app.cpp
29 
30 Author: Jason Papadopoulos
31 
32 Contents: C++ driver for COBALT multiple alignment algorithm
33 
34 ******************************************************************************/
35 
36 #include <ncbi_pch.hpp>
37 #include <corelib/ncbiapp.hpp>
38 #include <corelib/ncbifile.hpp>
39 #include <corelib/ncbitime.hpp>
43 #include <serial/iterator.hpp>
48 
50 
52 #include <algo/cobalt/cobalt.hpp>
53 #include <algo/cobalt/version.hpp>
54 
55 #include "cobalt_app_util.hpp"
56 
59 USING_SCOPE(align_format);
60 USING_SCOPE(cobalt);
61 
63 {
64 public:
67  version->SetVersionInfo(new CMultiAlignerVersion());
69 
71  if (m_UsageReport.IsEnabled()) {
72  m_UsageReport.AddParam(blast::CBlastUsageReport::eVersion,
73  GetVersion().Print());
74  m_UsageReport.AddParam(blast::CBlastUsageReport::eProgram,
75  (string)"cobalt");
76  }
77  }
78 
80  m_UsageReport.AddParam(blast::CBlastUsageReport::eRunTime, m_StopWatch.Elapsed());
81  }
82 
83 private:
84  virtual void Init(void);
85  virtual int Run(void);
86  virtual void Exit(void);
87 
89  blast::CBlastUsageReport m_UsageReport;
91 };
92 
93 // Get tree computation method as string that can be used to initialize
94 // default command line option value
96 {
97  switch (method) {
98  case CMultiAlignerOptions::eFastME : return "fastme";
99  case CMultiAlignerOptions::eNJ : return "nj";
100  case CMultiAlignerOptions::eClusters : return "clust";
101  default: return "";
102  }
103 }
104 
105 // Get k-mer alphabet as string that can be used to initialize
106 // default command line option value
109 {
110  switch (alph) {
111  case CMultiAligner::TKMethods::eRegular : return "regular";
112  case CMultiAligner::TKMethods::eSE_V10 : return "se-v10";
113  case CMultiAligner::TKMethods::eSE_B15 : return "se-b15";
114  default : return "";
115  }
116 }
117 
119 {
121  | fHideDryRun);
122 
123  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
124 
125  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
126  "COBALT multiple sequence alignment utility");
127 
128  // Input sequences
129  arg_desc->SetCurrentGroup("Input");
130  arg_desc->AddOptionalKey("i", "infile", "File containing input sequences "
131  "in FASTA format", CArgDescriptions::eInputFile);
132 
133  arg_desc->AddOptionalKey("in_msa1", "infile", "File containing input "
134  "alignment in FASTA format",
136 
137  arg_desc->AddOptionalKey("in_msa2", "infile", "File containing input "
138  "alignment in FASTA format",
140 
141  arg_desc->AddOptionalKey("ind1", "numbers", "Coma separated list of "
142  "sequence indices in MSA1 to be used for "
143  "constraints generation",
145 
146  arg_desc->AddOptionalKey("ind2", "numbers", "Coma separated list of "
147  "sequence indices in MSA2 to be used for "
148  "constraints generation",
150 
151  arg_desc->SetDependency("i", CArgDescriptions::eExcludes, "in_msa1");
152  arg_desc->SetDependency("i", CArgDescriptions::eExcludes, "in_msa2");
153  arg_desc->SetDependency("i", CArgDescriptions::eExcludes, "ind1");
154  arg_desc->SetDependency("i", CArgDescriptions::eExcludes, "ind2");
155 
156  arg_desc->SetDependency("in_msa1", CArgDescriptions::eRequires, "in_msa2");
157  arg_desc->SetDependency("in_msa2", CArgDescriptions::eRequires, "in_msa1");
158 
159  arg_desc->SetDependency("ind1", CArgDescriptions::eRequires, "in_msa1");
160  arg_desc->SetDependency("ind2", CArgDescriptions::eRequires, "in_msa2");
161 
162  arg_desc->AddFlag("parse_deflines", "Should the sequence deflines be "
163  "parsed?");
164 
165 
166  // Conserved domain options
167  arg_desc->SetCurrentGroup("Conserved domain options");
168  arg_desc->AddOptionalKey("rpsdb", "database", "Conserved domain database "
169  "name\nEither database or -norps option must be "
170  "specified", CArgDescriptions::eString);
171  arg_desc->AddDefaultKey("norps", "norps", "Do not perform initial "
172  "RPS-BLAST search",
174  arg_desc->AddDefaultKey("rps_evalue", "evalue",
175  "E-value threshold for selecting conserved domains"
176  " from results of RPS-BLAST search",
179  arg_desc->AddDefaultKey("num_domain_hits", "number", "Maximum number of "
180  "of domain hits for each sequence",
183  arg_desc->AddOptionalKey("p", "patternfile",
184  "Filename containing regular expression patterns "
185  "for conserved domains",
187  arg_desc->AddDefaultKey("dfb", "domain_res_boost",
188  "When assigning domain residue frequencies, the amount of "
189  "extra weight (0..1) to give to the actual sequence letter "
190  "at that position",
193 
194  arg_desc->AddOptionalKey("domain_hits", "infile", "Results of pre-computed"
195  " domain search in BLAST archive format",
197 
198  arg_desc->SetDependency("domain_hits", CArgDescriptions::eRequires, "rpsdb");
199  arg_desc->SetDependency("domain_hits", CArgDescriptions::eRequires,
200  "parse_deflines");
201 
202 
203  // User constraints options
204  arg_desc->SetCurrentGroup("Constraints options");
205  arg_desc->AddOptionalKey("c", "constraintfile",
206  "Filename containing pairwise alignment constraints, "
207  "one per line, each represented by 6 integers:\n"
208  " -zero-based index of sequence 1 in the input file\n"
209  " -zero-based start position in sequence 1\n"
210  " -zero-based stop position in sequence 1\n"
211  " -zero-based index of sequence 2 in the input file\n"
212  " -zero-based start position in sequence 2\n"
213  " -zero-based stop position in sequence 2\n",
215 
216 
217  // Multiple alignment options
218  arg_desc->SetCurrentGroup("Multiple alignment options");
219  arg_desc->AddDefaultKey("treemethod", "method",
220  "Method for generating progressive alignment guide tree",
223  arg_desc->SetConstraint("treemethod", &(*new CArgAllow_Strings,
224  "clust", "nj", "fastme"));
225  arg_desc->AddDefaultKey("iter", "iterate",
226  "After the first iteration search for conserved columns "
227  "and realign if any are found",
229  arg_desc->AddDefaultKey("ccc", "conserved_cutoff",
230  "Minimum average score needed for a multiple alignment "
231  "column to be considered as conserved",
234  arg_desc->AddDefaultKey("pseudo", "pseudocount",
235  "Pseudocount constant",
238  arg_desc->AddDefaultKey("ffb", "filler_res_boost",
239  "When assigning filler residue frequencies, the amount of "
240  "extra weight (0..1) to give to the actual sequence letter "
241  "at that position",
244 
245 
246  // Pairwise alignment options
247  arg_desc->SetCurrentGroup("Pairwise alignment options");
248  arg_desc->AddDefaultKey("matrix", "matrix",
249  "Score matrix to use",
251  arg_desc->AddDefaultKey("end_gapopen", "penalty",
252  "Gap open penalty for terminal gaps",
255  arg_desc->AddDefaultKey("end_gapextend", "penalty",
256  "Gap extend penalty for terminal gaps",
259  arg_desc->AddDefaultKey("gapopen", "penalty",
260  "Gap open penalty for internal gaps",
263  arg_desc->AddDefaultKey("gapextend", "penalty",
264  "Gap extend penalty for internal gaps",
267  arg_desc->AddDefaultKey("blast_evalue", "evalue",
268  "E-value threshold for selecting segments matched "
269  "by BLASTP",
272 
273 
274  // Query clustering options
275  arg_desc->SetCurrentGroup("Query clustering options");
276  arg_desc->AddDefaultKey("clusters", "clusters",
277  "Use query clustering for faster alignment",
279  arg_desc->AddDefaultKey("k", "length",
280  "K-mer length for query clustering",
283  arg_desc->AddDefaultKey("max_dist", "distance",
284  "Maximum allowed distance between sequences in a cluster"
285  " (0..1)",
288  arg_desc->AddDefaultKey("alph", "name",
289  "Alphabet for used k-mer counting",
292  arg_desc->SetConstraint("alph", &(*new CArgAllow_Strings, "regular",
293  "se-v10", "se-b15"));
294 
295 
296  // Output options
297  arg_desc->SetCurrentGroup("Output options");
298  arg_desc->AddOptionalKey("seqalign", "file",
299  "Output text seqalign to specified file",
301  arg_desc->AddOptionalKey("outfmt", "format", "Output format for multiple "
302  "alignment", CArgDescriptions::eString);
303  arg_desc->SetConstraint("outfmt", &(*new CArgAllow_Strings, "mfasta",
304  "clustalw", "phylip", "nexus"));
305  arg_desc->AddFlag("v", "Verbose output");
306 
307 
308  SetupArgDescriptions(arg_desc.release());
309 }
310 
311 
312 static void
313 x_LoadConstraints(string constraintfile,
314  vector<CMultiAlignerOptions::SConstraint>& constr)
315 {
316  CNcbiIfstream f(constraintfile.c_str());
317  if (f.bad() || f.fail())
318  NCBI_THROW(CMultiAlignerException, eInvalidInput,
319  "Cannot open file with pairwise constraints");
320 
321  int seq1, seq1_start, seq1_end;
322  int seq2, seq2_start, seq2_end;
323 
324  constr.clear();
325 
326  f >> seq1 >> seq1_start >> seq1_end;
327  f >> seq2 >> seq2_start >> seq2_end;
329  c(seq1, seq1_start, seq1_end, seq2, seq2_start, seq2_end);
330  constr.push_back(c);
331 
332  while (!f.eof()) {
333  seq1 = -1;
334 
335  f >> seq1 >> seq1_start >> seq1_end;
336  f >> seq2 >> seq2_start >> seq2_end;
337  if (seq1 >= 0) {
338  constr.push_back(CMultiAlignerOptions::SConstraint(seq1,
339  seq1_start, seq1_end, seq2, seq2_start, seq2_end));
340  }
341  }
342 }
343 
344 
345 static void
346 x_LoadPatterns(string patternsfile,
347  vector<CMultiAlignerOptions::CPattern>& patterns)
348 {
349  CNcbiIfstream f(patternsfile.c_str());
350  if (f.bad() || f.fail())
351  NCBI_THROW(CMultiAlignerException, eInvalidInput,
352  "Cannot open patterns file");
353 
354  patterns.clear();
355 
356  while (!f.eof()) {
357  string single_pattern;
358 
359  f >> single_pattern;
360 
361  if (!single_pattern.empty()) {
362  patterns.push_back(single_pattern);
363  }
364  }
365 }
366 
367 
369 {
370  // Allow the fasta reader to complain on
371  // invalid sequence input
373 
374  // Process command line args
375  const CArgs& args = GetArgs();
376 
377 
378  if (args["rpsdb"] && args["norps"].AsBoolean()) {
379  NCBI_THROW(CMultiAlignerException, eInvalidInput,
380  "The options -rpsdb and -norps T are mutually exclusive");
381  }
382 
383  if (!args["rpsdb"] && !args["norps"].AsBoolean()) {
384  NCBI_THROW(CMultiAlignerException, eInvalidInput,
385  "RPS dababase not specified");
386  }
387 
388 
389  // Set up data loaders
391 
395 
396  // PSSM aligner parameters
397  opts->SetGapOpenPenalty(-args["gapopen"].AsInteger());
398  opts->SetGapExtendPenalty(-args["gapextend"].AsInteger());
399  opts->SetEndGapOpenPenalty(-args["end_gapopen"].AsInteger());
400  opts->SetEndGapExtendPenalty(-args["end_gapextend"].AsInteger());
401  opts->SetScoreMatrixName(args["matrix"].AsString());
402 
403  // RPS Blast parameters
404  if (args["rpsdb"]) {
405  opts->SetRpsDb(args["rpsdb"].AsString());
406 
407  // Check whether RPS database and auxiliary files exist
408  const string dbname = args["rpsdb"].AsString();
409  CFile rps(dbname + ".rps");
410  if (!rps.Exists()) {
411  NcbiCerr << "Error: RPS database file: " << dbname + ".rps"
412  << " is missing" << NcbiEndl;
413  return 1;
414  }
415 
416  CFile blocks(dbname + ".blocks");
417  if (!blocks.Exists()) {
418  NcbiCerr << "Error: RPS block file: " << dbname + ".blocks"
419  << " is missing" << NcbiEndl;
420  return 1;
421  }
422 
423  CFile freq(dbname + ".freq");
424  if (!freq.Exists()) {
425  NcbiCerr << "Error: RPS frequencies file: " << dbname + ".freq"
426  << " is missing" << NcbiEndl;
427  return 1;
428  }
429  }
430  opts->SetRpsEvalue(args["rps_evalue"].AsDouble());
431  opts->SetDomainResFreqBoost(args["dfb"].AsDouble());
432  opts->SetDomainHitlistSize(args["num_domain_hits"].AsInteger());
433 
434  // Blastp parameters
435  opts->SetBlastpEvalue(args["blast_evalue"].AsDouble());
436  opts->SetLocalResFreqBoost(args["ffb"].AsDouble());
437 
438  // Patterns
439  if (args["p"]) {
440  x_LoadPatterns(args["p"].AsString(), opts->SetCddPatterns());
441  }
442 
443  // User constraints
444  if (args["c"]) {
445  x_LoadConstraints(args["c"].AsString(), opts->SetUserConstraints());
446  }
447 
448  // Progressive alignment params
450  if (args["treemethod"].AsString() == "clust") {
451  tree_method = CMultiAlignerOptions::eClusters;
452  }
453  else if (args["treemethod"].AsString() == "nj") {
454  tree_method = CMultiAlignerOptions::eNJ;
455  }
456  else if (args["treemethod"].AsString() == "fastme") {
457  tree_method = CMultiAlignerOptions::eFastME;
458  }
459  else {
460  NcbiCerr << "Error: Incorrect tree method";
461  return 1;
462  }
463  opts->SetTreeMethod(tree_method);
464 
465  // Iterative alignment params
466  opts->SetIterate(args["iter"].AsBoolean());
467  opts->SetConservedCutoffScore(args["ccc"].AsDouble());
468  opts->SetPseudocount(args["pseudo"].AsDouble());
469 
470  // Query clustering params
471  opts->SetUseQueryClusters(args["clusters"].AsBoolean());
472  opts->SetKmerLength(args["k"].AsInteger());
473  opts->SetMaxInClusterDist(args["max_dist"].AsDouble());
474 
477  if (args["alph"]) {
478  if (args["alph"].AsString() == "regular") {
480  }
481  else if (args["alph"].AsString() == "se-v10") {
483  }
484  else if (args["alph"].AsString() == "se-b15") {
486  }
487  }
488  opts->SetKmerAlphabet(alph);
489 
490  // not option of the application
491  opts->SetInClustAlnMethod(args["clusters"].AsBoolean()
494 
495 
496  // set pre-computed domain hits
497  if (args["domain_hits"]) {
499  args["domain_hits"].AsInputFile() >> MSerial_AsnText >> *archive;
500  opts->SetDomainHits(archive);
501  }
502 
503  // Verbose level
504  opts->SetVerbose(args["v"]);
505 
506  // Validate options and print warning messages if any
507  if (!opts->Validate()) {
508  ITERATE(vector<string>, it, opts->GetMessages()) {
509  NcbiCerr << "Warning: " << *it << NcbiEndl;
510  }
511  }
512 
513  CMultiAligner aligner(opts);
514 
515  vector< CRef<objects::CSeq_loc> > queries;
517  scope->AddDefaults();
518 
521  if (!args["parse_deflines"]) {
523  }
524 
525  // if aligning a set of sequences
526  if (args["i"]) {
527  GetSeqLocFromStream(args["i"].AsInputFile(), queries, scope, flags);
528 
529  _ASSERT(!scope.Empty());
530  aligner.SetQueries(queries, scope);
531 
532  m_UsageReport.AddParam(blast::CBlastUsageReport::eNumQueries,
533  (int)queries.size());
534  }
535  else {
536 
537  // aligning two MSAs
538 
539  objects::CSeqIdGenerator id_generator;
540  // this flag sets validation of the read MSA
542 
544  args["in_msa1"].AsInputFile(),
545  scope,
546  flags,
547  id_generator);
548 
550  args["in_msa2"].AsInputFile(),
551  scope,
552  flags,
553  id_generator);
554 
555  _ASSERT(!scope.Empty());
556 
557  set<int> repr1, repr2;
558  size_t num1 = 0, num2 = 0;
559  CTempString delim(",");
560  if (args["ind1"]) {
561  list<string> tokens;
562  NStr::Split((CTempString)args["ind1"].AsString(), delim, tokens,
564  ITERATE (list<string>, it, tokens) {
565  repr1.insert(NStr::StringToInt(*it));
566  num1++;
567  }
568  }
569  if (args["ind2"]) {
570  list<string> tokens;
571  NStr::Split((CTempString)args["ind2"].AsString(), delim, tokens,
573  ITERATE (list<string>, it, tokens) {
574  repr2.insert(NStr::StringToInt(*it));
575  num2++;
576  }
577  }
578 
579  // indices of sequence representatives in MSAs must be unique
580  if (num1 != repr1.size() || num2 != repr2.size()) {
581  NcbiCerr << "Error: Non-unique indeces of input sequence "
582  << "representatives"
583  << NcbiEndl;
584 
585  return 1;
586  }
587 
588  aligner.SetInputMSAs(*msa1, *msa2, repr1, repr2, scope);
589  }
590 
591  // write error and/or warning messages
592  CMultiAligner::TStatus status = aligner.Run();
593  string msg = status != CMultiAligner::eSuccess ? "Error: " : "Warning: ";
594  ITERATE(vector<string>, it, aligner.GetMessages()) {
595  NcbiCerr << msg << *it << NcbiEndl;
596  }
597 
598  // If aligner returns with error status then exit
599  if (status != CMultiAligner::eSuccess) {
600  return status;
601  }
602 
603  sequence::CDeflineGenerator defline_gen;
604 
605  if (args["outfmt"]) {
606  CMultiAlnPrinter printer(*aligner.GetResults(), *aligner.GetScope(),
608  printer.SetWidth(80);
609  printer.SetGapChar('-');
610  printer.SetEndGapChar('-');
611  if (args["outfmt"].AsString() == "mfasta") {
613  }
614  else if (args["outfmt"].AsString() == "clustalw") {
616  }
617  else if (args["outfmt"].AsString() == "phylip") {
619  }
620  else if (args["outfmt"].AsString() == "nexus") {
622  }
623 
624  printer.Print(NcbiCout);
625  }
626  else {
627  // default format is fasta with one sequence per line
628  const vector<CSequence>& results(aligner.GetSeqResults());
629  CRef<CSeq_align> align = aligner.GetResults();
630  for (int i = 0; i < (int)results.size(); i++) {
631 
632  CBioseq_Handle bhandle = scope->GetBioseqHandle(
633  align->GetSeq_id(i),
635 
636  // try to recreate the defline for parsed Seq-ids
637  if (args["parse_deflines"]) {
638  // if Seq-id is local then, do not print Seq-id type
639  const CSeq_id& id = align->GetSeq_id(i);
640  if (id.IsLocal()) {
641  string label;
642  id.GetLabel(&label, CSeq_id::eContent);
643  printf(">%s", label.c_str());
644  }
645  else {
646  // for non-local Seq-ids print all ids
647  const vector<CSeq_id_Handle>& ids = bhandle.GetId();
648  printf(">");
649  ITERATE (vector<CSeq_id_Handle>, it, ids) {
650  const string id_str = it->GetSeqId()->AsFastaString();
651  printf("%s", id_str.c_str());
652  if (it + 1 != ids.end()) {
653  printf("|");
654  }
655  }
656 
657  }
658  // do not print 'unnamed protein product' for empty title
659  string title = defline_gen.GenerateDefline(bhandle);
660  if (title != "unnamed protein product") {
661  printf(" %s", title.c_str());
662  }
663  printf("\n");
664  }
665  else {
666  printf(">%s\n", defline_gen.GenerateDefline(bhandle).c_str());
667  }
668 
669  for (int j = 0; j < results[i].GetLength(); j++) {
670  printf("%c", results[i].GetPrintableLetter(j));
671  }
672  printf("\n");
673  }
674  }
675 
676  if (args["seqalign"]) {
677  CRef<CSeq_align> sa = aligner.GetResults();
678  CNcbiOstream& out = args["seqalign"].AsOutputFile();
679  out << MSerial_AsnText << *sa;
680  }
681 
682  m_UsageReport.AddParam(blast::CBlastUsageReport::eExitStatus, 0);
683  return 0;
684 }
685 
687 {
688  SetDiagStream(0);
689 }
690 
691 int main(int argc, const char* argv[])
692 {
693  return CMultiApplication().AppMain(argc, argv, 0, eDS_Default, "");
694 }
User-defined methods of the data storage class.
CMultiAligner version.
Data loader implementation that uses the blast databases.
BLAST usage report api.
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
CBioseq_Handle –.
CBlast4_archive –.
CFile –.
Definition: ncbifile.hpp:1604
Options and parameters for multiple alignment.
Definition: options.hpp:95
ETreeMethod
Method for construction of guide tree for progressive alignment.
Definition: options.hpp:258
@ eNJ
Neighbor Joining.
Definition: options.hpp:259
@ eFastME
Fast Minimum Evolution.
Definition: options.hpp:260
@ eClusters
Clustering dendrogram.
Definition: options.hpp:261
void SetDomainHitlistSize(int size)
Set hitlist size (per sequence) for domain search.
Definition: options.hpp:495
void SetKmerAlphabet(TKMethods::ECompressedAlphabet alph)
Set alphabet for creating word count vectors.
Definition: options.hpp:444
void SetKmerLength(int len)
Set word size for creating word count vectors in query clustering.
Definition: options.hpp:434
void SetEndGapOpenPenalty(TScore penalty)
Set gap opening penalty for end gaps in pairwise global alignment of profiles.
Definition: options.hpp:638
void SetScoreMatrixName(const string &matrix)
Set alignment score matrix name.
Definition: options.hpp:600
void SetLocalResFreqBoost(double boost)
Set frequency boost for a letter that appears in query sequence in given position.
Definition: options.hpp:585
bool Validate(void)
Validate parameter values.
Definition: options.cpp:86
void SetVerbose(bool verbose)
Set verbose mode.
Definition: options.hpp:683
void SetMaxInClusterDist(double dist)
Set maximum allowed distance between sequences in a cluster.
Definition: options.hpp:470
void SetBlastpEvalue(double evalue)
Set e-value for accepting Blastp hits.
Definition: options.hpp:534
void SetUseQueryClusters(bool use)
Set use of query clustering option.
Definition: options.cpp:75
void SetDomainHits(CConstRef< objects::CBlast4_archive > archive)
Set pre-computed domain hits.
Definition: options.hpp:702
void SetTreeMethod(ETreeMethod method)
Set method for creating tree that guides progressive alignment.
Definition: options.hpp:573
const vector< string > & GetMessages(void)
Get warning messages.
Definition: options.hpp:762
void SetGapOpenPenalty(TScore penalty)
Set gap opening penalty for middle gaps in pairwise global alignment of profiles.
Definition: options.hpp:612
void SetRpsDb(const string &dbname)
Use RPS Blast with given database.
Definition: options.hpp:355
void SetConservedCutoffScore(double score)
Set cutoff score for conserved aligned columns.
Definition: options.hpp:548
TConstraints & SetUserConstraints(void)
Set user constraints.
Definition: options.hpp:403
vector< CPattern > & SetCddPatterns(void)
Set regular expression patterns for identification of conserved domains.
Definition: options.hpp:379
void SetInClustAlnMethod(EInClustAlnMethod method)
Definition: options.hpp:693
void SetIterate(bool use)
Set use of iterative alignment option.
Definition: options.hpp:322
@ fNoPatterns
Do not use conserved domain patterns.
Definition: options.hpp:234
@ fNoRpsBlast
Do not use RPS Blast.
Definition: options.hpp:231
void SetPseudocount(double pseudocount)
Set pseudocount for calculating column entropy.
Definition: options.hpp:559
void SetRpsEvalue(double evalue)
Set e-value threshold for accepting RPS Blast hits.
Definition: options.hpp:484
void SetDomainResFreqBoost(double boost)
Set boost for residue frequencies in conserved domains from RPS data base.
Definition: options.hpp:507
@ eMulti
Alignment guide tree for each cluster is attached to the main alignment guide tree.
Definition: options.hpp:269
@ eNone
No clustering.
Definition: options.hpp:265
void SetGapExtendPenalty(TScore penalty)
Set gap extension penalty for middle gaps in pairwise global alignment of profiles.
Definition: options.hpp:625
void SetEndGapExtendPenalty(TScore penalty)
Set gap extension penalty for end gaps in pairwise global alignment of profiles.
Definition: options.hpp:651
Keeps track of CMultiAligner version.
Definition: version.hpp:52
Simultaneously align multiple protein sequences.
Definition: cobalt.hpp:69
const vector< CSequence > & GetSeqResults(void) const
Retrieve the current aligned results in CSequence format.
Definition: cobalt.hpp:240
CRef< objects::CSeq_align > GetResults(void) const
Retrieve the current aligned results in Seq-align format.
Definition: seqalign.cpp:157
TStatus Run(void)
Align the current set of input sequences (reset any existing alignment information).
Definition: cobalt.cpp:683
const vector< string > & GetMessages(void) const
Get Error/Warning messages.
Definition: cobalt.hpp:284
@ eSuccess
Alignment successfully completed.
Definition: cobalt.hpp:78
void SetInputMSAs(const objects::CSeq_align &msa1, const objects::CSeq_align &msa2, const set< int > &representatives1, const set< int > &representatives2, CRef< objects::CScope > scope)
Set input alignments.
Definition: cobalt.cpp:297
void SetQueries(const vector< CRef< objects::CSeq_loc > > &queries, CRef< objects::CScope > scope)
Set query sequences.
Definition: cobalt.cpp:194
CRef< objects::CScope > GetScope(void)
Get scope.
Definition: cobalt.hpp:188
Printer for popular multiple alignment formats.
Definition: aln_printer.hpp:51
void Print(CNcbiOstream &ostr)
Print alignment.
void SetEndGapChar(unsigned char gap)
Set end gap character.
Definition: aln_printer.hpp:99
void SetFormat(EFormat format)
Set format for printing alignment.
Definition: aln_printer.hpp:89
void SetGapChar(unsigned char gap)
Set gap character.
Definition: aln_printer.hpp:94
void SetWidth(int width)
Set text width (number of columns) for alignment output.
Definition: aln_printer.hpp:84
blast::CBlastUsageReport m_UsageReport
Definition: cobalt_app.cpp:89
virtual void Exit(void)
Cleanup on application exit.
Definition: cobalt_app.cpp:686
virtual void Init(void)
Initialize the application.
Definition: cobalt_app.cpp:118
virtual int Run(void)
Run the application.
Definition: cobalt_app.cpp:368
CStopWatch m_StopWatch
Definition: cobalt_app.cpp:90
CRef< CObjectManager > m_ObjMgr
Definition: cobalt_app.cpp:88
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
CStopWatch –.
Definition: ncbitime.hpp:1937
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
size_type size() const
Definition: set.hpp:132
Interface for CMultiAligner.
USING_SCOPE(objects)
static void x_LoadConstraints(string constraintfile, vector< CMultiAlignerOptions::SConstraint > &constr)
Definition: cobalt_app.cpp:313
string s_GetTreeMethodAsString(CMultiAlignerOptions::ETreeMethod method)
Definition: cobalt_app.cpp:95
string s_GetKmerAlphabetAsString(CMultiAlignerOptions::TKMethods::ECompressedAlphabet alph)
Definition: cobalt_app.cpp:107
int main(int argc, const char *argv[])
Definition: cobalt_app.cpp:691
static void x_LoadPatterns(string patternsfile, vector< CMultiAlignerOptions::CPattern > &patterns)
Definition: cobalt_app.cpp:346
USING_NCBI_SCOPE
Definition: cobalt_app.cpp:57
void GetSeqLocFromStream(CNcbiIstream &instream, vector< CRef< objects::CSeq_loc > > &seqs, CRef< objects::CScope > &scope, objects::CFastaReader::TFlags flags)
Reads fasta sequences from stream, adds them to scope, and returns them as the list of Seq_locs.
CRef< objects::CSeq_align > GetAlignmentFromStream(CNcbiIstream &instream, CRef< objects::CScope > &scope, objects::CFastaReader::TFlags flags, objects::CSeqIdGenerator &id_generator)
Reads fasta sequences as multiple sequence alignment.
void Print(const CCompactSAMApplication::AlignInfo &ai)
API (CDeflineGenerator) for computing sequences' titles ("definitions").
static uch flags
std::ofstream out("events_result.xml")
main entry point for tests
Operators to edit gaps in sequences.
void SetFullVersion(CRef< CVersionAPI > version)
Set version data for the program.
Definition: ncbiapp.cpp:1174
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
Definition: ncbiapp.cpp:1312
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:819
CVersionInfo GetVersion(void) const
Get the program version information.
Definition: ncbiapp.cpp:1184
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1195
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ fHideXmlHelp
Hide XML help description.
@ fHideLogfile
Hide log file description.
@ fHideFullVersion
Hide full version description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
@ eRequires
One argument requires another.
Definition: ncbiargs.hpp:956
@ eExcludes
One argument excludes another.
Definition: ncbiargs.hpp:957
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eBoolean
{'true', 't', 'false', 'f'}, case-insensitive
Definition: ncbiargs.hpp:590
@ eDouble
Convertible into a floating point number (double)
Definition: ncbiargs.hpp:594
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
Definition: ncbidiag.cpp:6129
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
Definition: ncbidiag.cpp:8083
@ eDS_Default
Try standard log file (app.name + ".log") in /log/, use stderr on failure.
Definition: ncbidiag.hpp:1790
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
virtual bool Exists(void) const
Check existence of file.
Definition: ncbifile.hpp:4038
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
long TFlags
binary OR of EFlags
Definition: fasta.hpp:117
@ fNoParseID
Generate an ID (whole defline -> title)
Definition: fasta.hpp:90
@ fForceType
Force specified type regardless of accession.
Definition: fasta.hpp:89
@ fParseRawID
Try to identify raw accessions.
Definition: fasta.hpp:97
@ fValidate
Check (alphabetic) residue validity.
Definition: fasta.hpp:100
@ fAssumeProt
Assume prots unless accns indicate otherwise.
Definition: fasta.hpp:88
@ eContent
Untagged human-readable accession or the like.
Definition: Seq_id.hpp:605
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
@ eGetBioseq_All
Search bioseq, load if not loaded yet.
Definition: scope.hpp:128
const TId & GetId(void) const
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
#define NcbiEndl
Definition: ncbistre.hpp:548
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
#define NcbiCout
Definition: ncbistre.hpp:543
#define NcbiCerr
Definition: ncbistre.hpp:544
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
Definition: ncbistr.hpp:5187
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
Definition: ncbitime.hpp:2775
void Start(void)
Start the timer.
Definition: ncbitime.hpp:2764
#define CVersion
static const char label[]
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
static int version
Definition: mdb_load.c:29
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
Defines: CTimeFormat - storage class for time format.
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
The Object manager core.
#define COBALT_GAP_EXTNT
Gap extension score.
Definition: options.hpp:79
#define COBALT_KMER_LEN
K-mer length for sequence clustering.
Definition: options.hpp:84
#define COBALT_END_GAP_OPEN
End gap opening score.
Definition: options.hpp:73
#define COBALT_PSEUDO_COUNT
Pseudocount constant used in multiple alignment.
Definition: options.hpp:62
#define COBALT_CONSERVED_CUTOFF
Conservation score cutoff used for selecting conserved columns in initial MSA.
Definition: options.hpp:65
#define COBALT_LOCAL_BOOST
Weight for sequence residues when creating MSA profiles.
Definition: options.hpp:59
#define COBALT_BLAST_EVALUE
Blastp e-value cutoff for creating contraints.
Definition: options.hpp:57
#define COBALT_RPS_EVALUE
Default values for cobalt parameters. Rps-Blast e-value cutoff for creating constraints.
Definition: options.hpp:50
#define COBALT_GAP_OPEN
Gap opening score.
Definition: options.hpp:77
#define COBALT_DOMAIN_HITLIST_SIZE
Hitlist size for Rps-Blast searches.
Definition: options.hpp:54
#define COBALT_TREE_METHOD
Default method for computing progressive alignment tree.
Definition: options.hpp:68
#define COBALT_END_GAP_EXTNT
End gap extension score.
Definition: options.hpp:75
#define COBALT_DOMAIN_BOOST
Weight for domain residue frequencies when creating MSA profiles.
Definition: options.hpp:52
#define COBALT_KMER_ALPH
K-mer alphabet for sequence clustering.
Definition: options.hpp:86
#define COBALT_DEFAULT_MATRIX
Default substitution matrix used in multiple alignment.
Definition: options.hpp:71
#define COBALT_MAX_CLUSTER_DIAM
Maximum cluster diameter for pre-alignment sequence clustering.
Definition: options.hpp:82
static patstr * patterns
Definition: pcregrep.c:259
Structure for representing single user constraint for pair-wise alignment.
Definition: options.hpp:194
static DP_BlockInfo * blocks
#define _ASSERT
Modified on Sat May 25 14:18:00 2024 by modify_doxy.py rev. 669887