84 virtual void Init(
void);
85 virtual int Run(
void);
86 virtual void Exit(
void);
125 arg_desc->SetUsageContext(
GetArguments().GetProgramBasename(),
126 "COBALT multiple sequence alignment utility");
129 arg_desc->SetCurrentGroup(
"Input");
130 arg_desc->AddOptionalKey(
"i",
"infile",
"File containing input sequences "
133 arg_desc->AddOptionalKey(
"in_msa1",
"infile",
"File containing input "
134 "alignment in FASTA format",
137 arg_desc->AddOptionalKey(
"in_msa2",
"infile",
"File containing input "
138 "alignment in FASTA format",
141 arg_desc->AddOptionalKey(
"ind1",
"numbers",
"Coma separated list of "
142 "sequence indices in MSA1 to be used for "
143 "constraints generation",
146 arg_desc->AddOptionalKey(
"ind2",
"numbers",
"Coma separated list of "
147 "sequence indices in MSA2 to be used for "
148 "constraints generation",
162 arg_desc->AddFlag(
"parse_deflines",
"Should the sequence deflines be "
167 arg_desc->SetCurrentGroup(
"Conserved domain options");
168 arg_desc->AddOptionalKey(
"rpsdb",
"database",
"Conserved domain database "
169 "name\nEither database or -norps option must be "
171 arg_desc->AddDefaultKey(
"norps",
"norps",
"Do not perform initial "
174 arg_desc->AddDefaultKey(
"rps_evalue",
"evalue",
175 "E-value threshold for selecting conserved domains"
176 " from results of RPS-BLAST search",
179 arg_desc->AddDefaultKey(
"num_domain_hits",
"number",
"Maximum number of "
180 "of domain hits for each sequence",
183 arg_desc->AddOptionalKey(
"p",
"patternfile",
184 "Filename containing regular expression patterns "
185 "for conserved domains",
187 arg_desc->AddDefaultKey(
"dfb",
"domain_res_boost",
188 "When assigning domain residue frequencies, the amount of "
189 "extra weight (0..1) to give to the actual sequence letter "
194 arg_desc->AddOptionalKey(
"domain_hits",
"infile",
"Results of pre-computed"
195 " domain search in BLAST archive format",
204 arg_desc->SetCurrentGroup(
"Constraints options");
205 arg_desc->AddOptionalKey(
"c",
"constraintfile",
206 "Filename containing pairwise alignment constraints, "
207 "one per line, each represented by 6 integers:\n"
208 " -zero-based index of sequence 1 in the input file\n"
209 " -zero-based start position in sequence 1\n"
210 " -zero-based stop position in sequence 1\n"
211 " -zero-based index of sequence 2 in the input file\n"
212 " -zero-based start position in sequence 2\n"
213 " -zero-based stop position in sequence 2\n",
218 arg_desc->SetCurrentGroup(
"Multiple alignment options");
219 arg_desc->AddDefaultKey(
"treemethod",
"method",
220 "Method for generating progressive alignment guide tree",
224 "clust",
"nj",
"fastme"));
225 arg_desc->AddDefaultKey(
"iter",
"iterate",
226 "After the first iteration search for conserved columns "
227 "and realign if any are found",
229 arg_desc->AddDefaultKey(
"ccc",
"conserved_cutoff",
230 "Minimum average score needed for a multiple alignment "
231 "column to be considered as conserved",
234 arg_desc->AddDefaultKey(
"pseudo",
"pseudocount",
235 "Pseudocount constant",
238 arg_desc->AddDefaultKey(
"ffb",
"filler_res_boost",
239 "When assigning filler residue frequencies, the amount of "
240 "extra weight (0..1) to give to the actual sequence letter "
247 arg_desc->SetCurrentGroup(
"Pairwise alignment options");
248 arg_desc->AddDefaultKey(
"matrix",
"matrix",
249 "Score matrix to use",
251 arg_desc->AddDefaultKey(
"end_gapopen",
"penalty",
252 "Gap open penalty for terminal gaps",
255 arg_desc->AddDefaultKey(
"end_gapextend",
"penalty",
256 "Gap extend penalty for terminal gaps",
259 arg_desc->AddDefaultKey(
"gapopen",
"penalty",
260 "Gap open penalty for internal gaps",
263 arg_desc->AddDefaultKey(
"gapextend",
"penalty",
264 "Gap extend penalty for internal gaps",
267 arg_desc->AddDefaultKey(
"blast_evalue",
"evalue",
268 "E-value threshold for selecting segments matched "
275 arg_desc->SetCurrentGroup(
"Query clustering options");
276 arg_desc->AddDefaultKey(
"clusters",
"clusters",
277 "Use query clustering for faster alignment",
279 arg_desc->AddDefaultKey(
"k",
"length",
280 "K-mer length for query clustering",
283 arg_desc->AddDefaultKey(
"max_dist",
"distance",
284 "Maximum allowed distance between sequences in a cluster"
288 arg_desc->AddDefaultKey(
"alph",
"name",
289 "Alphabet for used k-mer counting",
293 "se-v10",
"se-b15"));
297 arg_desc->SetCurrentGroup(
"Output options");
298 arg_desc->AddOptionalKey(
"seqalign",
"file",
299 "Output text seqalign to specified file",
301 arg_desc->AddOptionalKey(
"outfmt",
"format",
"Output format for multiple "
304 "clustalw",
"phylip",
"nexus"));
305 arg_desc->AddFlag(
"v",
"Verbose output");
314 vector<CMultiAlignerOptions::SConstraint>& constr)
317 if (
f.bad() ||
f.fail())
319 "Cannot open file with pairwise constraints");
321 int seq1, seq1_start, seq1_end;
322 int seq2, seq2_start, seq2_end;
326 f >> seq1 >> seq1_start >> seq1_end;
327 f >> seq2 >> seq2_start >> seq2_end;
329 c(seq1, seq1_start, seq1_end, seq2, seq2_start, seq2_end);
335 f >> seq1 >> seq1_start >> seq1_end;
336 f >> seq2 >> seq2_start >> seq2_end;
339 seq1_start, seq1_end, seq2, seq2_start, seq2_end));
347 vector<CMultiAlignerOptions::CPattern>&
patterns)
350 if (
f.bad() ||
f.fail())
352 "Cannot open patterns file");
357 string single_pattern;
361 if (!single_pattern.empty()) {
378 if (args[
"rpsdb"] && args[
"norps"].AsBoolean()) {
380 "The options -rpsdb and -norps T are mutually exclusive");
383 if (!args[
"rpsdb"] && !args[
"norps"].AsBoolean()) {
385 "RPS dababase not specified");
405 opts->
SetRpsDb(args[
"rpsdb"].AsString());
408 const string dbname = args[
"rpsdb"].AsString();
450 if (args[
"treemethod"].AsString() ==
"clust") {
453 else if (args[
"treemethod"].AsString() ==
"nj") {
456 else if (args[
"treemethod"].AsString() ==
"fastme") {
460 NcbiCerr <<
"Error: Incorrect tree method";
478 if (args[
"alph"].AsString() ==
"regular") {
481 else if (args[
"alph"].AsString() ==
"se-v10") {
484 else if (args[
"alph"].AsString() ==
"se-b15") {
497 if (args[
"domain_hits"]) {
515 vector< CRef<objects::CSeq_loc> > queries;
517 scope->AddDefaults();
521 if (!args[
"parse_deflines"]) {
532 m_UsageReport.AddParam(blast::CBlastUsageReport::eNumQueries,
533 (
int)queries.size());
539 objects::CSeqIdGenerator id_generator;
544 args[
"in_msa1"].AsInputFile(),
550 args[
"in_msa2"].AsInputFile(),
558 size_t num1 = 0, num2 = 0;
564 ITERATE (list<string>, it, tokens) {
573 ITERATE (list<string>, it, tokens) {
580 if (num1 != repr1.
size() || num2 != repr2.
size()) {
581 NcbiCerr <<
"Error: Non-unique indeces of input sequence "
588 aligner.
SetInputMSAs(*msa1, *msa2, repr1, repr2, scope);
603 sequence::CDeflineGenerator defline_gen;
605 if (args[
"outfmt"]) {
611 if (args[
"outfmt"].AsString() ==
"mfasta") {
614 else if (args[
"outfmt"].AsString() ==
"clustalw") {
617 else if (args[
"outfmt"].AsString() ==
"phylip") {
620 else if (args[
"outfmt"].AsString() ==
"nexus") {
637 if (args[
"parse_deflines"]) {
643 printf(
">%s",
label.c_str());
647 const vector<CSeq_id_Handle>& ids = bhandle.
GetId();
649 ITERATE (vector<CSeq_id_Handle>, it, ids) {
650 const string id_str = it->GetSeqId()->AsFastaString();
651 printf(
"%s", id_str.c_str());
652 if (it + 1 != ids.end()) {
659 string title = defline_gen.GenerateDefline(bhandle);
660 if (title !=
"unnamed protein product") {
661 printf(
" %s", title.c_str());
666 printf(
">%s\n", defline_gen.GenerateDefline(bhandle).c_str());
669 for (
int j = 0; j <
results[
i].GetLength(); j++) {
670 printf(
"%c",
results[
i].GetPrintableLetter(j));
676 if (args[
"seqalign"]) {
682 m_UsageReport.AddParam(blast::CBlastUsageReport::eExitStatus, 0);
691 int main(
int argc,
const char* argv[])
User-defined methods of the data storage class.
Data loader implementation that uses the blast databases.
Options and parameters for multiple alignment.
ETreeMethod
Method for construction of guide tree for progressive alignment.
@ eFastME
Fast Minimum Evolution.
@ eClusters
Clustering dendrogram.
void SetDomainHitlistSize(int size)
Set hitlist size (per sequence) for domain search.
void SetKmerAlphabet(TKMethods::ECompressedAlphabet alph)
Set alphabet for creating word count vectors.
void SetKmerLength(int len)
Set word size for creating word count vectors in query clustering.
void SetEndGapOpenPenalty(TScore penalty)
Set gap opening penalty for end gaps in pairwise global alignment of profiles.
void SetScoreMatrixName(const string &matrix)
Set alignment score matrix name.
void SetLocalResFreqBoost(double boost)
Set frequency boost for a letter that appears in query sequence in given position.
bool Validate(void)
Validate parameter values.
void SetVerbose(bool verbose)
Set verbose mode.
void SetMaxInClusterDist(double dist)
Set maximum allowed distance between sequences in a cluster.
void SetBlastpEvalue(double evalue)
Set e-value for accepting Blastp hits.
void SetUseQueryClusters(bool use)
Set use of query clustering option.
void SetDomainHits(CConstRef< objects::CBlast4_archive > archive)
Set pre-computed domain hits.
void SetTreeMethod(ETreeMethod method)
Set method for creating tree that guides progressive alignment.
const vector< string > & GetMessages(void)
Get warning messages.
void SetGapOpenPenalty(TScore penalty)
Set gap opening penalty for middle gaps in pairwise global alignment of profiles.
void SetRpsDb(const string &dbname)
Use RPS Blast with given database.
void SetConservedCutoffScore(double score)
Set cutoff score for conserved aligned columns.
TConstraints & SetUserConstraints(void)
Set user constraints.
vector< CPattern > & SetCddPatterns(void)
Set regular expression patterns for identification of conserved domains.
void SetInClustAlnMethod(EInClustAlnMethod method)
void SetIterate(bool use)
Set use of iterative alignment option.
@ fNoPatterns
Do not use conserved domain patterns.
@ fNoRpsBlast
Do not use RPS Blast.
void SetPseudocount(double pseudocount)
Set pseudocount for calculating column entropy.
void SetRpsEvalue(double evalue)
Set e-value threshold for accepting RPS Blast hits.
void SetDomainResFreqBoost(double boost)
Set boost for residue frequencies in conserved domains from RPS database.
@ eMulti
Alignment guide tree for each cluster is attached to the main alignment guide tree.
void SetGapExtendPenalty(TScore penalty)
Set gap extension penalty for middle gaps in pairwise global alignment of profiles.
void SetEndGapExtendPenalty(TScore penalty)
Set gap extension penalty for end gaps in pairwise global alignment of profiles.
Keeps track of CMultiAligner version.
Simultaneously align multiple protein sequences.
const vector< CSequence > & GetSeqResults(void) const
Retrieve the current aligned results in CSequence format.
CRef< objects::CSeq_align > GetResults(void) const
Retrieve the current aligned results in Seq-align format.
TStatus Run(void)
Align the current set of input sequences (reset any existing alignment information).
const vector< string > & GetMessages(void) const
Get Error/Warning messages.
@ eSuccess
Alignment successfully completed.
void SetInputMSAs(const objects::CSeq_align &msa1, const objects::CSeq_align &msa2, const set< int > &representatives1, const set< int > &representatives2, CRef< objects::CScope > scope)
Set input alignments.
void SetQueries(const vector< CRef< objects::CSeq_loc > > &queries, CRef< objects::CScope > scope)
Set query sequences.
CRef< objects::CScope > GetScope(void)
Get scope.
Printer for popular multiple alignment formats.
void Print(CNcbiOstream &ostr)
Print alignment.
void SetEndGapChar(unsigned char gap)
Set end gap character.
void SetFormat(EFormat format)
Set format for printing alignment.
void SetGapChar(unsigned char gap)
Set gap character.
void SetWidth(int width)
Set text width (number of columns) for alignment output.
blast::CBlastUsageReport m_UsageReport
virtual void Exit(void)
Cleanup on application exit.
virtual void Init(void)
Initialize the application.
virtual int Run(void)
Run the application.
CRef< CObjectManager > m_ObjMgr
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
iterator_bool insert(const value_type &val)
Interface for CMultiAligner.
static void x_LoadConstraints(string constraintfile, vector< CMultiAlignerOptions::SConstraint > &constr)
string s_GetTreeMethodAsString(CMultiAlignerOptions::ETreeMethod method)
string s_GetKmerAlphabetAsString(CMultiAlignerOptions::TKMethods::ECompressedAlphabet alph)
int main(int argc, const char *argv[])
static void x_LoadPatterns(string patternsfile, vector< CMultiAlignerOptions::CPattern > &patterns)
void GetSeqLocFromStream(CNcbiIstream &instream, vector< CRef< objects::CSeq_loc > > &seqs, CRef< objects::CScope > &scope, objects::CFastaReader::TFlags flags)
Reads fasta sequences from stream, adds them to scope, and returns them as the list of Seq_locs.
CRef< objects::CSeq_align > GetAlignmentFromStream(CNcbiIstream &instream, CRef< objects::CScope > &scope, objects::CFastaReader::TFlags flags, objects::CSeqIdGenerator &id_generator)
Reads fasta sequences as multiple sequence alignment.
void Print(const CCompactSAMApplication::AlignInfo &ai)
API (CDeflineGenerator) for computing sequences' titles ("definitions").
std::ofstream out("events_result.xml")
main entry point for tests
Operators to edit gaps in sequences.
void SetFullVersion(CRef< CVersionAPI > version)
Set version data for the program.
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
CVersionInfo GetVersion(void) const
Get the program version information.
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ fHideXmlHelp
Hide XML help description.
@ fHideLogfile
Hide log file description.
@ fHideFullVersion
Hide full version description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
@ eRequires
One argument requires another.
@ eExcludes
One argument excludes another.
@ eInputFile
Name of file (must exist and be readable)
@ eBoolean
{'true', 't', 'false', 'f'}, case-insensitive
@ eDouble
Convertible into a floating point number (double)
@ eString
An arbitrary string.
@ eOutputFile
Name of file (must be writable)
@ eInteger
Convertible into an integer number (int or Int8)
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
@ eDS_Default
Try standard log file (app.name + ".log") in /log/, use stderr on failure.
@ eDiag_Warning
Warning message.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
virtual bool Exists(void) const
Check existence of file.
#define MSerial_AsnText
I/O stream manipulators –.
long TFlags
binary OR of EFlags
@ fNoParseID
Generate an ID (whole defline -> title)
@ fForceType
Force specified type regardless of accession.
@ fParseRawID
Try to identify raw accessions.
@ fValidate
Check (alphabetic) residue validity.
@ fAssumeProt
Assume prots unless accns indicate otherwise.
@ eContent
Untagged human-readable accession or the like.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
@ eGetBioseq_All
Search bioseq, load if not loaded yet.
const TId & GetId(void) const
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
void Start(void)
Start the timer.
static const char label[]
char * dbname(DBPROCESS *dbproc)
Get name of current database.
unsigned int
A callback function used to compare two keys in a database.
const string version
version string
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
Defines: CTimeFormat - storage class for time format.
#define COBALT_GAP_EXTNT
Gap extension score.
#define COBALT_KMER_LEN
K-mer length for sequence clustering.
#define COBALT_END_GAP_OPEN
End gap opening score.
#define COBALT_PSEUDO_COUNT
Pseudocount constant used in multiple alignment.
#define COBALT_CONSERVED_CUTOFF
Conservation score cutoff used for selecting conserved columns in initial MSA.
#define COBALT_LOCAL_BOOST
Weight for sequence residues when creating MSA profiles.
#define COBALT_BLAST_EVALUE
Blastp e-value cutoff for creating constraints.
#define COBALT_RPS_EVALUE
Default values for cobalt parameters: Rps-Blast e-value cutoff for creating constraints.
#define COBALT_GAP_OPEN
Gap opening score.
#define COBALT_DOMAIN_HITLIST_SIZE
Hitlist size for Rps-Blast searches.
#define COBALT_TREE_METHOD
Default method for computing progressive alignment tree.
#define COBALT_END_GAP_EXTNT
End gap extension score.
#define COBALT_DOMAIN_BOOST
Weight for domain residue frequencies when creating MSA profiles.
#define COBALT_KMER_ALPH
K-mer alphabet for sequence clustering.
#define COBALT_DEFAULT_MATRIX
Default substitution matrix used in multiple alignment.
#define COBALT_MAX_CLUSTER_DIAM
Maximum cluster diameter for pre-alignment sequence clustering.
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
Structure for representing single user constraint for pair-wise alignment.
static DP_BlockInfo * blocks