56 virtual int Run(
void);
69 arg_desc->SetUsageContext(
GetArguments().GetProgramBasename(),
70 "Distances and clustering of multiple alignment"
73 arg_desc->AddKey(
"i",
"infile",
"file containing names of alignment files",
75 arg_desc->AddOptionalKey(
"d",
"defline_file",
76 "file containing deflines corrsponding to alignment files",
78 arg_desc->AddDefaultKey(
"first",
"first",
79 "produce alignments from alignment number 'first' "
80 "to all succeeding alignments in the list",
82 arg_desc->AddDefaultKey(
"last",
"index",
83 "Compare alignments from first to last against the rest "
84 "in the list. Indices start from 1 "
85 "(default indicates an all vs. all comparison).",
87 arg_desc->AddDefaultKey(
"g0",
"penalty",
88 "gap open penalty for initial/terminal gaps",
90 arg_desc->AddDefaultKey(
"e0",
"penalty",
91 "gap extend penalty for initial/terminal gaps",
93 arg_desc->AddDefaultKey(
"g1",
"penalty",
94 "gap open penalty for middle gaps",
96 arg_desc->AddDefaultKey(
"e1",
"penalty",
97 "gap extend penalty for middle gaps",
99 arg_desc->AddDefaultKey(
"matrix",
"matrix",
100 "score matrix to use",
102 arg_desc->AddDefaultKey(
"v",
"verbose",
103 "turn on verbose output",
105 arg_desc->AddDefaultKey(
"local",
"local",
106 "reduce end gap penalties in profile-profile alignment",
108 arg_desc->AddFlag(
"pairs",
"Report pairwise distances up to a cutoff "
109 "value instead of the distance matrix");
110 arg_desc->AddDefaultKey(
"cutoff",
"distance",
111 "Maximum pairwise distance to report",
113 arg_desc->AddDefaultKey(
"out",
"outfile",
"Output file name",
135 while (!line_reader.
AtEOF()) {
145 "Could not retrieve seq entry");
151 seqloc->
SetWhole().Assign(*itr->GetId().front());
152 blast::SSeqLoc sl(seqloc, scope);
153 retval.push_back(sl);
162 if (
strcmp(matrix_name,
"BLOSUM62") == 0)
164 else if (
strcmp(matrix_name,
"BLOSUM45") == 0)
166 else if (
strcmp(matrix_name,
"BLOSUM80") == 0)
168 else if (
strcmp(matrix_name,
"PAM30") == 0)
170 else if (
strcmp(matrix_name,
"PAM70") == 0)
172 else if (
strcmp(matrix_name,
"PAM250") == 0)
180 vector<CSequence>& query_data)
182 int align_length = query_data[0].GetLength();
183 int num_seqs = (
int)query_data.size();
184 for (
int i = 0;
i < align_length;
i++) {
185 for (
int j = 0; j < num_seqs; j++)
186 freq_data[
i][query_data[j].GetLetter(
i)]++;
194 for (
int i = 0;
i < freq_size;
i++) {
200 sum += freq_data[
i][j];
205 freq_data[
i][j] *= sum;
212 double **freq1,
int len1,
213 double **freq2,
int len2,
224 const size_t dim = transcript.size();
227 for(
size_t i = 0;
i < dim; ++
i) {
229 int wg1 = end_gap_open, ws1 = end_gap_extend;
230 int wg2 = end_gap_open, ws2 = end_gap_extend;
232 if (offset1 >= 0 && offset1 < len1 - 1) {
233 wg1 = aligner.
GetWg();
234 ws1 = aligner.
GetWs();
237 if (offset2 >= 0 && offset2 < len2 - 1) {
238 wg2 = aligner.
GetWg();
239 ws2 = aligner.
GetWs();
249 ++offset1; ++offset2;
250 double accum = 0.0, sum = 0.0;
255 if (freq1[offset1][m] < freq2[offset2][m]) {
256 accum += freq1[offset1][m] * (double)sm[m][m];
258 diff_freq2[m] = freq2[offset2][m] - freq1[offset1][m];
261 accum += freq2[offset2][m] * (double)sm[m][m];
262 diff_freq1[m] = freq1[offset1][m] -
268 if (freq1[offset1][0] <= freq2[offset2][0]) {
270 sum += diff_freq1[m];
273 sum += diff_freq2[m];
277 if (freq1[offset1][0] <= freq2[offset2][0]) {
279 diff_freq1[m] /= sum;
282 diff_freq2[m] /= sum;
287 accum += diff_freq1[m] *
294 freq1[offset1][0] * aligner.
GetWs() * (1-freq2[offset2][0]) +
295 freq2[offset2][0] * aligner.
GetWs() * (1-freq1[offset1][0]);
301 if(state1 != 1) dscore += wg1 * (1.0 - freq2[offset2][0]);
302 state1 = 1; state2 = 0;
309 if(state2 != 1) dscore += wg2 * (1.0 - freq1[offset1][0]);
310 state1 = 0; state2 = 1;
321 return (
int)(dscore + 0.5);
328 int freq_size = alignment[0].GetLength();
333 freq_data =
new double* [freq_size];
336 for (
int i = 1;
i < freq_size;
i++)
339 memset(freq_data[0], 0,
kAlphabetSize * freq_size *
sizeof(
double));
356 double **freq2_data,
int freq2_size,
358 bool local_alignment)
364 aligner.
SetSequences((
const double**)freq1_data, freq1_size,
365 (
const double**)freq2_data, freq2_size,
369 freq1_data, freq1_size,
370 freq2_data, freq2_size,
371 local_alignment ? 0 : end_gap_open,
372 local_alignment ? 0 : end_gap_extend);
381 for (
int i = 0;
i < freq_size;
i++) {
383 score += freq_data[
i][j] * matrix.
s[j][j];
398 vector<SAlignEntry>& aligns)
402 for (
i = 0;
i < level;
i++)
406 if (
tree->GetValue().GetId() >= 0)
407 printf(
"cluster %d (%s) ",
tree->GetValue().GetId(),
408 aligns[
tree->GetValue().GetId()].name);
409 if (
tree->GetValue().IsSetDist())
410 printf(
"distance %lf",
tree->GetValue().GetDist());
419 while (child !=
tree->SubNodeEnd()) {
420 for (
i = 0;
i < level;
i++)
438 vector<CTree::STreeLeaf> left_leaves;
439 vector<CTree::STreeLeaf> right_leaves;
443 (*child)->GetValue().GetDist());
446 (*child)->GetValue().GetDist());
448 for (
size_t i = 0;
i < left_leaves.size();
i++) {
449 for (
size_t j = 0; j < right_leaves.size(); j++) {
450 int idx1 = left_leaves[
i].query_idx;
451 double dist1 = left_leaves[
i].distance;
452 int idx2 = right_leaves[j].query_idx;
453 double dist2 = right_leaves[j].distance;
459 matrix(idx1, idx2) = matrix(idx2, idx1) = dist1 + dist2;
463 right_leaves.clear();
479 bool verbose = args[
"v"].AsBoolean();
500 vector<SAlignEntry> align_list;
501 while (!infile.fail() && !infile.eof()) {
514 for (
size_t i = 0;
i < align_read.size();
i++) {
522 char *name_ptr =
buf + strlen(
buf) - 1;
523 while (name_ptr >
buf &&
524 name_ptr[-1] !=
'/' &&
525 name_ptr[-1] !=
'\\') {
528 strcpy(e.
name, name_ptr);
531 int first_index = args[
"first"].AsInteger();
532 int last_index = args[
"last"].AsInteger();
533 if (last_index == 0) {
534 last_index = (
int)align_list.size() - 1;
541 if (first_index != 0) {
542 NcbiCerr <<
"error: all-against-all alignment required" << endl;
549 for (
int i = first_index;
i <= last_index;
i++) {
550 for (
size_t j =
i + 1; j < align_list.size(); j++) {
552 if (last_index < (
int)align_list.size() - 1 && (
int)j <= last_index) {
556 int len1 = align_list[
i].align[0].GetLength();
557 int len2 = align_list[j].align[0].GetLength();
560 if (args[
"local"].AsBoolean() ==
true) {
561 if (len1 > 1.5 * len2 || len2 > 1.5 * len1) {
564 else if (len1 > 1.2 * len2 || len2 > 1.2 * len1) {
571 align_list[
i].length,
572 align_list[j].profile,
573 align_list[j].length,
575 args[
"local"].AsBoolean());
578 printf(
"%s %s %.2f\n", align_list[
i].name,
579 align_list[j].name, (
double)score / 100);
585 distances(
i, j) = distances(j,
i) = 1.0 - (double)score *
586 (1.0 / align_list[
i].self_score +
587 1.0 / align_list[j].self_score) / 2;
594 int num_clusters = (
int)align_list.size();
599 while (!dfile.fail() && !dfile.eof()) {
608 dfile.getline(defline,
sizeof(defline),
'\n');
612 for (
int i = 0;
i < num_clusters;
i++) {
613 if (strstr(align_list[
i].name, name)) {
614 snprintf(align_list[
i].name,
sizeof(align_list[
i].name),
615 "%s %s", name, defline);
616 char *
tmp = align_list[
i].name;
628 for (
int i=0;
i <= last_index;
i++) {
629 for (
int j=(
i+1);j < num_clusters; j++) {
631 if (last_index < (
int)align_list.size() - 1 && j <= last_index) {
635 if (distances(
i, j) < args[
"cutoff"].AsDouble()) {
636 args[
"out"].AsOutputFile() << align_list[
i].name <<
"\t"
637 << align_list[j].name <<
"\t"
644 printf(
"%d\n", num_clusters);
645 for (
int i = 0;
i < num_clusters;
i++) {
646 printf(
"%s\n", align_list[
i].name);
647 for (
int j = 0; j < num_clusters; j++) {
648 printf(
"%5.4f ", distances(
i, j));
665 printf(
"\n\nNew Distance matrix:\n ");
666 for (
int i = (
int)align_list.size() - 1;
i > 0;
i--)
670 for (
int i = 0;
i < (
int)align_list.size() - 1;
i++) {
672 for (
int j = (
int)align_list.size() - 1; j >
i; j--) {
673 printf(
"%5.3f ", new_distances(
i, j));
679 printf(
"\n\nPercent relative error:\n ");
680 for (
int i = (
int)align_list.size() - 1;
i > 0;
i--)
684 for (
int i = 0;
i < (
int)align_list.size() - 1;
i++) {
686 for (
int j = (
int)align_list.size() - 1; j >
i; j--) {
687 printf(
"%5.2f ", 100*
fabs(new_distances(
i, j) -
688 distances(
i, j)) / distances(
i, j));
704 int main(
int argc,
const char* argv[])
static const int kAlphabetSize
The aligner internally works only with the ncbistdaa alphabet.
Data loader implementation that uses the blast databases.
Base class for reading FASTA sequences.
virtual void Exit(void)
Cleanup on application exit.
virtual void Init(void)
Initialize the application.
virtual int Run(void)
Run the application.
virtual void Exit(void)
Cleanup on application exit.
virtual void Init(void)
Initialize the application.
CRef< CObjectManager > m_ObjMgr
virtual int Run(void)
Run the application.
Simple implementation of ILineReader for i(o)streams.
definition of a Culling tree
A wrapper for controlling access to the phylogenetic tree generated by CDistMethods.
static void ListTreeLeaves(const TPhyTreeNode *node, vector< STreeLeaf > &node_list, double curr_dist=0)
Traverse a tree below a given starting point, listing all leaves encountered along the way.
Template class for iteration on objects of class C (non-medifiable version)
Interface for CMultiAligner.
Operators to edit gaps in sequences.
void SetStartWg(TScore value)
TTranscript GetTranscript(bool reversed=true) const
void SetEndWs(TScore value)
virtual CNWAligner::TScore Run(void)
TScore GetStartWs() const
void SetScoreMatrix(const SNCBIPackedScoreMatrix *scoremat)
SNCBIFullScoreMatrix & GetMatrix()
void SetEndWg(TScore value)
vector< ETranscriptSymbol > TTranscript
void SetSequences(const char *seq1, size_t len1, const char *seq2, size_t len2, bool verify=true)
TScore GetStartWg() const
void SetEndSpaceFree(bool Left1, bool Right1, bool Left2, bool Right2)
void SetStartWs(TScore value)
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ fHideXmlHelp
Hide XML help description.
@ fHideLogfile
Hide log file description.
@ fHideFullVersion
Hide full version description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
@ eInputFile
Name of file (must exist and be readable)
@ eBoolean
{'true', 't', 'false', 'f'}, case-insensitive
@ eDouble
Convertible into a floating point number (double)
@ eString
An arbitrary string.
@ eOutputFile
Name of file (must be writable)
@ eInteger
Convertible into an integer number (int or Int8)
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
@ eDS_Default
Try standard log file (app.name + ".log") in /log/, use stderr on failure.
@ eDiag_Warning
Warning message.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
virtual CRef< CSeq_entry > ReadOneSeq(ILineErrorListener *pMessageListener=nullptr)
Read a single effective sequence, which may turn out to be a segmented set.
bool AtEOF(void) const
Indicates (negatively) whether there is any more input.
@ fNoParseID
Generate an ID (whole defline -> title)
@ fForceType
Force specified type regardless of accession.
@ fParseRawID
Try to identify raw accessions.
@ fAssumeProt
Assume prots unless accns indicate otherwise.
CConstBeginInfo ConstBegin(const C &obj)
Get starting point of non-modifiable object hierarchy.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry, default priority is higher than for defaults or loaders Add object to the score with p...
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
TNodeList_CI SubNodeBegin(void) const
Return first const iterator on subnode list.
TNodeList::const_iterator TNodeList_CI
bool IsLeaf() const
Report whether this is a leaf node.
TNodeList_CI SubNodeEnd(void) const
Return last const iterator on subnode list.
unsigned int
A callback function used to compare two keys in a database.
static void x_NormalizeResidueFrequencies(double **freq_data, int freq_size)
static void x_FillResidueFrequencies(double **freq_data, vector< CSequence > &query_data)
static double x_GetSelfScore(double **freq_data, int freq_size, SNCBIFullScoreMatrix &matrix)
static void x_PrintTree(const TPhyTreeNode *tree, int level, vector< SAlignEntry > &aligns)
static int x_ScoreFromTranscriptCore(CPSSMAligner &aligner, double **freq1, int len1, double **freq2, int len2, int end_gap_open, int end_gap_extend)
int x_AlignProfileProfile(double **freq1_data, int freq1_size, double **freq2_data, int freq2_size, CPSSMAligner &aligner, bool local_alignment)
static blast::TSeqLocVector x_GetSeqLocFromStream(CNcbiIstream &instream, CObjectManager &objmgr)
void x_SetScoreMatrix(const char *matrix_name, CPSSMAligner &aligner)
static double ** x_GetProfile(vector< CSequence > &alignment)
int main(int argc, const char *argv[])
static void x_FillNewDistanceMatrix(const TPhyTreeNode *node, CDistMethods::TMatrix &matrix)
static const int kScaleFactor
CSequnceHelper< CObject > CSequence
int strcmp(const char *str1, const char *str2)
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
const SNCBIPackedScoreMatrix NCBISM_Pam30
const SNCBIPackedScoreMatrix NCBISM_Blosum62
const SNCBIPackedScoreMatrix NCBISM_Pam250
#define NCBI_FSM_DIM
Recommended approach: unpack and index directly.
const SNCBIPackedScoreMatrix NCBISM_Blosum80
const SNCBIPackedScoreMatrix NCBISM_Pam70
const SNCBIPackedScoreMatrix NCBISM_Blosum45
The standard matrices.
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
vector< CSequence > align