92 TRange(it->seq1_start, it->seq1_stop),
93 TRange(it->seq2_start, it->seq2_stop),
117 const unsigned char* sequence = it->GetSequence();
118 for (
int i=0;
i < it->GetLength();
i++) {
121 "input sequences are not allowed");
134 "Empty input alignment");
140 if (*it >= (
int)
m_InMSA1.size() || *it < 0) {
143 " for MSA 1 out of bounds");
149 if (*it >= (
int)
m_InMSA2.size() || *it < 0) {
152 " for MSA 2 out of bounds");
167 "Sequence specified by constraint is out of range");
176 if (from1 > to1 || from2 > to2) {
178 "Range specified by constraint is invalid");
185 "Constraint is out of range");
197 if (queries.size() < 2) {
199 "Aligner requires at least two input sequences");
221 if (queries.size() < 2) {
223 "Aligner requires at least two input sequences");
227 = objects::CObjectManager::GetInstance();
232 vector<objects::CBioseq_Handle> bioseq_handles;
234 bioseq_handles.push_back(
m_Scope->AddBioseq(**it));
238 ITERATE(vector<objects::CBioseq_Handle>, it, bioseq_handles) {
240 seq_loc(
new objects::CSeq_loc(objects::CSeq_loc::e_Whole));
243 seq_loc->SetId(*it->GetSeqId());
245 catch (objects::CObjMgrException e) {
247 (
string)
"Missing seq-id in bioseq. " + e.
GetMsg());
265 if (queries.size() < 2) {
267 "Aligner requires at least two input sequences");
273 for (
size_t i=0;
i < queries.size();
i++) {
282 m_Scope->AddScope(*queries[
i].scope);
298 const objects::CSeq_align& msa2,
314 ITERATE (objects::CDense_seg::TIds, it,
315 msa1.GetSegs().GetDenseg().GetIds()) {
318 new objects::CSeq_loc(objects::CSeq_loc::e_Whole)));
321 ITERATE (objects::CDense_seg::TIds, it,
322 msa2.GetSegs().GetDenseg().GetIds()) {
325 new objects::CSeq_loc(objects::CSeq_loc::e_Whole)));
330 if (!repr1.
empty()) {
341 if (!repr2.
empty()) {
361 "No tree to return");
383 if (
strcmp(matrix_name,
"BLOSUM62") == 0)
385 else if (
strcmp(matrix_name,
"BLOSUM45") == 0)
387 else if (
strcmp(matrix_name,
"BLOSUM80") == 0)
389 else if (
strcmp(matrix_name,
"PAM30") == 0)
391 else if (
strcmp(matrix_name,
"PAM70") == 0)
393 else if (
strcmp(matrix_name,
"PAM250") == 0)
397 "Unsupported score matrix. Valid matrix names: BLOSUM45, "\
398 "BLOSUM62, BLOSUM80, PAM30, PAM70 and PAM250");
430 const Int4 kGapOpen = 11;
431 const Int4 kGapExtend = 1;
435 NCBI_THROW(blast::CBlastException, eInvalidArgument,
436 "Cannot generate Karlin block");
450 dmat.
Resize(clusters.size(), clusters.size(), 0.0);
451 for (
size_t i=0;
i < clusters.size() - 1;
i++) {
452 for (
size_t j=
i+1;j < clusters.size();j++) {
453 dmat(
i, j) = bigmat(clusters[
i].GetPrototype(),
454 clusters[j].GetPrototype());
455 dmat(j,
i) = dmat(
i, j);
467 printf(
"distance matrix:\n");
469 for (
int i = matrix.
GetCols() - 1;
i > 0;
i--)
475 for (
int j = matrix.
GetCols() - 1; j >
i; j--) {
476 printf(
"%5.3f ", matrix(
i, j));
508 "Alignment interrupted");
524 vector<CTree::STreeLeaf> node_list1;
525 vector<CTree::STreeLeaf> node_list2;
538 vector<int> compress_inds;
540 compress_inds.push_back(
i);
543 compress_inds.clear();
545 compress_inds.push_back(
i);
559 vector<const CSequence*> pattern_queries;
580 for (
unsigned int i = 0;
i < pair_info.
GetRows();
i++) {
581 for (
unsigned int j = 0; j < pair_info.
GetCols(); j++) {
582 pair_info(
i, j).ResetList();
602 (
string)
"Number of queries exceeds maximum of "
606 bool is_cluster_found =
false;
607 vector<TPhyTreeNode*> cluster_trees;
635 "Invalid clustering option");
644 vector<const CSequence*> pattern_queries;
658 if (is_cluster_found) {
678 "Invalid clustering option");
718 catch (blast::CBlastException e) {
719 blast::CBlastException::EErrCode err_code
720 = (blast::CBlastException::EErrCode)e.
GetErrCode();
722 status = (err_code == blast::CBlastException::eInvalidArgument
731 catch (std::exception e) {
749 vector<TKmerCounts> kmer_counts;
758 shared_ptr<CClusterer::TDistMatrix> dmat
769 for (
size_t i=0;
i < dmat->GetRows();
i++) {
773 (*dmat)(center,
i) = 0.0;
774 (*dmat)(
i, center) = 0.0;
788 for (
int i=0;
i < (
int)dmat->GetCols() - 1;
i++) {
792 if (!constr_q.
empty() && constr_q.
find(
i) != constr_q.
end()) {
796 for (
int j=
i+1;j < (
int)dmat->GetCols();j++) {
798 if (!constr_q.
empty() && constr_q.
find(j) != constr_q.
end()) {
811 const double kMaxDistance = 1.5;
815 for (
int i=0;
i < (
int)dmat->GetRows();
i++) {
817 (*dmat)(
i, *it) = kMaxDistance;
818 (*dmat)(*it,
i) = kMaxDistance;
825 printf(
"K-mer counts distance matrix:\n");
827 for (
size_t i=dmat->GetCols() - 1;
i > 0;
i--) {
828 printf(
"%6d", (
int)
i);
831 for (
size_t i=0;
i < dmat->GetRows() - 1;
i++) {
832 printf(
"%3d:", (
int)
i);
833 for (
size_t j=dmat->GetCols() - 1;j >
i;j--) {
834 printf(
"%6.3f", (*dmat)(
i, j));
862 printf(
"\nNumber of queries in clusters: 0 (0%%)\n");
863 printf(
"Number of domain searches reduced by: 0 (0%%)\n\n");
864 printf(
"Only single-element clusters were found."
865 " No clustering information will be used.\n");
879 if (it->size() == 1) {
880 it->SetPrototype(*it->begin());
882 }
else if (it->size() == 2) {
887 int prot = (len1 > len2) ? (*it)[0] : (*it)[1];
888 it->SetPrototype(
prot);
898 vector< CRef<objects::CSeq_loc> > cluster_prototypes;
901 cluster_prototypes.push_back(
m_tQueries[cluster_it->GetPrototype()]);
912 const vector<CSequence>& q =
915 printf(
"Query clusters:\n");
917 size_t num_in_clusters = 0;
919 printf(
"Cluster %3d: ", cluster_idx++);
920 printf(
"(prototype: %3d) ", it_cl->GetPrototype());
923 printf(
"%d (%d), ", *it_el, q[*it_el].
GetLength());
926 if (it_cl->size() > 1) {
927 num_in_clusters += it_cl->size();
931 size_t gain =
m_QueryData.size() - clusters.size();
932 printf(
"\nNumber of queries in clusters: %lu (%.0f%%)\n",
934 (
double)num_in_clusters /
m_QueryData.size() * 100.0);
935 printf(
"Number of domain searches reduced by: %lu (%.0f%%)\n\n", gain,
939 printf(
"Distances in clusters:\n");
940 for (
size_t cluster_idx=0;cluster_idx < clusters.size();
944 if (cluster.
size() == 1) {
948 printf(
"Cluster %d:\n", (
int)cluster_idx);
949 if (cluster.
size() == 2) {
950 printf(
" %6.3f\n\n", d(cluster[0], cluster[1]));
955 for (
size_t i= cluster.
size() - 1;
i > 0;
i--) {
956 printf(
"%6d", (
int)cluster[
i]);
959 for (
size_t i=0;
i < cluster.
size() - 1;
i++) {
960 printf(
"%3d:", (
int)cluster[
i]);
961 for (
size_t j=cluster.
size() - 1;j >
i;j--) {
962 printf(
"%6.3f", d(cluster[
i], cluster[j]));
969 printf(
"Sequences that belong to different clusters with distance"
970 " smaller than threshold (exludes prototypes):\n");
972 if (it->size() == 1) {
983 if (*el == cl->GetPrototype()) {
988 printf(
"%3d, %3d: %f\n", *elem, *el, d(*elem, *el));
1023 for (
size_t cluster_idx=0;cluster_idx < clusters.size();cluster_idx++) {
1030 if (clusters[cluster_idx].
size() > 1) {
1036 bool is_gap_in_prototype =
false;
1056 if (len1 > 1.2 * len2 || len2 > 1.2 * len1) {
1076 for (
size_t j=0;j <
t.size();j++) {
1078 is_gap_in_prototype =
true;
1083 if (!is_gap_in_prototype) {
1092 for (it=clusters[cluster_idx].begin();it != seq_idx;++it) {
1114 printf(
"Aligning in cluster %d:\n", (
int)cluster_idx);
1117 printf(
"%3d: ", *elem);
1131 "Alignement Interrupted");
1141 printf(
"Gaps in cluster %d: ", (
int)
i);
1153 if (clusters.size() == 1) {
1164 it->letters.resize(
len);
1165 for (
size_t i=0;
i <
len;
i++) {
1166 it->letters[
i] = -1;
1176 vector<CMultiAligner::SColumn>::iterator& it,
1177 size_t len,
int num,
int cluster)
1180 it->letters.resize(
len);
1181 for (
size_t i=0;
i <
len;
i++) {
1182 it->letters[
i] = -1;
1185 it->cluster = cluster;
1192 int seq_length =
m_Results[0].GetLength();
1195 vector<int> letter_inds(clusters.size());
1196 vector<SColumn> columns(seq_length);
1203 for (
size_t j=0;j < clusters.size();j++) {
1205 it->letters[clusters[j].GetPrototype()] = letter_inds[j]++;
1212 int new_length = seq_length;
1215 for (
size_t cluster_idx=0;cluster_idx < clusters.size();cluster_idx++) {
1233 vector<SColumn>::iterator it = columns.begin();
1234 size_t prototype_idx = clusters[cluster_idx].GetPrototype();
1235 while (it != columns.end()
1236 && (it->insert || it->letters[prototype_idx] < (
int)
letter)) {
1241 it = columns.insert(it,
SColumn());
1256 "Alignment interrupted");
1267 it->Reset(new_length);
1272 vector<int> gap_offsets(clusters.size());
1276 ITERATE(vector<SColumn>, it, columns) {
1281 for (
size_t i=0;
i < clusters.size();
i++) {
1284 size_t prototype_idx = clusters[
i].GetPrototype();
1285 int letter = it->letters[prototype_idx];
1303 results[*elem].SetLetter(col,
1313 for (
int i=0;
i < it->number;
i++) {
1314 results[*elem].SetLetter(col +
i,
1325 "Alignment interrupted");
1332 printf(
"Cluster prototypes:\n");
1334 const CSequence& seq = results[it->GetPrototype()];
1342 printf(
"Individual clusters:\n");
1343 for (
int i=0;
i < (
int)clusters.size();
i++) {
1344 if (clusters[
i].
size() > 1) {
1345 printf(
"Cluster %d:\n",
i);
1359 printf(
"All queries:\n");
1360 ITERATE(vector<CSequence>, it, results) {
1381 for (
size_t cluster_idx=0;cluster_idx < clusters.size();cluster_idx++) {
1387 if (cluster.
size() == 1) {
1415 freqs(
i, k) += matrix(
i +
offset, k);
1426 "Alignment interrupted");
1434 vector<int>& indices)
1445 queries.push_back(sl);
1446 indices.push_back(*it);
1453 queries.push_back(sl);
1454 indices.push_back((
int)
m_InMSA1.size() + *it);
1467 blast::SSeqLoc sl(**it, *
m_Scope);
1468 queries.push_back(sl);
1478 int index = it->GetPrototype();
1480 queries.push_back(sl);
1481 indices.push_back(index);
1487 "Invalid in-cluster alignment method");
1494 vector<int>& indices)
1505 indices[
i] = (
int)
i;
1512 queries.resize(clusters.size());
1513 indices.resize(clusters.size());
1514 for (
size_t i=0;
i < clusters.size();
i++) {
1515 int index = clusters[
i].GetPrototype();
1524 "Invalid in-cluster alignment method");
1541 double node_dist = distance / 2.0;
1544 if (node_dist <= 0.0) {
1552 node->
GetValue().SetDist(node_dist);
1558 node->
GetValue().SetDist(node_dist);
1575 int id = ids[node->
GetValue().GetId()];
1599 _ASSERT(trees.size() == clusters.size());
1603 for (
size_t i=0;
i < trees.size();
i++) {
1606 if (clusters[
i].
size() == 1) {
1614 trees.resize(clusters.size());
1615 for (
int clust_idx=0;clust_idx < (
int)clusters.size();clust_idx++) {
1619 if (cluster.
size() == 1) {
1620 trees[clust_idx] =
NULL;
1624 if (cluster.
size() == 2) {
1633 CTree single_tree(mat,
1640 trees[clust_idx] = root;
1646 for (
size_t i=0;
i < trees.size();
i++) {
1648 printf(
"Tree for cluster %d:\n", (
int)
i);
1667 vector<double>& leaf_dists,
1668 vector<TPhyTreeNode*>& leaf_nodes,
1669 bool last_edge_only =
false)
1673 if (
tree->IsLeaf()) {
1675 int id =
tree->GetValue().GetId();
1676 double dist =
tree->GetValue().GetDist();
1677 if (!last_edge_only) {
1678 dist += dist_from_root;
1681 _ASSERT(
id < (
int)leaf_dists.size());
1682 leaf_dists[id] = dist;
1685 _ASSERT(
id < (
int)leaf_nodes.size() && !leaf_nodes[
id]);
1686 leaf_nodes[id] =
tree;
1692 if (
tree->GetParent() &&
tree->GetValue().IsSetDist() && !last_edge_only) {
1693 dist =
tree->GetValue().GetDist();
1700 while (it !=
tree->SubNodeEnd()) {
1714 double dist_from_root)
1718 if (node->
GetValue().GetId() ==
id) {
1719 return dist_from_root + node->
GetValue().GetDist();
1779 if (curr_dist > 0.0) {
1780 scale = dist / curr_dist;
1791 const vector<TPhyTreeNode*>& cluster_trees,
1792 const vector<TPhyTreeNode*>& cluster_leaves)
1794 ITERATE(vector<TPhyTreeNode*>, it, cluster_leaves) {
1802 int cluster_id = node->
GetValue().GetId();
1811 int seq_id = cluster[0];
1825 vector<TPhyTreeNode*> children;
1828 children.push_back(*child);
1831 ITERATE(vector<TPhyTreeNode*>, it, children) {
1849 _ASSERT(cluster_trees.size() == clusters.size());
1853 vector<double> cluster_dists(clusters.size(), 0.0);
1854 vector<TPhyTreeNode*> cluster_leaves(clusters.size(),
NULL);
1859 vector<TPhyTreeNode*> dummy_vect(clusters.size(),
NULL);
1860 vector<double>d(cluster_dists.size());
1862 for (
size_t i=0;
i < d.size();
i++) {
1863 printf(
"%d:%f ", (
int)
i, d[
i]);
1870 for (
size_t i=0;
i < cluster_trees.size();
i++) {
1873 if (!cluster_trees[
i]) {
1880 if (cluster_dists[
i] <= 0.0) {
1881 cluster_dists[
i] = 1e-5;
1900 for (
size_t i=0;
i < cluster_dists.size();
i++) {
1901 printf(
"%d:%f ", (
int)
i, cluster_dists[
i]);
1907 printf(
"Full tree:\n");
static const int kAlphabetSize
The aligner internally works only with the ncbistdaa alphabet.
Declares the BLAST exception class.
Int2 Blast_KarlinBlkGappedLoadFromTables(Blast_KarlinBlk *kbp, Int4 gap_open, Int4 gap_extend, const char *matrix_name, Boolean standard_only)
Attempts to fill KarlinBlk for given gap opening, extensions etc.
int GetPrototype(void) const
Get cluster prototype.
size_t size(void) const
Get cluster size.
vector< int >::const_iterator const_iterator
Interface for CClusterer class used for clustering any type of data based on distance matrix.
void ReleaseTrees(vector< TPhyTreeNode * > &trees)
Get list of trees for clusters and release ownership to caller.
@ eClique
Clusters can be joined if there is a link between all pairs of their elements.
@ eCompleteLinkage
Maximum distance between elements.
TPhyTreeNode * ReleaseTree(int index=0)
Get cluster tree and release ownership to caller.
void Reset(void)
Clear clusters and distance matrix.
void SetMakeTrees(bool trees)
Set make cluster tree/dendrogram option.
void SetDistMatrix(const TDistMatrix &dmat)
Set new distance matrix.
const TDistMatrix & GetDistMatrix(void) const
Get distance matrix.
void SetLinks(CRef< CLinks > links)
Set distance links.
void Run(void)
Cluster elements.
const TClusters & GetClusters(void) const
Get clusters.
void ComputeClusters(double max_diam, EDistMethod dist_method=eCompleteLinkage, bool do_trees=true, double infinity=-1.0)
Compute clusters.
void SetClustMethod(EClustMethod method)
Set clustering method for links.
vector< TSingleCluster > TClusters
TClusters & SetClusters(void)
Set clusters.
void PurgeDistMatrix(void)
Delete distance matrix.
void GetClusterDistMatrix(int index, TDistMatrix &mat) const
Get distance matrix for elements of a selected cluster.
Representation of pairwise distances, intended for use in multiple sequence alignment applications.
const CDistMethods::TMatrix & GetMatrix() const
Access the current distance matrix.
Interface for the traceback from blast hits.
An ordered collection of CHit objects.
int Size() const
Retrieve number of hits in list.
void PurgeAllHits()
Delete all hits unconditionally.
CHit * GetHit(int index)
Retrieve a hit from the hitlist.
void AddToHitList(CHit *hit)
Append a hit to the hitlist.
A generalized representation of a pairwise alignment.
int m_SeqIndex1
Numerical identifier for first sequence in alignment.
int m_SeqIndex2
Numerical identifier for second sequence in alignment.
TRange m_SeqRange1
The range of offsets on the first sequence.
TRange m_SeqRange2
The range of offsets on the second sequence.
Set of edges with weights between nodes represented by zero-based positive integers.
void AddLink(int first, int second, double weight)
Add link.
void Sort(void)
Sort links according to weights in ascending order.
Options and parameters for multiple alignment.
@ eFastME
Fast Minimum Evolution.
@ eClusters
Clustering dendrogram.
TScore GetEndGapExtendPenalty(void) const
Get gap extension penalty for end gaps in pairwise global alignment of profiles.
double GetMaxInClusterDist(void) const
Get maximum allowed distance between sequences in a cluster.
string GetScoreMatrixName(void) const
Get alignment score matrix name.
TKMethods::EDistMeasures GetKmerDistMeasure(void) const
Get method for computing distance between word count vectors.
int GetCentralSeq(void) const
Get central sequence.
EInClustAlnMethod GetInClustAlnMethod(void) const
TScore GetGapExtendPenalty(void) const
Get gap extension penalty for middle gaps in pairwise global alignment of profiles.
const TConstraints & GetUserConstraints(void) const
Get user constraints.
TScore GetGapOpenPenalty(void) const
Get gap opening penalty for middle gaps in pairwise global alignment of profiles.
TScore GetEndGapOpenPenalty(void) const
Get gap opening penalty for end gaps in pairwise global alignment of profiles.
TKMethods::ECompressedAlphabet GetKmerAlphabet(void) const
Get alphabet used for creating word count vectors.
ETreeMethod GetTreeMethod(void) const
Get method for creating tree that guides progressive alignment.
vector< SConstraint > TConstraints
bool GetUseQueryClusters(void) const
Check if query clustering option is on.
@ eToPrototype
All cluster elements are aligned to cluster prototype.
@ eMulti
Alignment guide tree for each cluster is attached to the main alignment guide tree.
int GetKmerLength(void) const
Get word size for creating word count vectors.
int GetUserConstraintsScore(void) const
Get score for user alignment constraints.
bool GetVerbose(void) const
Get verbose mode.
Simultaneously align multiple protein sequences.
vector< CSequence > m_AllQueryData
vector< vector< Uint4 > > m_ClusterGapPositions
CMultiAlignerOptions::EInClustAlnMethod m_ClustAlnMethod
void x_SetScoreMatrix(const char *matrix_name)
Set the score matrix the aligner will use.
SProgress m_ProgressMonitor
CRef< objects::CScope > m_Scope
void x_ComputeClusterTrees(vector< TPhyTreeNode * > &trees)
Compute independent phylogenetic trees for each cluster.
void x_FindLocalInClusterHits(const vector< TPhyTreeNode * > &cluster_trees)
Run blast on sequences from each cluster subtree.
static void x_InitInsertColumn(vector< SColumn >::iterator &it, size_t len, int num, int cluster)
vector< int > m_Msa2Repr
Indices of sequence representatives in input alignment 2.
vector< CRef< objects::CSeq_loc > > m_tQueries
void x_MakeClusterResidueFrequencies()
Compute profile residue frequencies for clusters.
vector< CRef< objects::CSeq_loc > > m_AllQueries
TStatus Run(void)
Align the current set of input sequences (reset any existing alignment information).
struct CMultiAligner::SColumn SColumn
Column in an alignment used for combining result from multiple alignment and pair-wise in-cluster ali...
void x_ComputeTree()
Given the current list of domain and local hits, generate a phylogenetic tree that clusters the curre...
void x_FindLocalHits(const blast::TSeqLocVector &queries, const vector< int > &indices)
Run blast on selected input sequences and postprocess the results.
@ eOutOfMemory
Out of memory error.
@ eInternalError
Unexpected error occurred.
@ eSuccess
Alignment successfully completed.
@ eInterrupt
Alignment interrupted through callback function.
@ eOptionsError
Error related to options occurred.
@ eDatabaseError
Error related to RPS database occurred.
@ eQueriesError
Error related to query sequences occurred.
void x_BuildFullTree(const vector< TPhyTreeNode * > &cluster_trees)
Combine alignment guide tree computed for clusters with guide trees computed for each cluster.
vector< CSequence > m_QueryData
bool x_ValidateInputMSAs(void) const
Validate input alignments.
bool x_ValidateQueries(void) const
Validate query sequences.
void x_CreatePatternQueries(vector< const CSequence * > &queries, vector< int > &indices)
Create query set for PROSITE pattern search along with indices in multiple alignment queries array.
void SetInputMSAs(const objects::CSeq_align &msa1, const objects::CSeq_align &msa2, const set< int > &representatives1, const set< int > &representatives2, CRef< objects::CScope > scope)
Set input alignments.
void x_AlignMSAs(void)
Align multiple sequence alignments.
void x_MultiAlignClusters()
Combine pair-wise in-cluster alignments with multiple alignments of cluster prototypes.
vector< CSequence > m_InMSA1
Input alignment.
vector< CSequence > m_Results
void x_FindConsistentHitSubset(void)
Find consistent subset of pair-wise hits that can be used as alignment constraints.
CConstRef< CMultiAlignerOptions > m_Options
void x_InitParams(void)
Initiate parameters using m_Options.
void x_AlignProfileProfile(vector< CTree::STreeLeaf > &node_list1, vector< CTree::STreeLeaf > &node_list2, vector< CSequence > &alignment, CNcbiMatrix< CHitList > &pair_info, int iteration)
Align two collections of sequences.
void SetQueries(const vector< CRef< objects::CSeq_loc > > &queries, CRef< objects::CScope > scope)
Set query sequences.
void Reset(void)
Clear out the state left by the previous alignment operation.
void x_AlignInClusters()
Pair-wise align each cluster sequence to cluster representative.
vector< string > m_Messages
static void x_InitColumn(vector< SColumn >::iterator &it, size_t len)
virtual void x_Run(void)
Align the current set of input sequences (reset any existing alignment information).
bool(* FInterruptFn)(SProgress *progress)
Prototype for function pointer to determine whether alignment should proceed or be interrupted.
void x_FindDomainHits(blast::TSeqLocVector &queries, const vector< int > &indices)
Run RPS blast on selected input sequences and postprocess the results.
CMultiAligner(void)
Create multi aligner with default options.
void x_BuildAlignment()
Given the current domain, local, pattern and user hits, along with the current tree,...
void x_Init(void)
Initiate class attributes that are not alignment parameters.
void x_FindPatternHits(const vector< const CSequence * > &queries, const vector< int > &indices)
Find PROSITE pattern hits on selected input sequences.
CRef< objects::CBioTreeContainer > GetTreeContainer(void) const
Get serializable tree used as guide in progressive alignment.
vector< int > m_Msa1Repr
Indices of sequence representatives in input alignment 1.
void x_AttachClusterTrees(const vector< TPhyTreeNode * > &cluster_trees, const vector< TPhyTreeNode * > &cluster_leaves)
Replace leaves in the alignment guide tree of clusters with cluster trees.
bool x_FindQueryClusters()
Find clusters of similar queries, select cluster representative sequences, and prepare input to multi...
void x_CreateBlastQueries(blast::TSeqLocVector &queries, vector< int > &indices)
Create query set for RPS Blast and Blastp searches along with indices in multiple alignment queries a...
bool x_ValidateUserHits(void)
Validate user constraints with queries.
void x_InitAligner(void)
Initiate PSSM aligner parameters.
static const int kClusterNodeId
FInterruptFn SetInterruptCallback(FInterruptFn fnptr, void *user_data=NULL)
Set a function callback to be invoked by multi aligner to allow interrupting alignment in progress.
vector< CSequence > m_InMSA2
Input alignment.
void Resize(size_t i, size_t j, T val=T())
resize this matrix, filling the empty cells with a known value
size_t GetRows() const
get the number of rows in this matrix
size_t GetCols() const
get the number of columns in this matrix
Class for representing protein sequences.
int GetLength() const
Get the length of the current sequence.
unsigned char GetLetter(int pos) const
Access the sequence letter at a specified position.
TFreqMatrix & GetFreqs()
Access the list of position frequencies associated with a sequence.
static void CompressSequences(vector< CSequence > &seq, vector< int > index_list)
Given a collection of sequences, remove all sequence positions where a subset of the sequences all co...
void PropagateGaps(const CNWAligner::TTranscript &transcript, CNWAligner::ETranscriptSymbol gap_choice)
Given an edit script, insert gaps into a sequence.
static const unsigned char kGapChar
The ncbistdaa code for a gap.
unsigned char * GetSequence()
Access the raw sequence data, in ncbistdaa format.
static void CreateMsa(const objects::CSeq_align &seq_align, objects::CScope &scope, vector< CSequence > &msa)
Create a vector of CSequence objects that represents the alignment in given Seq_align.
unsigned char GetPrintableLetter(int pos) const
Access the sequence letter at a specified position, and return an ASCII representation of that letter...
definition of a Culling tree
A wrapper for controlling access to the phylogenetic tree generated by CDistMethods.
void SetTree(TPhyTreeNode *tree)
Set tree.
static void PrintTree(const TPhyTreeNode *node, int level=0)
Debug routine to recursively print out a tree.
const TPhyTreeNode * GetTree() const
Access the current tree.
TPhyTreeNode * ReleaseTree()
Get the current tree and release ownership.
void ComputeTree(const CDistMethods::TMatrix &distances, bool use_fastme=false)
Compute a new tree.
static void ComputeDistMatrix(const vector< TKmerCounts > &counts, double(*fsim)(const TKmerCounts &, const TKmerCounts &), TDistMatrix &dmat)
Compute matrix of distances between given counts vectors.
static void ComputeCounts(const vector< CRef< objects::CSeq_loc > > &seqs, objects::CScope &scope, vector< TKmerCounts > &counts)
Create k-mer counts vectors for given sequences.
static void SetParams(unsigned kmer_len, unsigned alphabet_size)
Set default counts vector parameters.
iterator_bool insert(const value_type &val)
const_iterator begin() const
const_iterator find(const key_type &key) const
const_iterator end() const
static TPhyTreeNode * s_MakeTwoLeafTree(const CClusterer::CSingleCluster &ids, double distance)
Create phylogenetic tree for two sequences.
static void s_ScaleTreeEdges(TPhyTreeNode *node, double scale)
Scale all tree edges by given factor (recursive).
static void s_SetLeafIds(TPhyTreeNode *node, const CClusterer::CSingleCluster &ids)
Change ids of leaf nodes in a given tree to desired values (recursive).
static double s_FindNodeDistance(const TPhyTreeNode *node, int id, double dist_from_root)
Find distance from root for selected node (recursive).
static void s_RescaleTree(TPhyTreeNode *tree, int id, double dist)
Rescale tree so that node with given id has desired distance from root.
static void s_FindLeafDistances(TPhyTreeNode *tree, double dist_from_root, vector< double > &leaf_dists, vector< TPhyTreeNode * > &leaf_nodes, bool last_edge_only=false)
Compute length of the edge or distance from root for each leaf (recursive).
Interface for CMultiAligner.
CRef< objects::CBioTreeContainer > MakeBioTreeContainer(const TPhyTreeNode *tree)
Conversion from TPhyTreeNode to CBioTreeContainer.
void SetStartWg(TScore value)
TTranscript GetTranscript(bool reversed=true) const
void SetEndWs(TScore value)
virtual CNWAligner::TScore Run(void)
void SetScoreMatrix(const SNCBIPackedScoreMatrix *scoremat)
SNCBIFullScoreMatrix & GetMatrix()
void SetEndWg(TScore value)
vector< ETranscriptSymbol > TTranscript
void SetSequences(const char *seq1, size_t len1, const char *seq2, size_t len2, bool verify=true)
void SetEndSpaceFree(bool Left1, bool Right1, bool Left2, bool Right2)
void SetStartWs(TScore value)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
TErrCode GetErrCode(void) const
Get error code.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
const string & GetMsg(void) const
Get message string.
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty – not pointing to any object which means having a null value.
void Reset(void)
Reset reference object.
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
TNodeList::iterator TNodeList_I
TTreeType * DetachNode(TTreeType *subnode)
Remove the subtree from the tree without destroying it.
TNodeList_CI SubNodeBegin(void) const
Return first const iterator on subnode list.
TNodeList::const_iterator TNodeList_CI
void AddNode(TTreeType *subnode)
Add new subnode.
bool IsLeaf() const
Report whether this is a leaf node.
TNodeList_CI SubNodeEnd(void) const
Return last const iterator on subnode list.
const TValue & GetValue(void) const
Return node's value.
const TTreeType * GetParent(void) const
Get node's parent.
TTo GetTo(void) const
Get the To member data.
TFrom GetFrom(void) const
Get the From member data.
unsigned int
A callback function used to compare two keys in a database.
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::SIZE size
CSequnceHelper< CObject > CSequence
int strcmp(const char *str1, const char *str2)
#define ASSERT
macro for assert.
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
CTreeNode< CPhyNodeData > TPhyTreeNode
const SNCBIPackedScoreMatrix NCBISM_Pam30
const SNCBIPackedScoreMatrix NCBISM_Blosum62
const SNCBIPackedScoreMatrix NCBISM_Pam250
const SNCBIPackedScoreMatrix NCBISM_Blosum80
const SNCBIPackedScoreMatrix NCBISM_Pam70
const SNCBIPackedScoreMatrix NCBISM_Blosum45
The standard matrices.
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Structure to hold the Karlin-Altschul parameters.
Structure for listing tree leaves.
static Uint4 letter(char c)