43 #ifdef NCBI_COMPILER_MSVC
44 # define isfinite _finite
45 #elif defined(NCBI_OS_SOLARIS) && !defined(__builtin_isfinite) \
46 && (!defined(__GNUC__) || (__GNUC__ == 4 && __GNUC_MINOR < 5))
48 # define isfinite finite
57 throw invalid_argument(
"Matrix contained NaN or Inf");
67 for (
size_t i = 0;
i < frac_diff.
GetRows(); ++
i) {
68 for (
size_t j = 0; j < frac_diff.
GetCols(); ++j) {
69 result(
i, j) = -0.75 *
log(1 - (4.0 / 3.0) * frac_diff(
i, j));
80 for (
size_t i = 0;
i < frac_diff.
GetRows(); ++
i) {
81 for (
size_t j = 0; j < frac_diff.
GetCols(); ++j) {
93 for (
size_t i = 0;
i < frac_diff.
GetRows(); ++
i) {
94 for (
size_t j = 0; j < frac_diff.
GetCols(); ++j) {
96 - 0.2 * frac_diff(
i, j) * frac_diff(
i, j));
107 for (
size_t i = 0;
i < frac_diff.
GetRows(); ++
i) {
108 for (
size_t j = 0; j < frac_diff.
GetCols(); ++j) {
109 if (frac_diff(
i, j) >= 1.0) {
110 throw invalid_argument(
"Grishin distance can not be computed \
111 for sequences that are 100% different");
113 result(
i, j) = frac_diff(
i, j) * (2.0 - frac_diff(
i, j))
114 / (2 * (1.0 - frac_diff(
i, j)));
124 for (
size_t i = 0;
i < frac_diff.
GetRows(); ++
i) {
125 for (
size_t j = 0; j < frac_diff.
GetCols(); ++j) {
126 if (frac_diff(
i, j) >= 1.0) {
127 throw invalid_argument(
"Grishin distance can not be computed \
128 for sequences that are 100% different");
131 0.65 * (pow(1 - frac_diff(
i, j), -1.0 / 0.65) - 1.0);
138 const vector<string>& labels)
144 for (
unsigned int i = 0;
i < dist_mat.
GetRows(); ++
i) {
147 if (labels.empty()) {
150 new_node->
GetValue().SetLabel() = labels[
i];
155 int next_id = dist_mat.
GetRows();
159 for (
unsigned int n = dist_mat.
GetRows();
n > 2; --
n) {
165 it1 !=
tree->SubNodeEnd(); ++it1) {
166 i = (*it1)->GetValue().GetId();
169 it2 !=
tree->SubNodeEnd(); ++it2) {
173 j = (*it2)->GetValue().GetId();
182 it1 !=
tree->SubNodeEnd(); ++it1) {
185 for (; it2 !=
tree->SubNodeEnd(); ++it2) {
186 i = (*it1)->GetValue().GetId();
187 j = (*it2)->GetValue().GetId();
188 m = dmat(
i, j) - (
r[
i] +
r[j]) / (
n - 2);
198 i = (*neighbor1)->GetValue().GetId();
199 j = (*neighbor2)->GetValue().GetId();
202 double viu = dmat(
i, j) / 2 + (
r[
i] -
r[j]) / (2 * (
n - 2));
203 double vju = dmat(
i, j) - viu;
204 (*neighbor1)->GetValue().SetDist(viu);
205 (*neighbor2)->GetValue().SetDist(vju);
208 tree->AddNode(new_node);
211 it1 !=
tree->SubNodeEnd(); ++it1) {
212 int k = (*it1)->GetValue().GetId();
216 dmat(k, u) = dmat(u, k)
217 = (dmat(
i, k) + dmat(j, k) - dmat(
i, j)) / 2;
227 if ((*it1)->IsLeaf()) {
230 double d = dmat((*it1)->GetValue().GetId(), (*it2)->GetValue().GetId());
231 (*it2)->GetValue().SetDist(d);
232 (*it1)->AddNode(
tree->DetachNode(it2));
244 double distance = 0.0;
269 while (child !=
tree->SubNodeEnd()) {
270 if ((*child)->IsLeaf()) {
313 double new_distance = parent->
GetValue().GetDist();
314 parent->
GetValue().SetDist(distance);
315 distance = new_distance;
331 parent = next_parent;
367 typedef
char boolean;
370 int btype_in,
int wtype_in,
int ntype_in);
381 const vector<
string>& labels)
385 node->GetValue().SetId(
id);
386 if (!labels.empty()) {
387 node->GetValue().SetLabel(labels[
id]);
389 node->GetValue().SetLabel(me_node->label);
397 child_node->
GetValue().SetDist(me_node->leftEdge->distance);
401 child_node->
GetValue().SetDist(me_node->rightEdge->distance);
407 const vector<string>& labels) {
411 edge = me_tree->root->leftEdge;
414 node->
GetValue().SetDist(edge->distance);
418 if (!labels.empty()) {
419 node->
GetValue().SetLabel(labels[
id]);
421 node->
GetValue().SetLabel(me_tree->root->label);
432 const vector<string>& labels,
441 for (
unsigned int i = 0;
i < dist_mat.
GetRows(); ++
i) {
442 for (
unsigned int j = 0; j < dist_mat.
GetRows(); ++j) {
443 dfme[
i][j] = dist_mat(
i, j);
449 char **clabels =
new char*[dist_mat.
GetRows()];
450 vector<string> dummy_labels;
451 dummy_labels.resize(dist_mat.
GetRows());
452 for (
unsigned int i = 0;
i < dist_mat.
GetRows(); ++
i) {
454 clabels[
i] =
const_cast<char *
>(dummy_labels[
i].c_str());
459 btype, wtype, ntype);
489 for (
unsigned int pos = 0; pos < seq1.size(); ++pos) {
490 if (seq1[pos] ==
'-' || seq2[pos] ==
'-') {
494 if (seq1[pos] != seq2[pos]) {
499 return diff_count / (double) pos_count;
508 for (
unsigned int pos = 0; pos < seq1.size(); ++pos) {
509 if (seq1[pos] ==
'-' || seq2[pos] ==
'-') {
513 if (seq1[pos] == seq2[pos]) {
520 if (pos_count == 0) {
524 _ASSERT(same_count <= pos_count);
525 return same_count / (double) pos_count;
534 avec.SetGapChar(
'-');
535 avec.SetEndChar(
'-');
537 int nseqs = avec.GetNumRows();
538 result.Resize(nseqs, nseqs);
539 vector<string> seq(nseqs);
541 for (
int i = 0;
i < nseqs; ++
i) {
542 avec.GetWholeAlnSeqString(
i, seq[
i]);
545 for (
int i = 0;
i < nseqs; ++
i) {
547 for (
int j =
i + 1; j < nseqs; ++j) {
557 int parent_uid,
int& next_uid)
559 const int label_fid = 0;
560 const int dist_fid = 1;
564 int my_uid = next_uid++;
569 node->SetParent(parent_uid);
570 if (ptn->
GetValue().GetLabel() !=
"") {
572 node_feature->SetFeatureid(label_fid);
573 node_feature->SetValue(ptn->
GetValue().GetLabel());
574 node->SetFeatures().Set().push_back(node_feature);
578 node_feature->SetFeatureid(dist_fid);
579 node_feature->SetValue
581 node->SetFeatures().Set().push_back(node_feature);
584 btc->
SetNodes().Set().push_back(node);
597 const int label_fid = 0;
598 const int dist_fid = 1;
604 fdescr->SetId(label_fid);
605 fdescr->SetName(
"label");
606 btc->
SetFdict().Set().push_back(fdescr);
609 fdescr->SetId(dist_fid);
610 fdescr->SetName(
"dist");
611 btc->
SetFdict().Set().push_back(fdescr);
616 btc->
SetNodes().Set().front()->ResetParent();
622 const int label_fid = 0;
623 const int dist_fid = 1;
629 fdescr->SetId(label_fid);
630 fdescr->SetName(
"label");
631 btc->
SetFdict().Set().push_back(fdescr);
636 btc->
SetNodes().Set().front()->ResetParent();
638 bool at_least_one_node_has_distance =
false;
640 for (
auto it = nodes.begin(); it != nodes.end(); ++it) {
641 const auto& node = **it;
642 if (!node.IsSetFeatures() || !node.GetFeatures().CanGet()) {
645 const auto& features = node.GetFeatures().Get();
646 for (
auto it1 = features.begin(); it1 != features.end(); ++it1) {
647 const auto& feature = **it1;
648 if (feature.IsSetFeatureid() && (dist_fid == feature.GetFeatureid())) {
649 at_least_one_node_has_distance =
true;
653 if (at_least_one_node_has_distance) {
657 if (at_least_one_node_has_distance) {
659 fdescr->SetId(dist_fid);
660 fdescr->SetName(
"dist");
661 btc->
SetFdict().Set().push_back(fdescr);
669 if (!isfinite(*it) || *it < 0.0) {
User-defined methods of the data storage class.
const CDense_seg & GetDenseg(void) const
CScope & GetScope(void) const
static void ZeroNegativeBranches(TTree *node)
Sets negative lengths of branches of a tree to zero.
static TTree * RerootTree(TTree *tree, TTree *node=NULL)
Reroot tree, new root is placed in the middle of the edge specified by node.
static bool AllFinite(const TMatrix &mat)
Check a matrix for NaNs and Infs.
static TTree * NjTree(const TMatrix &dist_mat, const vector< string > &labels=vector< string >())
Compute a tree by neighbor joining; as per Hillis et al.
static double FractionIdentity(const string &seq1, const string &seq2)
Calculate pairwise fraction identity based on positions where both sequences have a base/residue.
static TTree * x_FindLargestEdge(TTree *tree, TTree *best_node)
Find node with the longest edge in the tree.
static void GrishinGeneralDist(const TMatrix &frac_diff, TMatrix &result)
Grishin's distance for protein sequences 1 - p = ln(1 + 2d) / 2d.
static void GrishinDist(const TMatrix &frac_diff, TMatrix &result)
Grishin's distance for protein sequences: 1 - p = (1 - e^(2*d)) / (2 * d) approximated with d = p(2 +...
static double Divergence(const string &seq1, const string &seq2)
Calculate pairwise fractions of non-identity.
static void PoissonDist(const TMatrix &frac_diff, TMatrix &result)
Simple distance calculation for protein sequences: d = -ln(1 - p).
static TTree * FastMeTree(const TMatrix &dist_mat, const vector< string > &labels=vector< string >(), EFastMePar btype=eOls, EFastMePar wtype=eOls, EFastMePar ntype=eBalanced)
Compute a tree using the fast minimum evolution algorithm.
static void KimuraDist(const TMatrix &frac_diff, TMatrix &result)
Kimura's distance for protein sequences: d = -ln(1 - p - 0.2p^2).
static void JukesCantorDist(const TMatrix &frac_diff, TMatrix &result)
Jukes-Cantor distance calculation for DNA sequences: d = -3/4 ln(1 - (4/3)p).
void Resize(size_t i, size_t j, T val=T())
resize this matrix, filling the empty cells with a known value
TData & GetData()
retrieve the data associated with this matrix
size_t GetRows() const
get the number of rows in this matrix
size_t GetCols() const
get the number of columns in this matrix
definition of a Culling tree
void freeMatrix(double **D, int size)
static void s_AddNodeToBtc(CRef< CBioTreeContainer > btc, const TPhyTreeNode *ptn, int parent_uid, int &next_uid)
Recursive function for adding TPhyTreeNodes to BioTreeContainer.
static void s_AddFastMeSubtree(fastme::meNode *me_node, CDistMethods::TTree *node, const vector< string > &labels)
double ** initDoubleMatrix(int d)
CRef< CBioTreeContainer > MakeDistanceSensitiveBioTreeContainer(const TPhyTreeNode *tree)
Conversion from TPhyTreeNode to CBioTreeContainer, potentially without dist feature key.
void s_ThrowIfNotAllFinite(const CDistMethods::TMatrix &mat)
static CDistMethods::TTree * s_ConvertFastMeTree(fastme::meTree *me_tree, const vector< string > &labels)
CRef< CBioTreeContainer > MakeBioTreeContainer(const TPhyTreeNode *tree)
Conversion from TPhyTreeNode to CBioTreeContainer.
meTree * fastme_run(double **D_in, int numSpecies_in, char **labels, int btype_in, int wtype_in, int ntype_in)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
TNodeList::iterator TNodeList_I
TTreeType * DetachNode(TTreeType *subnode)
Remove the subtree from the tree without destroying it.
TNodeList_CI SubNodeBegin(void) const
Return first const iterator on subnode list.
TNodeList::const_iterator TNodeList_CI
void AddNode(TTreeType *subnode)
Add new subnode.
bool IsLeaf() const
Report whether this is a leaf node.
TNodeList_CI SubNodeEnd(void) const
Return last const iterator on subnode list.
const TValue & GetValue(void) const
Return node's value.
const TTreeType * GetParent(void) const
Get node's parent.
void SetNodes(TNodes &value)
Assign a value to Nodes data member.
void SetFdict(TFdict &value)
Assign a value to Fdict data member.
const Tdata & Get(void) const
Get the member data.
const TNodes & GetNodes(void) const
Get the Nodes member data.
const struct ncbi::grid::netcache::search::fields::SIZE size
Floating-point support routines.
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)