NCBI C++ ToolKit
phytree_calc.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: phytree_calc.cpp 87244 2019-08-12 12:55:34Z ucko $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Irena Zaretskaya, Greg Boratyn
27  *
28  */
29 
30 #include <ncbi_pch.hpp>
31 #include <corelib/ncbistl.hpp>
32 #include <corelib/ncbifloat.h>
33 
35 #include <objects/seq/Seqdesc.hpp>
37 #include <objmgr/util/sequence.hpp>
38 #include <objmgr/align_ci.hpp>
39 
41 
43 
44 #include <math.h>
45 
46 #ifdef NCBI_COMPILER_MSVC
47 # define isfinite _finite
48 #elif defined(NCBI_OS_SOLARIS) && !defined(__builtin_isfinite) \
49  && (!defined(__GNUC__) || (__GNUC__ == 4 && __GNUC_MINOR < 5))
50 # undef isfinite
51 # define isfinite finite
52 #endif
53 
56 
57 
59  : m_NumElements(num_elements), m_Diagnol(0.0)
60 {
61  if (num_elements > 0) {
62  m_Distances.resize(num_elements * num_elements - num_elements, -1.0);
63  }
64 }
65 
66 void CPhyTreeCalc::CDistMatrix::Resize(int num_elements)
67 {
68  m_NumElements = num_elements;
69  if (num_elements > 0) {
70  m_Distances.resize(num_elements * num_elements - num_elements);
71  }
72 }
73 
74 const double& CPhyTreeCalc::CDistMatrix::operator()(int i, int j) const
75 {
76  if (i >= m_NumElements || j >= m_NumElements || i < 0 || j < 0) {
77  NCBI_THROW(CPhyTreeCalcException, eDistMatrixError,
78  "Distance matrix index out of bounds");
79  }
80 
81  if (i == j) {
82  return m_Diagnol;
83  }
84 
85  if (i < j) {
86  swap(i, j);
87  }
88 
89  int index = (i*i - i) / 2 + j;
90  _ASSERT(index < (int)m_Distances.size());
91 
92  return m_Distances[index];
93 }
94 
96 {
97  if (i >= m_NumElements || j >= m_NumElements || i < 0 || j < 0) {
98  NCBI_THROW(CPhyTreeCalcException, eDistMatrixError,
99  "Distance matrix index out of bounds");
100  }
101 
102  if (i == j) {
103  NCBI_THROW(CPhyTreeCalcException, eDistMatrixError,
104  "Distance matrix diagnol elements cannot be set");
105  }
106 
107  if (i < j) {
108  swap(i, j);
109  }
110 
111  int index = (i*i - i) / 2 + j;
112  _ASSERT(index < (int)m_Distances.size());
113 
114  return m_Distances[index];
115 }
116 
117 
119  CRef<CScope> scope)
120  : m_Scope(scope)
121 {
122  x_Init();
123  x_InitAlignDS(seq_aln);
124 }
125 
126 
128  CRef<CScope> scope)
129  : m_Scope(scope)
130 {
131  x_Init();
132  x_InitAlignDS(seq_align_set);
133 }
134 
135 
136 void CPhyTreeCalc::SetQuery(const CSeq_id& seqid)
137 {
139  CSeq_id_Handle query_handle = CSeq_id_Handle::GetHandle(seqid);
140  size_t i;
141  // for each sequence id in the alignment
142  for (i=0;i < ids.size();i++) {
143  CSeq_id_Handle id_handle = CSeq_id_Handle::GetHandle(*ids[i]);
144 
145  // check whether seqid and id_handle point to the same sequence
146  if (m_Scope->IsSameBioseq(query_handle, id_handle,
148  m_QueryIdx = i;
149  break;
150  }
151  }
152 
153  // throw if no sequence is found
154  if ((int)i != m_QueryIdx) {
155  NCBI_THROW(CPhyTreeCalcException, eInvalidOptions,
156  (string)"Sequence id " + seqid.AsFastaString()
157  + " not found in alignment");
158  }
159 }
160 
162 {
163  if (!m_Tree) {
165  "Tree was not constructed");
166  }
167 
169 
170  return btc;
171 }
172 
174 {
175  CRef<CDense_seg> denseg(new CDense_seg());
176  denseg->Assign(m_AlignDataSource->GetDenseg());
177 
178  CRef<CSeq_align> seqalign(new CSeq_align());
180  seqalign->SetSegs().SetDenseg(*denseg);
181  seqalign->SetDim(denseg->GetDim());
182 
183  return seqalign;
184 }
185 
186 const vector< CRef<CSeq_id> >& CPhyTreeCalc::GetSeqIds(void) const
187 {
188  return m_AlignDataSource->GetDenseg().GetIds();
189 }
190 
191 // Make sure that matrix has finite non-negative values
193 {
194  for (int i=0;i < mat.GetNumElements() - 1;i++) {
195  for (int j=i+1;j < mat.GetNumElements();j++) {
196  double val = mat(i, j);
197  if (!isfinite(val) || val < 0.0) {
198  return false;
199  }
200  }
201  }
202  return true;
203 }
204 
205 // Calculate divergence matrix, considering max divergence constraint
206 bool CPhyTreeCalc::x_CalcDivergenceMatrix(vector<int>& included)
207 {
208  int num_seqs = m_AlignDataSource->GetNumRows();
209 
210  bm::bvector<> bitvector; // which seqeunces will be included in phylo tree
211  list<SLink> links; // for storing dissimilariues between sequences
212  vector<string> sequences(num_seqs); // buffer for AA sequences
213 
214  // query sequence is always included
215  const int query_idx = m_QueryIdx;
216 
217  m_AlignDataSource->GetWholeAlnSeqString(query_idx, sequences[query_idx]);
218  const string& query_seq = sequences[query_idx];
219 
220  bitvector.set(query_idx);
221 
222  // Compute distances between query and each sequence
223  // and include only sequences that are similar enough to the query
224 
225  // for each sequence
226  for (int i=0;i < num_seqs;i++) {
227 
228  // except for the query
229  if (i == query_idx) {
230  continue;
231  }
232 
233  // find divergence between query and and sequence
235  double dist = 1.0 - CDistMethods::FractionIdentity(query_seq,
236  sequences[i]);
237  _ASSERT(dist > -1e-10);
238 
239  // if divergence is finite and smaller than cutoff save divergence
240  // and mark sequence as included
241  if (dist < m_MaxDivergence) {
242  links.push_back(SLink(query_idx, i, dist));
243  bitvector.set(i);
244  }
245  else if (!m_CalcSegInfo) {
246  // otherwise purge the sequence
247  sequences[i].erase();
248  }
249  }
250 
251  // calculate segment positions, if requested
252  if (m_CalcSegInfo) {
253  x_CalcAlnSegInfo(sequences, m_SegInfo);
254  }
255 
256  // Compute distances between incuded sequences
257  list<SLink>::const_iterator it1 = links.begin();
258 
259  // for each included sequence 1
260  for (; it1 != links.end() && it1->index1 == query_idx; ++it1) {
261 
262  // check if still included
263  if (!bitvector.test(it1->index2)) {
264  continue;
265  }
266  const string& seqi = sequences[it1->index2];
267  _ASSERT(!seqi.empty());
268 
269  list<SLink>::const_iterator it2(it1);
270  it2++;
271 
272  // for each sequence 2
273  for (; it2 != links.end() && it2->index1 == query_idx
274  && bitvector.test(it1->index2); ++it2) {
275 
276  // check if still included
277  if (!bitvector.test(it2->index2)) {
278  continue;
279  }
280  const string& seqj = sequences[it2->index2];
281  _ASSERT(!seqj.empty());
282 
283  // compute divergence
284  double dist = 1.0 - CDistMethods::FractionIdentity(seqi, seqj);
285  _ASSERT(dist > -1e-10);
286 
287  // if divergence is finite and smaller than cutoff save divergence
288  if (dist < m_MaxDivergence) {
289  links.push_back(SLink(it2->index2, it1->index2, dist));
290  }
291  else {
292 
293  //TO DO: This should be changed to which divergence
294  // otherwise exclude sequence less similar to the query
295  int score_1 = m_AlignDataSource->CalculateScore(query_idx,
296  it1->index2);
297  int score_2 = m_AlignDataSource->CalculateScore(query_idx,
298  it2->index2);
299 
300  bitvector[score_1 > score_2 ? it2->index2 : it1->index2]
301  = false;
302  }
303  }
304  }
305 
307  bitvector.build_rs_index(&rs_index);
308 
309  // Get indices of included sequences
310  bm::bvector<>::counted_enumerator en = bitvector.first();
311  for (; en.valid(); ++en) {
312  included.push_back((int)*en);
313  }
314 
315  // Create divergence matrix and set values
316  m_DivergenceMatrix.Resize(bitvector.count());
317  // for each saved divergence
318  ITERATE (list<SLink>, it, links) {
319  // if both sequences included
320  if (bitvector.test(it->index1) && bitvector.test(it->index2)) {
321 
322  // set divergence value using bit vector storage encoding
323 
324  int index1 = (it->index1 == 0 ? 0 :
325  bitvector.count_range(0, it->index1 - 1, rs_index));
326 
327  int index2 = (it->index2 == 0 ? 0 :
328  bitvector.count_range(0, it->index2 - 1, rs_index));
329 
330  m_DivergenceMatrix(index1, index2) = it->distance;
331  }
332  }
334  NCBI_THROW(CPhyTreeCalcException, eDistMatrixError, "The calculated "
335  "divergence matrix contains invalid or inifinite values");
336  }
337 
338  return included.size() > 1;
339 }
340 
341 static void s_RecordSeqId(int index,
342  const CAlnVec& align_data_source,
343  vector<string>& seqids)
344 {
345  CSeq_id_Handle seq_id_handle
346  = sequence::GetId(align_data_source.GetBioseqHandle(index),
348 
349  CConstRef<CSeq_id> seq_id = seq_id_handle.GetSeqId();
350  string seq_id_str = "";
351  (*seq_id).GetLabel(&seq_id_str);
352 
353  seqids.push_back(seq_id_str);
354 }
355 
356 void CPhyTreeCalc::x_CreateValidAlign(const vector<int>& used_indices)
357 {
358  if ((int)used_indices.size() < m_AlignDataSource->GetNumRows()) {
359 
360  // collect ids of removed sequences
361  int index = 0;
362  ITERATE (vector<int>, it, used_indices) {
363  for (; index < *it; index++) {
365  m_RemovedSeqIndeces.push_back(index);
366  }
367  index++;
368  }
369  for (; index < m_AlignDataSource->GetNumRows();index++) {
371  m_RemovedSeqIndeces.push_back(index);
372  }
373  _ASSERT(used_indices.size() + m_RemovedSeqIds.size()
374  == (size_t)m_AlignDataSource->GetNumRows());
375 
376  _ASSERT(m_RemovedSeqIndeces.size() == m_RemovedSeqIds.size());
377 
378  CRef<CDense_seg> new_denseg
379  = m_AlignDataSource->GetDenseg().ExtractRows(used_indices);
380 
381  // if sequence labels are specified, remove labels correponding to
382  // unused sequences
383  if (!m_Labels.empty()) {
384  vector<string> new_labels;
385  ITERATE (vector<int>, it, used_indices) {
386  new_labels.push_back(m_Labels[*it]);
387  }
388  m_Labels.swap(new_labels);
389  _ASSERT((int)m_Labels.size() == m_AlignDataSource->GetNumRows());
390  }
391 
392  m_AlignDataSource.Reset(new CAlnVec(*new_denseg, *m_Scope));
393  }
394 }
395 
397 {
399 
400  // create dist matrix data structure required by CDistMethos
403  0.0);
404 
405  for (int i=0;i < m_DivergenceMatrix.GetNumElements() - 1;i++) {
406  for (int j=i+1; j < m_DivergenceMatrix.GetNumElements();j++) {
407  pmat(i, j) = pmat(j, i) = m_DivergenceMatrix(i, j);
408  }
409  }
410 
411  // prepare structure for string results of distance computation
414  0.0);
415 
416  // compute distances
417  switch (m_DistMethod) {
418  case eJukesCantor :
420  break;
421 
422  case ePoisson :
424  break;
425 
426  case eKimura :
428  break;
429 
430  case eGrishin :
432  break;
433 
434  case eGrishinGeneral :
436  break;
437 
438  default:
439  NCBI_THROW(CPhyTreeCalcException, eInvalidOptions,
440  "Invalid distance calculation method");
441  }
442 
443  // store distances in memory efficient data structure
445  for (int i=0;i < m_DistMatrix.GetNumElements() - 1;i++) {
446  for (int j=i+1;j < m_DistMatrix.GetNumElements();j++) {
447  m_DistMatrix(i, j) = m_FullDistMatrix(i, j);
448  }
449  }
451  NCBI_THROW(CPhyTreeCalcException, eDistMatrixError, "The calculated "
452  "distance matrix contains invalid or infinite values");
453  }
454 }
455 
456 // Compute phylogenetic tree
457 void CPhyTreeCalc::x_ComputeTree(bool correct)
458 {
461 
462  // if labels not provided, use sequence indeces as labels
463  if (m_Labels.empty()) {
464  for (int i=0;i < m_AlignDataSource->GetNumRows();i++) {
465  m_Labels.push_back(NStr::IntToString(i));
466  }
467  }
468  _ASSERT((int)m_Labels.size() == m_AlignDataSource->GetNumRows());
469 
470  m_Tree = NULL;
471  switch (m_TreeMethod) {
472  case eNJ :
474  break;
475 
476  case eFastME :
478  break;
479 
480  default:
481  NCBI_THROW(CPhyTreeCalcException, eInvalidOptions,
482  "Invalid tree reconstruction method");
483  };
484 
485  if (!m_Tree) {
486  NCBI_THROW(CPhyTreeCalcException, eTreeComputationProblem,
487  "Tree was not created");
488  }
489 
490  // find the longest edge and make one of its nodes tree root
491  m_Tree->GetValue().SetDist(0.0);
493  _ASSERT(m_Tree);
494 
495  // put the tree root in the middle of the longest edge
496  int num_children = 0;
497  double cumulative_length = 0.0;
499  it != m_Tree->SubNodeEnd(); ++it) {
500 
501  cumulative_length += (*it)->GetValue().GetDist();
502  num_children++;
503  }
504 
505  double new_len = cumulative_length / (double)num_children;
507  it != m_Tree->SubNodeEnd(); ++it) {
508 
509  (*it)->GetValue().SetDist(new_len);
510  }
511 
512  // release memory used by full distance matrix
513  m_FullDistMatrix.Resize(1, 1);
514 
515  if (correct) {
517  }
518 }
519 
521 {
522  _ASSERT(node);
523  if (!node->IsLeaf()) {
524  for (TPhyTreeNode::TNodeList_CI it = node->SubNodeBegin();
525  it != node->SubNodeEnd(); ++it) {
527  }
528  }
529 
530  if (node->GetValue().IsSetDist()) {
531  double dist = node->GetValue().GetDist();
532  if (!isfinite(dist) || dist < 0.0) {
533  node->GetValue().SetDist(0.0);
534  }
535  }
536 }
537 
539 {
540  if (!m_Labels.empty()
541  && (int)m_Labels.size() != m_AlignDataSource->GetNumRows()) {
542 
543  NCBI_THROW(CPhyTreeCalcException, eInvalidOptions, "Number of labels"
544  " is not the same as number of sequences");
545  }
546 
547  if (m_MaxDivergence < 0.0) {
548  NCBI_THROW(CPhyTreeCalcException, eInvalidOptions, "Maximum "
549  "divergence must be positive");
550  }
551 
552 
553  if (m_DistMethod == eKimura && m_MaxDivergence > 0.85) {
554  NCBI_THROW(CPhyTreeCalcException, eInvalidOptions, "Maximum "
555  "divergence must be smaller than 0.85 if Kimura distance is"
556  " selected");
557  }
558 
559  vector<int> used_inds;
560 
561  bool valid;
562  valid = x_CalcDivergenceMatrix(used_inds);
563 
564  if (valid) {
565 
566  if ((int)used_inds.size() < m_AlignDataSource->GetNumRows()) {
567  int initial_num_seqs = m_AlignDataSource->GetNumRows();
568  x_CreateValidAlign(used_inds);
569  m_Messages.push_back(NStr::IntToString(initial_num_seqs
570  - (int)used_inds.size())
571  + " sequences were discarded due to"
572  " divergence that exceeds maximum allowed.");
573  }
575  x_ComputeTree();
576 
577  }
578  else {
579  m_Messages.push_back("Sequence dissimilarity exceeds maximum"
580  " divergence.");
581  }
582 
583  return valid;
584 }
585 
586 // Init parameters to default values
588 {
589  m_QueryIdx = 0;
592  m_MaxDivergence = 0.85;
593  m_Tree = NULL;
594  m_CalcSegInfo = false;
595 }
596 
597 
598 
599 bool CPhyTreeCalc::x_InitAlignDS(const CSeq_align_set& seq_align_set)
600 {
601  bool success = true;
602 
603  if(seq_align_set.Get().size() == 1) {
604  x_InitAlignDS(**(seq_align_set.Get().begin()));
605  }
606  else if(seq_align_set.Get().size() > 1) {
607  CAlnMix mix;
608  ITERATE (CSeq_annot::TData::TAlign, iter, seq_align_set.Get()) {
609 
610  CRef<CSeq_align> seq_align = *iter;
611  mix.Add(**iter);
612  }
613 
614  CAlnMix::TMergeFlags merge_flags;
616  mix.Merge(merge_flags);
617 
618  x_InitAlignDS(mix.GetSeqAlign());
619  }
620  else {
621  success = false;
622  NCBI_THROW(CPhyTreeCalcException, eInvalidOptions,
623  "Empty seqalign provided");
624  }
625  return success;
626 }
627 
628 
629 
631 {
633  *m_Scope));
636 }
637 
638 
639 void CPhyTreeCalc::x_CalcAlnSegInfo(const vector<string>& aln,
640  CPhyTreeCalc::TSegInfo& seg_info)
641 {
642  const char kGap = '-';
643  seg_info.clear();
644  seg_info.resize(aln.size());
645 
646  // find query range in the msa
647  const string& query = aln[m_QueryIdx];
648  size_t query_start = 0;
649  size_t query_end = query.length() - 1;
650  while (query_start < query.length() && query[query_start] == kGap) {
651  query_start++;
652  }
653  while (query_end > 0 && query[query_end] == kGap) {
654  query_end--;
655  }
656 
657  // for each sequence
658  for (size_t i=0;i < aln.size();i++) {
659  const string& sequence = aln[i];
660  vector<TRange>& segs = seg_info[i];
661  size_t p = 0;
662  _ASSERT(query_end < sequence.length());
663 
664  // find the aligned sequence length
665  int seq_len = 0;
666  for (size_t k=query_start;k <= query_end;k++) {
667  if (sequence[k] != kGap) {
668  seq_len++;
669  }
670  }
671  // minimum gap length
672  const int kMinSplit = max(seq_len / 20, 4);
673 
674  while (p < sequence.length()) {
675 
676  // find start of a segment aligned to the query
677  while (p < sequence.length() && (p < query_start ||
678  sequence[p] == kGap)) {
679  p++;
680  }
681 
682  int from = p;
683  int to = p;
684  int gaps = 0;
685  // find segment end, treating short gaps as part of the segment
686  while (p < sequence.length() && gaps < kMinSplit && p <= query_end) {
687  int residues = 0;
688  to = p;
689  while (p < sequence.length() && p <= query_end &&
690  sequence[p] != kGap) {
691  p++;
692  to++;
693  residues++;
694  }
695  // disregard short gaps betweem segments of at least 4 residues
696  if (residues > 4) {
697  gaps = 0;
698  }
699  while (p < sequence.length() && p <= query_end &&
700  sequence[p] == kGap) {
701  gaps++;
702  p++;
703  }
704  }
705  // this is the segment
706  TRange seg(from, to);
707  // if the segment is long enough, report it, otherwise treat it as
708  // part of the gap
709  if (seg.GetLength() > 4 || segs.empty()) {
710  segs.push_back(seg);
711  gaps = 0;
712  }
713 
714  // skip positions not aligned to the query
715  if (p > query_end) {
716  break;
717  }
718  }
719  }
720 }
721 
static CRef< CScope > m_Scope
TDim GetNumRows(void) const
Definition: alnmap.hpp:517
const CDense_seg & GetDenseg(void) const
Definition: alnmap.hpp:475
int TMergeFlags
Definition: alnmix.hpp:114
void Add(const CDense_seg &ds, TAddFlags flags=0)
Definition: alnmix.cpp:120
@ fGapJoin
Definition: alnmix.hpp:103
@ fTruncateOverlaps
Definition: alnmix.hpp:101
void Merge(TMergeFlags flags=0)
Definition: alnmix.cpp:273
const CSeq_align & GetSeqAlign(void) const
Definition: alnmix.cpp:302
const CBioseq_Handle & GetBioseqHandle(TNumrow row) const
Definition: alnvec.cpp:86
void SetEndChar(TResidue gap_char)
Definition: alnvec.hpp:368
void SetGapChar(TResidue gap_char)
Definition: alnvec.hpp:339
string & GetWholeAlnSeqString(TNumrow row, string &buffer, TSeqPosList *insert_aln_starts=0, TSeqPosList *insert_starts=0, TSeqPosList *insert_lens=0, unsigned int scrn_width=0, TSeqPosList *scrn_lefts=0, TSeqPosList *scrn_rights=0) const
Definition: alnvec.cpp:199
int CalculateScore(TNumrow row1, TNumrow row2) const
Definition: alnvec.cpp:926
CRef< CDense_seg > ExtractRows(const vector< TDim > &rows) const
Extract specified rows of the alignment, in specified order.
Definition: Dense_seg.cpp:837
void Assign(const CSerialObject &obj, ESerialRecursionMode how=eRecursive)
overloaded Assign()
Definition: Dense_seg.cpp:62
static TTree * RerootTree(TTree *tree, TTree *node=NULL)
Reroot tree, new root is placed in the middle of the edge specified by node.
static TTree * NjTree(const TMatrix &dist_mat, const vector< string > &labels=vector< string >())
Compute a tree by neighbor joining; as per Hillis et al.
static double FractionIdentity(const string &seq1, const string &seq2)
Calculate pairwise fraction identity based on positions where both sequences have a base/residue.
static void GrishinGeneralDist(const TMatrix &frac_diff, TMatrix &result)
Grishin's distance for protein sequences 1 - p = ln(1 + 2d) / 2d.
static void GrishinDist(const TMatrix &frac_diff, TMatrix &result)
Grishin's distance for protein sequences: 1 - p = (1 - e^(2*d)) / (2 * d) approximated with d = p(2 +...
static void PoissonDist(const TMatrix &frac_diff, TMatrix &result)
Simple distance calculation for protein sequences: d = -ln(1 - p).
static TTree * FastMeTree(const TMatrix &dist_mat, const vector< string > &labels=vector< string >(), EFastMePar btype=eOls, EFastMePar wtype=eOls, EFastMePar ntype=eBalanced)
Compute a tree using the fast minimum evolution algorithm.
static void KimuraDist(const TMatrix &frac_diff, TMatrix &result)
Kimura's distance for protein sequences: d = -ln(1 - p - 0.2p^2).
static void JukesCantorDist(const TMatrix &frac_diff, TMatrix &result)
Jukes-Cantor distance calculation for DNA sequences: d = -3/4 ln(1 - (4/3)p).
void Resize(size_t i, size_t j, T val=T())
resize this matrix, filling the empty cells with a known value
Definition: matrix.hpp:390
size_t GetRows() const
get the number of rows in this matrix
Definition: matrix.hpp:298
Guide tree calc exceptions.
Distance matrix (square, symmetric with zeros on diagnol)
CDistMatrix(int num_elements=0)
Constructor.
const double & operator()(int i, int j) const
Get distance value.
bool Empty(void) const
Is matrix size zero.
void Resize(int num_elements)
Resize matrix.
vector< double > m_Distances
Storage for distance values.
int GetNumElements(void) const
Get number of rows/columns.
CDistMatrix m_DivergenceMatrix
Matrix of percent identities based distances.
CRef< CSeq_align > GetSeqAlign(void) const
Get seq_align that corresponds to current tree.
CRef< CBioTreeContainer > GetSerialTree(void) const
Get serial tree.
vector< string > m_RemovedSeqIds
Sequences that are not included in the tree.
bool m_CalcSegInfo
Calculate segment positions.
vector< int > m_RemovedSeqIndeces
Indeces of sequences that are not included in the tree.
bool CalcBioTree(void)
Compute bio tree for the current alignment in a black box manner.
CDistMethods::TMatrix m_FullDistMatrix
Full distance matrix, this data structure is required by CDistMethods functions.
TPhyTreeNode * m_Tree
Computed tree.
void x_CalcDistMatrix(void)
Compute distance as evolutionary corrected dissimilarity.
const vector< CRef< CSeq_id > > & GetSeqIds(void) const
Get seq-ids of sequences used in tree construction.
void x_Init(void)
Initialize class attributes.
@ eFastME
Fast Minumum Evolution.
@ eNJ
Neighbor Joining.
int m_QueryIdx
Index of query sequence in the input alignment.
bool x_CalcDivergenceMatrix(vector< int > &used_indices)
Compute divergence matrix and find sequences to exclude from tree reconstruction.
CPhyTreeCalc(const CSeq_align &seq_aln, CRef< CScope > scope)
Create CPhyTreeCalc from Seq-align.
void x_CalcAlnSegInfo(const vector< string > &aln, TSegInfo &seg_info)
Calculate the alignment segemnts summary.
EDistMethod m_DistMethod
Method of calculating evolutionary distance correction.
ETreeMethod m_TreeMethod
Method of calculating tree.
vector< string > m_Messages
Error/warning messages.
CDistMatrix m_DistMatrix
Matrix of evolutionary distance.
CRef< CScope > m_Scope
Scope.
double m_MaxDivergence
Maximum allowed divergence between sequences.
vector< string > m_Labels
Labels for tree leaves.
void SetQuery(int index)
Set query sequence by index in alignment.
TSegInfo m_SegInfo
Positions of segments in mulitple or query anchored alignment.
CRef< CAlnVec > m_AlignDataSource
Alignment data source.
void x_InitAlignDS(const CSeq_align &seq_aln)
Initialize alignment data source.
void x_CorrectBranchLengths(TPhyTreeNode *node)
Change non-finite and negative tree branch lengths to zeros.
vector< vector< TRange > > TSegInfo
void x_CreateValidAlign(const vector< int > &used_indices)
Create alignment only for sequences included in tree computation.
void x_ComputeTree(bool correct=true)
Compute phylogenetic tree.
definition of a Culling tree
Definition: ncbi_tree.hpp:100
Constant iterator designed to enumerate "ON" bits counted_enumerator keeps bitcount,...
Definition: bm.h:734
bool valid() const noexcept
Checks if iterator is still valid.
Definition: bm.h:283
bool test(size_type n) const noexcept
returns true if bit n is set and false is bit n is 0.
Definition: bm.h:1502
bvector< Alloc > & set(size_type n, bool val=true)
Sets bit n if val is true, clears bit n if val is false.
Definition: bm.h:4188
size_type count_range(size_type left, size_type right, const rs_index_type &rs_idx) const noexcept
Returns count of 1 bits in the given range [left..right] Uses rank-select index to accelerate the sea...
Definition: bm.h:3516
enumerator first() const
Returns enumerator pointing on the first non-zero bit.
Definition: bm.h:1871
void build_rs_index(rs_index_type *rs_idx, bvector< Alloc > *bv_blocks=0) const
compute running total of all blocks in bit vector (rank-select index)
Definition: bm.h:2501
size_type count() const noexcept
population count (count of ON bits)
Definition: bm.h:2401
Rank-Select acceleration index.
Definition: bmrs.h:41
CRef< objects::CBioTreeContainer > MakeBioTreeContainer(const TPhyTreeNode *tree)
Conversion from TPhyTreeNode to CBioTreeContainer.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string AsFastaString(void) const
Definition: Seq_id.cpp:2265
CConstRef< CSeq_id > GetSeqId(void) const
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
@ eGetId_Best
return the "best" gi (uses FindBestScore(), with CSeq_id::CalculateScore() as the score function
Definition: sequence.hpp:101
bool IsSameBioseq(const CSeq_id_Handle &id1, const CSeq_id_Handle &id2, EGetBioseqFlag get_flag)
Check if two seq-ids are resolved to the same Bioseq.
Definition: scope.cpp:168
@ eGetBioseq_All
Search bioseq, load if not loaded yet.
Definition: scope.hpp:128
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
position_type GetLength(void) const
Definition: range.hpp:158
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
TNodeList::iterator TNodeList_I
Definition: ncbi_tree.hpp:109
TNodeList_CI SubNodeBegin(void) const
Return first const iterator on subnode list.
Definition: ncbi_tree.hpp:160
TNodeList::const_iterator TNodeList_CI
Definition: ncbi_tree.hpp:110
bool IsLeaf() const
Report whether this is a leaf node.
Definition: ncbi_tree.hpp:296
TNodeList_CI SubNodeEnd(void) const
Return last const iterator on subnode list.
Definition: ncbi_tree.hpp:166
const TValue & GetValue(void) const
Return node's value.
Definition: ncbi_tree.hpp:184
const TDenseg & GetDenseg(void) const
Get the variant data.
Definition: Seq_align_.cpp:153
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
void SetDim(TDim value)
Assign a value to Dim data member.
Definition: Seq_align_.hpp:865
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_align_.hpp:818
vector< CRef< CSeq_id > > TIds
Definition: Dense_seg_.hpp:106
TDim GetDim(void) const
Get the Dim member data.
Definition: Dense_seg_.hpp:421
const TIds & GetIds(void) const
Get the Ids member data.
Definition: Dense_seg_.hpp:505
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
list< CRef< CSeq_align > > TAlign
Definition: Seq_annot_.hpp:194
int i
Compressed bitset (entry point to bm.h)
Floating-point support routines.
The NCBI C++/STL use hints.
T max(T x_, T y_)
bool s_ValidateMatrix(const CPhyTreeCalc::CDistMatrix &mat)
USING_SCOPE(objects)
static void s_RecordSeqId(int index, const CAlnVec &align_data_source, vector< string > &seqids)
static string query
#define _ASSERT
Modified on Tue Feb 27 05:53:36 2024 by modify_doxy.py rev. 669887