NCBI C++ ToolKit
cuSeqTreeNj.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cuSeqTreeNj.cpp 41296 2009-03-12 19:39:21Z lanczyck $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Chris Lanczycki
27 *
28 * File Description: cdt_treealg.cpp
29 *
30 * Implementation of the Neighbor Joining algorithm of Saitou and Nei.
31 *
32 */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbi_limits.h>
36 
38 BEGIN_SCOPE(cd_utils)
39 
40 const int NJ_TreeAlgorithm::USED_ROW = -1;
41 const int NJ_TreeAlgorithm::BAD_INDEX = -1;
42 const double NJ_TreeAlgorithm::REPLACE_NEG_DIST = 0.001;
46 
48 {
49  return TREE_ALGORITHM_NAMES[algorithm];
50 }
51 
53  : TreeAlgorithm(MY_ROOTEDNESS, MY_TREE_METHOD) {
54 
55  m_dm = NULL;
57 }
58 
60  : TreeAlgorithm(MY_ROOTEDNESS, MY_TREE_METHOD) {
61 
62  m_dm = dm;
63  //m_cdd = cdd;
65 }
66 
68  m_dm = dm;
69  /*
70  if (dm->isSetCDD()) {
71  m_cdd = const_cast<CCd*>(dm->GetCdd()); // ATTN: const_cast
72  }*/
74 }
75 /*
76 void NJ_TreeAlgorithm::SetCDD(CCd* cdd) {
77  m_cdd = cdd;
78  //fillSeqNames(m_tree, m_cdd);
79 }*/
80 
81 
82 // Initialize a list of nodes that will be placed in a tree structure.
83 // (For a tree with N leafs, there are 2N-3 total nodes, of which N-3 are internal.)
84 // The node will be identified by its one-based row indices in the distance matrix
85 // (assumed to correspond to rows an accompanying alignment). Nodes with 'rowID'
86 // > m_nseqs serve to hold composite nodes created during the NJ procedure.
87 //
88 // Node zero serves as the "hub" internal node that acts as the central node of the
89 // initial star configuration; it's value is 0 to set it apart from the other
90 // internal nodes. The final three nodes in the NJ algorithm get joined at the hub.
91 //
92 // NOTE: If the distance matrix row does not map directly to the row indices of an
93 // alignment, the caller is responsible for re-mapping rowID to the correct
94 // alignment row number.
95 //
96 // NOTE: Names are initially assigned to be the row ID as a string.
97 
99 
100  // m_tree is initialized by base-class constructor
101 
102  int nnodes;
103  SeqItem* sip;
104 
105  if (m_dm && m_nseqs>0) {
106  for (int i=0; i<(2*m_nseqs - 2); ++i) {
107  if (m_items[i]) {
108  delete m_items[i];
109  }
110  }
111  }
112 
113  m_seqiters.clear();
114  m_items.clear();
115 
116  m_nseqs = 0;
117  m_nextNode = 0;
118 
119  if (m_dm) {
120  // Allow square matrix only
121  if (m_dm->GetNumRows() != m_dm->GetNumCols()) {
122  m_dm = NULL;
123  return;
124  } else {
125  m_nseqs = m_dm->GetNumRows();
126  nnodes = 2*m_nseqs - 2;
127  if (nnodes < 3) nnodes = 3; // to join the final nodes, need a dummy if < 3
128  m_seqiters = TTreeIt(nnodes);
129  //cout << "m_seqiters: " << m_seqiters.size() << endl;
130  for (int i=0; i<nnodes; ++i) {
131  sip = new SeqItem(i);
132  sip->name = NStr::IntToString(i);
133  if (sip) {
134  m_items.push_back(sip);
135  } else {
136  m_items.push_back(NULL);
137  }
138  }
139  }
140  }
141 }
142 
144 
145  m_seqiters.clear();
146  for (int i=0; i<(2*m_nseqs - 2); ++i) {
147  delete m_items[i];
148  }
149 }
150 
151 // Disconnect composite nodes from hub if needed, and reconnect the indicated nodes
152 // to a new composite node. Connect the composite node to the hub.
153 void NJ_TreeAlgorithm::Join(int idmin, int jdmin, double ilen, double jlen) {
154 
155  double tmpd = ilen;
156  int maxIndex = 2*m_nseqs - 3; // tot. # nodes less hub
157 
158  if (idmin == jdmin) {
159  cerr << "Error: You cannot join node " << idmin << " to itself.\n";
160  return;
161  }
162 
163  if (idmin < 0 || idmin > maxIndex || jdmin < 0 || jdmin > maxIndex || m_nextNode < 0 || m_nextNode > maxIndex) {
164  // Don't print message unless there's a serious problem.
165  // [ij]dmin = 'USED_ROW' here does not appear to cause issues.
166  if ((idmin != USED_ROW) && (jdmin != USED_ROW)) {
167  cerr << "Warning: Out of range index in Join: " << idmin << " "
168  << jdmin << " " << m_nextNode << " Max allowed index: " << maxIndex << endl;
169  }
170  return;
171  }
172 
173  TChildIt cit;
174  TSeqIt& hub = m_seqiters[0];
175 
176  // New composite node
178 
179  int tmpi = idmin;
180  while (tmpi == idmin || tmpi == jdmin) {
181  cit = m_seqiters[tmpi];
182  if (cit.is_valid()) {
183  if (m_tree->parent(cit) == hub) {
184  cit->distance = tmpd;
185  TChildIt nexit=cit;++nexit;
186  m_tree->reparent(m_seqiters[m_nextNode], cit, nexit); // don't use ++ as it modifies cit
187  } else { // this case should not occur
188  tmpi = -(idmin+jdmin);
189  cerr << "Error: iterator found (id= " << cit->rowID << ") not attached to hub.\n";
190  }
191  } else {
192  m_items[tmpi]->distance = tmpd;
194  }
195  tmpd = jlen;
196  tmpi = (tmpi==idmin) ? jdmin : -(idmin+jdmin);
197  }
198 }
199 
200 
201 // return the number of loops in ComputeTree
203  int it, j, i;
204  long count=0;
205  int* indexMap = new int[m_nseqs];
206  for (i=0; i<m_nseqs; ++i) {
207  indexMap[i] = i+1;
208  }
209  for (it=1; it<=m_nseqs-3; ++it) {
210  for (j=1; j<m_nseqs; ++j) {
211 // if (indexMap[j] != USED_ROW) {
212 // for (i=0; i<j; ++i) {
213 // if (indexMap[i] != USED_ROW) {
214 // count ++;
215 // }
216 // }
217 // }
218  count++; // don't count in inner loop or can exceed int4 max
219  }
220  indexMap[it] = USED_ROW;
221  }
222  delete []indexMap;
223  return(count);
224 }
225 
226 
227 // Main computational engine for the NJ method
228 
230 
231  int i, j, k;
232  int idmin, jdmin, tmp;
233  int imin = 0, jmin = 0;
234  double ilen, jlen;
235  double sum_d, tmp_d;
236 
237  double normalization;
238 
239 /*#if _DEBUG
240  ofstream ofs;
241  ofs.open("nj_info.txt");
242 #endif
243  */
245  DistanceMatrix::TMatType sum_ij = 0.0;
246 
247  string newickStr = "";
248 
249  m_tree = atree;
250  if (m_tree == NULL) {
251  return;
252  } else if (!m_dm) {
253  m_tree->clear();
254  m_tree = NULL;
255  return;
256  }
257 
258  m_seqiters[0] = m_tree->insert(m_tree->begin(), *m_items[0]);
259 
260  // For composite nodes, save the mean dist. between the two component nodes.
261  // When this node becomes part of a composite node in a later iteration, that
262  // distance is subtracted from the orig-composite-node to hub distance.
263 
264  double* internalDistCorrection = new double[m_nseqs];
265 
266  // Map rows in distance matrix to index of corresponding node in m_seqiters;
267  // when there is no corresponding distance matrix row, enter USED_ROW.
268 
269  int* indexMap = new int[m_nseqs];
270 
271  m_nextNode = m_nseqs + 1;
272 
273  if (indexMap == 0 || internalDistCorrection == 0) {
274  m_tree->clear();
275  m_tree = NULL;
276  return;
277  } else {
278  for (i=0; i<m_nseqs; ++i) {
279  indexMap[i] = i+1;
280  internalDistCorrection[i] = 0.0;
281  }
282  }
283 
284  // adjust distance matrix (do this before pre-fetching)
286 
287  // pre-fetch all distances (for speed)
288  double** ppDists;
289  ppDists = new double*[m_nseqs];
290  for (i=0; i<m_nseqs; i++) {
291  ppDists[i] = new double[m_nseqs];
292  }
293  for (i=0; i<m_nseqs; i++) {
294  for (j=0; j<m_nseqs; j++) {
295  ppDists[i][j] = m_dm->FastGet(i, j);
296  }
297  }
298 
299  // pre-calc sums (for speed)
300  double* pSums;
301  pSums = new double[m_nseqs];
302  for (i=0; i<m_nseqs; i++) {
303  sum_d = 0;
304  for (k=0; k<m_nseqs; ++k) {
305  sum_d += ppDists[i][k];
306  }
307  pSums[i] = sum_d;
308  }
309 
310  // Iterations loop
311  long total = GetNumLoopsForTreeCalc();
312  long count = 0;
313  for (int it=1; it<=m_nseqs-3; ++it) {
314 
315  minval = INIT_MINIMA;
316  normalization = 1.0/((double)(m_nseqs - it + 1) - 2.0);
317 
318  // Compute/minimize sum of all branch lengths in tree
319  for (j=1; j<m_nseqs; ++j) {
320  if (indexMap[j] != USED_ROW) {
321  for (i=0; i<j; ++i) {
322  if (indexMap[i] != USED_ROW) {
323 // count++;
324  double Sum = pSums[i] + pSums[j];
325  sum_ij = ppDists[j][i]/normalization - Sum;
326  if (sum_ij < minval) {
327  minval = sum_ij;
328  imin = i;
329  jmin = j;
330  }
331  }
332  }
333  }
334  count++;
335  }
336  pFunc(count, total);
337  idmin = indexMap[imin];
338  jdmin = indexMap[jmin];
339 
340 // if (idmin == USED_ROW || jdmin == USED_ROW) {
341 // cerr << "Indexing Error: " << idmin << " or " << jdmin << " is not a valid row index "
342 // << " for DM indices " << imin << " and " << jmin << ", respectively.\n\n";
343 // break;
344 // }
345 
346  // Get length of new branches...
347 
348  // l(inode->m_nextNode) = .5(d(inode,jnode) + sum_k(d(i,k) - d(j,k))/(m_nseqs-2))
349  sum_d = 0.0;
350  for (k=0; k<m_nseqs; ++k) {
351  sum_d += ppDists[imin][k] - ppDists[jmin][k];
352  }
353  sum_d *= normalization;
354  ilen = 0.5*(ppDists[imin][jmin] + sum_d) - internalDistCorrection[imin];
355  jlen = 0.5*(ppDists[imin][jmin] - sum_d) - internalDistCorrection[jmin];
356 /*#if _DEBUG
357  if (1) {
358  ofs << "\nIteration " << it << ": min sum of lengths = " << minval << endl;
359  ofs << "Rows : " << imin << "(node " << idmin << "; length= " << ilen
360  << ") and " << jmin << "(node " << jdmin << "; length= " << jlen << ")." << endl;
361  ofs << "Distance between these nodes: " << (*m_dm)[imin][jmin] << endl;
362  ofs << "norm: " << sum_d << " dist corrs: " << internalDistCorrection[imin]
363  << " " << internalDistCorrection[jmin] << endl;
364  }
365 #endif
366 */
367 // Detach any edges to the hub, and attach/create nodes as needed.
368 
369  ilen = (ilen >= 0) ? ilen : REPLACE_NEG_DIST;
370  jlen = (jlen >= 0) ? jlen : REPLACE_NEG_DIST;
371 
372  Join(idmin, jdmin, ilen, jlen);
373 
374  internalDistCorrection[imin] = 0.5*ppDists[imin][jmin];
375  indexMap[imin] = m_nextNode++;
376  indexMap[jmin] = USED_ROW;
377 
378  // Fix-up distance matrix: row jmin is zeroed out, imin row becomes averaged of imin/jmin
379  // also fix up the pre-calculated partial sums
380  for (k=0; k<m_nseqs; ++k) {
381  if (indexMap[k] != USED_ROW) {
382  tmp_d = 0.5*(ppDists[imin][k] + ppDists[jmin][k]);
383  pSums[imin] -= ppDists[imin][k];
384  pSums[imin] += tmp_d;
385  ppDists[imin][k] = tmp_d;
386  pSums[k] -= ppDists[k][imin];
387  pSums[k] += tmp_d;
388  ppDists[k][imin] = tmp_d;
389  }
390  }
391  pSums[imin] -= ppDists[imin][imin];
392  ppDists[imin][imin] = 0.0;
393  for (k=0; k<m_nseqs; ++k) {
394  pSums[jmin] -= ppDists[jmin][k];
395  ppDists[jmin][k] = 0.0;
396  pSums[k] -= ppDists[k][jmin];
397  ppDists[k][jmin] = 0.0;
398  }
399 
400 // cout << "Distance matrix at end of iteration " << it << endl;
401 // m_dm->printMat();
402 
403  } // end iteration loop
404  assert(count == total);
405 
406  // Deal with final three nodes (should all be hooked to the hub node)
407 
408  int finalNodes[3] = {USED_ROW, USED_ROW, USED_ROW};
409  double d0, d1, d2;
410  double finalLen[3] = {0.0, 0.0, 0.0};
411 
412  tmp = 0;
413  i = USED_ROW;
414  j = USED_ROW;
415  k = USED_ROW;
416  for (int l=0; l<m_nseqs; ++l) {
417  if (indexMap[l] != USED_ROW) {
418  if (tmp < 3) {
419  finalNodes[tmp] = l;
420  }
421  tmp++;
422  }
423  }
424  if (tmp <= 3) {
425  for (int l=0; l<tmp; ++l) {
426  i = finalNodes[l];
427  j = finalNodes[(l+1)%3]; // here, '3' is the # of expected final nodes
428  k = finalNodes[(l+2)%3]; // here, '3' is the # of expected final nodes
429  d0 = (i < m_nseqs && j < m_nseqs && i != USED_ROW && j != USED_ROW) ? ppDists[i][j] - internalDistCorrection[i] : 0;
430  d1 = (i < m_nseqs && k < m_nseqs && i != USED_ROW && k != USED_ROW) ? ppDists[i][k] - internalDistCorrection[i]: 0;
431  d2 = (j < m_nseqs && k < m_nseqs && j != USED_ROW && k != USED_ROW) ? ppDists[j][k] : 0;
432  finalLen[l] = 0.5*(d0 + d1 - d2);
433 
434  if (i < m_nseqs && i != USED_ROW && indexMap[i] < m_seqiters.size()) {
435  if (m_seqiters[indexMap[i]].is_valid()) { // existing internal node; set dist
436  m_seqiters[indexMap[i]]->distance = finalLen[l];
437  } else {
438  m_items[indexMap[i]]->distance = finalLen[l];
439  m_seqiters[indexMap[i]] = m_tree->append_child(m_seqiters[0], *m_items[indexMap[i]]);
440  }
441  }
442  }
443 /*#if _DEBUG
444  ofs << "Final nodes (" << tmp << ") found -- expected 3.\n" << i << " " << j
445  << " " << k << endl;
446  ofs << "Index Maps: " << indexMap[i] << " " << indexMap[j] << " " << indexMap[k] << endl;
447  ofs << "Lengths: " << finalLen[0] << " " << finalLen[1] << " " << finalLen[2] << endl << endl;
448  } else {
449  ofs << "Error: the wrong number of 'final' nodes (" << tmp << ") found -- expected 3.\n" << i << " " << j << " " << k << endl << endl;
450 #endif
451 */
452  }
453 
454  if (m_nseqs > 2) midpointRootIfNeeded();
455 /*
456 #if _DEBUG
457  ofs.close();
458 #endif
459 */
460  delete [] indexMap;
461  delete [] internalDistCorrection;
462 
463  // clean up temp storage of distance matrix
464  for (i=0; i<m_nseqs; i++) {
465  delete []ppDists[i];
466  }
467  delete []ppDists;
468  delete []pSums;
469 }
470 
472 
473  int n = 0;
474  if (sit.is_valid()) {
475  n = m_tree->number_of_children(sit);
476  }
477  return n;
478 
479 }
480 
483 }
484 
485 // string NJ_TreeAlgorithm::toString(const TSeqIt& sit) {
486 // return "a string from an iterator\n";
487 // }
488 
489 
490 END_SCOPE(cd_utils)
double FastGet(const int RowIndex, const int ColIndex) const
Definition: cuMatrix.hpp:88
int GetNumCols() const
Definition: cuMatrix.hpp:85
int GetNumRows() const
Definition: cuMatrix.hpp:84
static std::string toNestedString(const SeqTree &seqTree)
double TMatType
Definition: cuDistmat.hpp:91
void EnforceSymmetry()
Definition: cuDistmat.cpp:125
static const int USED_ROW
Definition: cuSeqTreeNj.hpp:71
static const double REPLACE_NEG_DIST
Definition: cuSeqTreeNj.hpp:49
virtual string toString()
virtual void ComputeTree(SeqTree *tree, pProgressFunction pFunc)
static const DistanceMatrix::TMatType INIT_MINIMA
Definition: cuSeqTreeNj.hpp:52
virtual void SetDistMat(DistanceMatrix *dm)
Definition: cuSeqTreeNj.cpp:67
virtual long GetNumLoopsForTreeCalc()
void initializeNodes()
Definition: cuSeqTreeNj.cpp:98
static const Rootedness MY_ROOTEDNESS
Definition: cuSeqTreeNj.hpp:50
void Join(int inode1, int inode2, double len1, double len2)
virtual ~NJ_TreeAlgorithm()
vector< SeqItem * > m_items
Definition: cuSeqTreeNj.hpp:86
static const int BAD_INDEX
Definition: cuSeqTreeNj.hpp:72
int numChildren(const TSeqIt &sit)
static const ETreeMethod MY_TREE_METHOD
Definition: cuSeqTreeNj.hpp:51
std::string name
Definition: cuSeqtree.hpp:68
vector< TSeqIt > TTreeIt
SeqTree * m_tree
DistanceMatrix * m_dm
void midpointRootIfNeeded()
bool is_valid() const
Definition: tree_msvc7.hpp:134
iter parent(iter) const
Definition: tree_msvc7.hpp:655
iter append_child(iter position)
Definition: tree_msvc7.hpp:680
void clear()
Definition: tree_msvc7.hpp:520
pre_order_iterator begin() const
Definition: tree_msvc7.hpp:573
iter insert(iter position, const T &x)
Definition: tree_msvc7.hpp:762
unsigned int number_of_children(const iterator_base &) const
iter reparent(iter position, sibling_iterator begin, sibling_iterator end)
void(* pProgressFunction)(int Num, int Total)
Definition: cuDistmat.hpp:47
ETreeMethod
@ eNJ
const string TREE_ALGORITHM_NAMES[]
string GetTreeAlgorithmName(ETreeMethod algorithm)
Definition: cuSeqTreeNj.cpp:47
#define NULL
Definition: ncbistd.hpp:225
#define kMax_Double
Definition: ncbi_limits.h:208
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
int i
yy_size_t n
static char tmp[2048]
Definition: utf8.c:42
#define assert(x)
Definition: srv_diag.hpp:58
Modified on Wed Feb 21 09:57:30 2024 by modify_doxy.py rev. 669887