NCBI C++ ToolKit
cuSeqTreeSlc.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cuSeqTreeSlc.cpp 41296 2009-03-12 19:39:21Z lanczyck $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Chris Lanczycki
27 *
28 * File Description: treealg.cpp
29 *
30 * Implementation of a single-linkage clustering algorithm.
31 * Produces a tree with a FAKE root.
32 * Estimates branch lengths as half node-to-node distance.
33 *
34 */
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbi_limits.h>
38 
40 BEGIN_SCOPE(cd_utils)
41 
42 const int SLC_TreeAlgorithm::USED_ROW = -1;
43 const int SLC_TreeAlgorithm::BAD_INDEX = -1;
44 const double SLC_TreeAlgorithm::REPLACE_NEG_DIST = 0.001;
48 
49 double myMin(double a, double b) {
50  return a < b ? a : b;
51 }
52 
54  : TreeAlgorithm(MY_ROOTEDNESS, MY_TREE_METHOD) {
55 
56  m_dm = NULL;
58 }
59 
61  : TreeAlgorithm(MY_ROOTEDNESS, MY_TREE_METHOD) {
62 
63  m_dm = dm;
64  // m_cdd = cdd;
66 }
67 
69  m_dm = dm;
70  /*
71  if (dm->isSetCDD()) {
72  m_cdd = const_cast<CCd*>(dm->GetCdd()); // ATTN: const_cast
73  }*/
75 }
76 /*
77 void SLC_TreeAlgorithm::SetCDD(CCd* cdd) {
78  m_cdd = cdd;
79  //fillSeqNames(m_tree, m_cdd);
80 }*/
81 
82 
83 // Initialize a list of nodes that will be placed in a tree structure.
84 // (For a rooted tree with N leafs, there are 2N-1 total nodes, of which
85 // N-2 are internal and one is the root.)
86 // The node will be identified by its one-based row indices in the distance matrix
87 // (assumed to correspond to rows an accompanying alignment). Nodes with 'rowID'
88 // > m_nseqs serve to hold composite nodes created during the SLC procedure.
89 //
90 // The root node will have length 0 and often called the 'hub' node.
91 //
92 // NOTE: If the distance matrix row does not map directly to the row indices of an
93 // alignment, the caller is responsible for re-mapping rowID to the correct
94 // alignment row number.
95 //
96 // NOTE: Names are initially assigned to be the row ID as a string.
97 
99 
100  // m_tree is initialized by base-class constructor
101 
102  int nnodes;
103  SeqItem* sip;
104 
105  if (m_dm && m_nseqs>0) {
106  for (int i=0; i<(2*m_nseqs - 1); ++i) {
107  if (m_items[i]) {
108  delete m_items[i];
109  }
110  }
111  }
112 
113  m_seqiters.clear();
114  m_items.clear();
115 
116  m_nseqs = 0;
117  m_nextNode = 0;
118 
119  if (m_dm) {
120  // Allow square matrix only
121  if (m_dm->GetNumRows() != m_dm->GetNumCols()) {
122  m_dm = NULL;
123  return;
124  } else {
125  m_nseqs = m_dm->GetNumRows();
126  nnodes = 2*m_nseqs - 1;
127  m_seqiters = TTreeIt(nnodes);
128  //cout << "m_seqiters: " << m_seqiters.size() << endl;
129  for (int i=0; i<nnodes; ++i) {
130  sip = new SeqItem(i);
131  sip->name = NStr::IntToString(i);
132  if (sip) {
133  m_items.push_back(sip);
134  } else {
135  m_items.push_back(NULL);
136  }
137  }
138  }
139  }
140 }
141 
143 
144  m_seqiters.clear();
145  for (int i=0; i<(2*m_nseqs - 1); ++i) {
146  delete m_items[i];
147  }
148 }
149 
150 //
151 void SLC_TreeAlgorithm::Join(int idmin, int jdmin, double ilen, double jlen) {
152 
153  double tmpd = ilen;
154  int maxIndex = 2*m_nseqs - 2; // tot. # nodes (less the hub)
155 
156  if (idmin == jdmin) {
157  cerr << "Error: You cannot join node " << idmin << " to itself.\n";
158  return;
159  }
160 
161  if (idmin < 0 || idmin > maxIndex || jdmin < 0 || jdmin > maxIndex || m_nextNode < 0 || m_nextNode > maxIndex+1) {
162  // Don't print message unless there's a serious problem.
163  // [ij]dmin = 'USED_ROW' here does not appear to cause issues.
164  if ((idmin != USED_ROW) && (jdmin != USED_ROW)) {
165  cerr << "Warning: Out of range index in Join: " << idmin << " "
166  << jdmin << " " << m_nextNode << " Max allowed index: " << maxIndex << endl;
167  }
168  return;
169  }
170 
171  TChildIt cit;
172  TSeqIt& hub = m_seqiters[0];
173 
174  // New composite node (unless we're at the final two nodes)
175  if (m_nextNode <= maxIndex) {
177  }
178 
179  int tmpi = idmin;
180  while (tmpi == idmin || tmpi == jdmin) {
181  cit = m_seqiters[tmpi];
182  if (m_nextNode <= maxIndex) { // unless the last two nodes...
183  if (cit.is_valid()) {
184  if (m_tree->parent(cit) == hub) {
185  cit->distance = tmpd;
186  TChildIt nexit=cit;++nexit;
187  m_tree->reparent(m_seqiters[m_nextNode], cit, nexit); // don't use ++ as it modifies cit
188  } else { // this case should not occur
189  tmpi = -(idmin+jdmin);
190  cerr << "Error: iterator found (id= " << cit->rowID << ") not attached to hub.\n";
191  }
192  } else {
193  m_items[tmpi]->distance = tmpd;
195  }
196  } else {
197  if (cit.is_valid()) {
198  if (m_tree->parent(cit) == hub) {
199  cit->distance = tmpd;
200  } else { // this case should not occur
201  tmpi = -(idmin+jdmin);
202  cerr << "Error: iterator found (id= " << cit->rowID << ") not attached to hub for last nodes.\n";
203  }
204  } else {
205  m_items[tmpi]->distance = tmpd;
206  m_seqiters[tmpi] = m_tree->append_child(m_seqiters[0], *m_items[tmpi]);
207  }
208  }
209  tmpd = jlen;
210  tmpi = (tmpi==idmin) ? jdmin : -(idmin+jdmin);
211  }
212 }
213 
214 
215 // return the number of loops in ComputeTree
217  int it, j, i;
218  long count=0;
219  int* indexMap = new int[m_nseqs];
220  for (i=0; i<m_nseqs; ++i) {
221  indexMap[i] = i+1;
222  }
223  for (it=1; it<=m_nseqs-1; ++it) {
224  for (j=1; j<m_nseqs; ++j) {
225 // if (indexMap[j] != USED_ROW) {
226 // for (i=0; i<j; ++i) {
227 // if (indexMap[i] != USED_ROW) {
228 // count++;
229 // }
230 // }
231 // }
232  count++; // don't count in inner loop or can exceed int4 max
233  }
234  indexMap[it] = USED_ROW;
235  }
236  delete []indexMap;
237  return(count);
238 }
239 
240 
241 // Main computational engine for the SLC method (simplification to the NJ alg)
242 
244 
245  int i, j, k;
246  int idmin, jdmin;
247  int imin = 0, jmin = 0;
248  double ilen, jlen;
249 
251 
252  string newickStr = "";
253 /*
254 #if _DEBUG
255  ofstream ofs;
256  ofs.open("slc_info.txt");
257 #endif
258  */
259  m_tree = atree;
260  if (m_tree == NULL) {
261  return;
262  } else if (!m_dm) {
263  m_tree->clear();
264  m_tree = NULL;
265  return;
266  }
267 
268  m_seqiters[0] = m_tree->insert(m_tree->begin(), *m_items[0]);
269 
270  // For composite nodes, save the 1/2 dist. between the two component nodes.
271  // When this node becomes part of a composite node in a later iteration, that
272  // distance is subtracted from the orig-composite-node to hub distance.
273 
274  double* internalDistCorrection = new double[m_nseqs];
275 
276  // Map rows in distance matrix to index of corresponding node in m_seqiters;
277  // when there is no corresponding distance matrix row, enter USED_ROW.
278 
279  int* indexMap = new int[m_nseqs];
280 
281  m_nextNode = m_nseqs + 1;
282 
283  if (indexMap == 0 || internalDistCorrection == 0) {
284  m_tree->clear();
285  m_tree = NULL;
286  return;
287  } else {
288  for (i=0; i<m_nseqs; ++i) {
289  indexMap[i] = i+1;
290  internalDistCorrection[i] = 0.0;
291  }
292  }
293 
294  // adjust distance matrix (do this before pre-fetching)
296 
297  // speeding up tree-calc, pre-fetch all distances
298  double** ppDists;
299  ppDists = new double*[m_nseqs];
300  for (i=0; i<m_nseqs; i++) {
301  ppDists[i] = new double[m_nseqs];
302  }
303  for (i=0; i<m_nseqs; i++) {
304  for (j=0; j<m_nseqs; j++) {
305  ppDists[i][j] = m_dm->FastGet(i, j);
306  }
307  }
308 
309  // Iterations loop
310  long count = 0;
311  long total = GetNumLoopsForTreeCalc();
312  for (int it=1; it<=m_nseqs-1; ++it) {
313 
314  minval = INIT_MINIMA;
315 
316  // Find minimum distance in active part of the matrix
317  for (j=1; j<m_nseqs; ++j) {
318  if (indexMap[j] != USED_ROW) {
319  for (i=0; i<j; ++i) {
320  if (indexMap[i] != USED_ROW) {
321  if (ppDists[j][i] < minval) {
322  minval = ppDists[j][i];
323  imin = i;
324  jmin = j;
325  }
326 // count++;
327  }
328  }
329  }
330  count++;
331  }
332  pFunc(count, total);
333  idmin = indexMap[imin];
334  jdmin = indexMap[jmin];
335 
336 // if (idmin == USED_ROW || jdmin == USED_ROW) {
337 // cerr << "Indexing Error: " << idmin << " or " << jdmin << " is not a valid row index "
338 // << " for DM indices " << imin << " and " << jmin << ", respectively.\n\n";
339 // break;
340 // }
341 
342  // Assign length of new branches to be 1/2 m_dm(imin, jmin)
343 
344  ilen = 0.5*ppDists[imin][jmin] - internalDistCorrection[imin];
345  jlen = 0.5*ppDists[imin][jmin] - internalDistCorrection[jmin];
346 /*
347 #if _DEBUG
348  if (1) {
349  ofs << "\nIteration " << it << ": min distance = " << minval << endl;
350  ofs << "Rows : " << imin << "(node " << idmin << "; length= " << ilen
351  << ") and " << jmin << "(node " << jdmin << "; length= " << jlen << ")." << endl;
352  ofs << " dist corrs: " << internalDistCorrection[imin]
353  << " " << internalDistCorrection[jmin] << endl;
354  }
355 #endif
356 */
357 // Connect nodes, creating them as needed.
358 
359  ilen = (ilen > 0) ? ilen : REPLACE_NEG_DIST;
360  jlen = (jlen > 0) ? jlen : REPLACE_NEG_DIST;
361 
362  Join(idmin, jdmin, ilen, jlen);
363 
364  internalDistCorrection[imin] = 0.5*ppDists[imin][jmin];
365  indexMap[imin] = m_nextNode++;
366  indexMap[jmin] = USED_ROW;
367 
368  // Fix-up distance matrix: row jmin is zeroed out, imin row becomes m_dm(imin, jmin)
369  for (k=0; k<m_nseqs; ++k) {
370  if (indexMap[k] != USED_ROW) {
371  ppDists[k][imin] = myMin(ppDists[k][imin], ppDists[k][jmin]);
372  ppDists[k][jmin] = 0.0;
373  ppDists[jmin][k] = 0.0;
374  ppDists[imin][k] = ppDists[k][imin];
375  }
376  }
377  ppDists[imin][imin] = 0.0;
378 
379  } // end iteration loop
380  assert(count == total);
381 
382  // Need to deal with final node? (should all be hooked to the hub node)
383 /*#if _DEBUG
384  ofs.close();
385 #endif
386 */
388  //fillSeqNames(m_tree, m_cdd);
389 
390  delete [] indexMap;
391  delete [] internalDistCorrection;
392 
393 }
394 
396 
397  int n = 0;
398  if (sit.is_valid()) {
399  n = m_tree->number_of_children(sit);
400  }
401  return n;
402 
403 }
404 
407 }
408 
409 // string SLC_TreeAlgorithm::toString(const TSeqIt& sit) {
410 // return "a string from an iterator\n";
411 // }
412 
413 END_SCOPE(cd_utils)
double FastGet(const int RowIndex, const int ColIndex) const
Definition: cuMatrix.hpp:88
int GetNumCols() const
Definition: cuMatrix.hpp:85
int GetNumRows() const
Definition: cuMatrix.hpp:84
static std::string toNestedString(const SeqTree &seqTree)
double TMatType
Definition: cuDistmat.hpp:91
void EnforceSymmetry()
Definition: cuDistmat.cpp:125
virtual void ComputeTree(SeqTree *tree, pProgressFunction pFunc)
virtual long GetNumLoopsForTreeCalc()
int numChildren(const TSeqIt &sit)
static const DistanceMatrix::TMatType INIT_MINIMA
static const ETreeMethod MY_TREE_METHOD
vector< SeqItem * > m_items
static const double REPLACE_NEG_DIST
virtual ~SLC_TreeAlgorithm()
static const int BAD_INDEX
virtual void SetDistMat(DistanceMatrix *dm)
static const Rootedness MY_ROOTEDNESS
static const int USED_ROW
virtual string toString()
void Join(int inode1, int inode2, double len1, double len2)
std::string name
Definition: cuSeqtree.hpp:68
vector< TSeqIt > TTreeIt
SeqTree * m_tree
DistanceMatrix * m_dm
void midpointRootIfNeeded()
bool is_valid() const
Definition: tree_msvc7.hpp:134
iter parent(iter) const
Definition: tree_msvc7.hpp:655
iter append_child(iter position)
Definition: tree_msvc7.hpp:680
void clear()
Definition: tree_msvc7.hpp:520
pre_order_iterator begin() const
Definition: tree_msvc7.hpp:573
iter insert(iter position, const T &x)
Definition: tree_msvc7.hpp:762
unsigned int number_of_children(const iterator_base &) const
iter reparent(iter position, sibling_iterator begin, sibling_iterator end)
void(* pProgressFunction)(int Num, int Total)
Definition: cuDistmat.hpp:47
ETreeMethod
@ eSLC
double myMin(double a, double b)
#define NULL
Definition: ncbistd.hpp:225
#define kMax_Double
Definition: ncbi_limits.h:208
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5083
int i
yy_size_t n
unsigned int a
Definition: ncbi_localip.c:102
#define assert(x)
Definition: srv_diag.hpp:58
Modified on Sat Dec 09 04:44:27 2023 by modify_doxy.py rev. 669887