NCBI C++ ToolKit
clusterer.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's offical duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25 
26 /*****************************************************************************
27 
28 File name: clusterer.cpp
29 
30 Author: Greg Boratyn
31 
32 Contents: Implementation of CClusterer class
33 
34 ******************************************************************************/
35 
36 
37 #include <ncbi_pch.hpp>
39 
41 USING_SCOPE(cobalt);
42 
44 {
45  if (dmat.GetRows() != dmat.GetCols()) {
46  NCBI_THROW(CClustererException, eInvalidOptions,
47  "Distance matrix is not square");
48  }
49 }
50 
52 {
53  x_Init();
54 }
55 
57  : m_DistMatrix(new TDistMatrix(dmat))
58 {
60  x_Init();
61 }
62 
63 CClusterer::CClusterer(shared_ptr<CClusterer::TDistMatrix>& dmat)
64  : m_DistMatrix(dmat)
65 {
67  x_Init();
68 }
69 
70 CClusterer::CClusterer(CRef<CLinks> links) : m_Links(links)
71 {
72  x_Init();
73 }
74 
75 static void s_PurgeTrees(vector<TPhyTreeNode*>& trees)
76 {
77  NON_CONST_ITERATE(vector<TPhyTreeNode*>, it, trees) {
78  delete *it;
79  *it = NULL;
80  }
81  trees.clear();
82 }
83 
85 {
87 }
88 
90 {
91  m_MaxDiameter = 0.8;
93  m_MakeTrees = false;
94  m_ReportSingletons = true;
95 }
96 
98 {
99  s_CheckDistMatrix(dmat);
100 
101  m_DistMatrix.reset(new TDistMatrix());
102  m_DistMatrix->Resize(dmat.GetRows(), dmat.GetCols());
103  copy(dmat.begin(), dmat.end(), m_DistMatrix->begin());
104 }
105 
106 void CClusterer::SetDistMatrix(shared_ptr<TDistMatrix>& dmat)
107 {
108  s_CheckDistMatrix(*dmat);
109 
110  m_DistMatrix = dmat;
111 }
112 
114 {
115  m_Links = links;
116 }
117 
119 {
120  if (!m_DistMatrix.get()) {
121  NCBI_THROW(CClustererException, eNoDistMatrix,
122  "Distance matrix not assigned");
123  }
124 
125  return *m_DistMatrix;
126 }
127 
128 
130 CClusterer::GetSingleCluster(size_t index) const
131 {
132  if (index >= m_Clusters.size()) {
133  NCBI_THROW(CClustererException, eClusterIndexOutOfRange,
134  "Cluster index out of range");
135  }
136 
137  return m_Clusters[index];
138 }
139 
140 
141 // Finds maximum distance between given element and element in given cluster
142 static double s_FindMaxDistFromElem(int elem,
143  const CClusterer::TSingleCluster& cluster,
144  const CClusterer::TDistMatrix& dmat) {
145 
146  _ASSERT(cluster.size() > 0);
147  _ASSERT(elem < (int)dmat.GetRows());
148 
149  double result = 0.0;
150  ITERATE(CClusterer::TSingleCluster, elem_it, cluster) {
151  if (dmat(elem, *elem_it) > result) {
152 
153  _ASSERT(*elem_it < (int)dmat.GetRows());
154 
155  result = dmat(elem, *elem_it);
156  }
157  }
158 
159  return result;
160 }
161 
162 // Finds maximum distance between any pair of elements from given clusters
163 static double s_FindDistAsMax(const CClusterer::TSingleCluster& cluster1,
164  const CClusterer::TSingleCluster& cluster2,
165  const CClusterer::TDistMatrix& dmat) {
166 
167  _ASSERT(cluster1.size() > 0 && cluster2.size() > 0);
168 
169  double result = 0.0;
170  ITERATE(CClusterer::TSingleCluster, elem, cluster1) {
171 
172  _ASSERT(*elem < (int)dmat.GetRows());
173 
174  double dist = s_FindMaxDistFromElem(*elem, cluster2, dmat);
175  if (dist > result) {
176  result = dist;
177  }
178  }
179 
180  return result;
181 }
182 
183 /// Find sum of distances between given element and cluster elements
184 static double s_FindSumDistFromElem(int elem,
185  const CClusterer::TSingleCluster& cluster,
186  const CClusterer::TDistMatrix& dmat) {
187 
188  _ASSERT(cluster.size() > 0);
189  _ASSERT(elem < (int)dmat.GetRows());
190 
191  double result = 0.0;
192  ITERATE(CClusterer::TSingleCluster, elem_it, cluster) {
193  _ASSERT(*elem_it < (int)dmat.GetRows());
194 
195  result += dmat(elem, *elem_it);
196  }
197 
198  return result;
199 }
200 
201 /// Find mean distance between cluster elements
202 static double s_FindDistAsMean(const CClusterer::TSingleCluster& cluster1,
203  const CClusterer::TSingleCluster& cluster2,
204  const CClusterer::TDistMatrix& dmat)
205 {
206  _ASSERT(cluster1.size() > 0 && cluster2.size() > 0);
207 
208  double result = 0.0;
209  ITERATE(CClusterer::TSingleCluster, elem, cluster1) {
210 
211  _ASSERT(*elem < (int)dmat.GetRows());
212 
213  result += s_FindSumDistFromElem(*elem, cluster2, dmat);
214  }
215  result /= (double)(cluster1.size() * cluster2.size());
216 
217  return result;
218 }
219 
220 
221 /// Find mean
222 static double s_FindMean(const vector<double>& vals)
223 {
224  double result = 0.0;
225  if (!vals.empty()) {
226  ITERATE(vector<double>, it, vals) {
227  result += *it;
228  }
229  result /= (double)vals.size();
230  }
231  return result;
232 }
233 
234 /// Find distance between two clusters
235 /// (cluster and included_cluster + extended_cluster) using given dist method.
236 /// current_dist is the current distance between cluster and extended_cluster.
237 static double s_FindDist(const CClusterer::CSingleCluster& cluster,
238  const CClusterer::CSingleCluster& included_cluster,
239  const CClusterer::CSingleCluster& extended_cluster,
240  const CClusterer::TDistMatrix& dmat,
241  double current_dist,
242  CClusterer::EDistMethod dist_method)
243 {
244  double result = -1.0;
245  if (dist_method == CClusterer::eCompleteLinkage) {
246  double dist = s_FindDistAsMax(cluster, included_cluster, dmat);
247 
248  result = dist > current_dist ? dist : current_dist;
249 
250  }
251  else {
252 
253  result = s_FindDistAsMean(cluster, extended_cluster, dmat);
254  }
255  return result;
256 }
257 
258 // Create tree leaf (node representing element) with given id
260 {
261  TPhyTreeNode* node = new TPhyTreeNode();
262  node->GetValue().SetId(id);
263 
264  // This is needed so that serialized tree can be interpreted
265  // by external applications
266  node->GetValue().SetLabel(NStr::IntToString(id));
267 
268  return node;
269 }
270 
271 // Complete Linkage clustering with dendrograms
272 void CClusterer::ComputeClusters(double max_diam,
273  CClusterer::EDistMethod dist_method,
274  bool do_trees,
275  double infinity)
276 {
277  if (dist_method != eCompleteLinkage && dist_method != eAverageLinkage) {
278  NCBI_THROW(CClustererException, eInvalidOptions, "Unrecognised cluster"
279  " distance method");
280  }
281 
282  if (!m_DistMatrix.get()) {
283  NCBI_THROW(CClustererException, eInvalidInput, "Distance matrix not"
284  " set");
285  }
286 
287  m_Clusters.clear();
289 
290  size_t num_elements = m_DistMatrix->GetRows();
291 
292  // If there is exactly one element to cluster
293  if (num_elements == 1) {
294  m_Clusters.resize(1);
295  m_Clusters[0].AddElement(0);
296 
297  if (do_trees) {
298  m_Trees.push_back(s_CreateTreeLeaf(0));
299  }
300 
301  return;
302  }
303 
304  // If there are exactly two elements
305  if (num_elements == 2) {
306 
307  if ((*m_DistMatrix)(0, 1) < max_diam) {
308  m_Clusters.resize(1);
309  m_Clusters[0].AddElement(0);
310  m_Clusters[0].AddElement(1);
311  }
312  else {
313  m_Clusters.resize(2);
314  m_Clusters[0].AddElement(0);
315  m_Clusters[1].AddElement(1);
316  }
317 
318  if (do_trees) {
319  TPhyTreeNode* node0 = s_CreateTreeLeaf(0);
320  TPhyTreeNode* node1 = s_CreateTreeLeaf(1);
321 
322  if ((*m_DistMatrix)(0, 1) < max_diam) {
323  // one cluster case
324  double dist = (*m_DistMatrix)(0, 1) / 2.0;
325  node0->GetValue().SetDist(dist);
326  node1->GetValue().SetDist(dist);
327 
328  TPhyTreeNode* root = new TPhyTreeNode();
329  root->AddNode(node0);
330  root->AddNode(node1);
331 
332  m_Trees.push_back(root);
333 
334  }
335  else {
336  // two clusters case
337  m_Trees.push_back(node0);
338  m_Trees.push_back(node1);
339  }
340  }
341 
342  return;
343  }
344 
345  // If there are at least 3 elements to cluster
346 
347  // Checking whether the data is in one cluster
348  // skip if tree is requested
349  if (!do_trees) {
350  double max_dist = 0.0;
351  for (size_t i=0;i < num_elements - 1;i++) {
352  for (size_t j=i+1;j < num_elements;j++) {
353  if ((*m_DistMatrix)(i, j) > max_dist) {
354  max_dist = (*m_DistMatrix)(i, j);
355  }
356  }
357  }
358  // If yes, create the cluster and return
359  if (max_dist <= max_diam) {
360  m_Clusters.resize(1);
361  for (int i=0;i < (int)num_elements;i++) {
362  m_Clusters[0].AddElement(i);
363  }
364  return;
365  }
366  }
367 
368 
369  // Getting to this point means that there are at least two clusters
370  // and max distance is larger than max_diam
371  // or tree was requested
372 
373  // Creating working copy of dist matrix
374  TDistMatrix dmat(*m_DistMatrix);
375  // Denotes whether an entry in dist matrix and clusters is used
376  vector<bool> used_entry(num_elements, true);
377 
378  // Creating initial one-element clusters
379  TClusters clusters(num_elements);
380  for (size_t i=0;i < num_elements;i++) {
381  clusters[i].AddElement((int)i);
382  }
383 
384  // Create leaf nodes for dendrogram
385  vector<TPhyTreeNode*> nodes(num_elements);
386  if (do_trees) {
387  for (size_t i=0;i < nodes.size();i++) {
388  nodes[i] = s_CreateTreeLeaf((int)i);
389  }
390  }
391  vector< vector<double> > dists_to_root(num_elements);
392 
393  // Computing clusters
394  // Exits the loop once minimum distance is larger than max_diam
395  // It was checked above that such distance exists
396  size_t num_clusters = num_elements;
397  while (num_clusters > 1) {
398 
399  // Find first used dist matrix entries
400  size_t min_i = 0;
401  size_t min_j;
402  do {
403  while (!used_entry[min_i] && min_i < num_elements) {
404  min_i++;
405  }
406 
407  min_j = min_i + 1;
408  while (!used_entry[min_j] && min_j < num_elements) {
409  min_j++;
410  }
411 
412  if (min_j >= num_elements) {
413  min_i++;
414  }
415  } while (min_j >= num_elements && min_i < num_elements);
416 
417  // A distance larger than max_diam exists in the dist matrix,
418  // then there always should be at least two used entires in dist matrix
419  _ASSERT(do_trees || (min_i < num_elements && min_j < num_elements));
420 
421  // Find smallest distance entry
422  for (size_t i=0;i < num_elements - 1;i++) {
423  for (size_t j=i+1;j < num_elements;j++) {
424 
425  if (!used_entry[i] || !used_entry[j]) {
426  continue;
427  }
428 
429  if (dmat(i, j) < dmat(min_i, min_j)) {
430  min_i = i;
431  min_j = j;
432  }
433  }
434  }
435 
436  _ASSERT(used_entry[min_i] && used_entry[min_j]);
437 
438  // Check whether the smallest distance is larger than max_diam
439  if (dmat(min_i, min_j) > max_diam) {
440  break;
441  }
442 
443  // If distance between two one-element clusters exceeds
444  // infinity, they should not start a cluster.
445  // Try attaching one of the elements to a cluster with at least two
446  // elements.
447  if (infinity >= 0.0 && dmat(min_i, min_j) >= infinity
448  && clusters[min_i].size() == 1 && clusters[min_j].size() == 1) {
449 
450  // find first used entry
451  size_t new_min_j = 0;
452  while ((!used_entry[new_min_j] || new_min_j == min_j
453  || new_min_j == min_i
454  || clusters[new_min_j].size() == 1)
455  && new_min_j < num_elements) {
456 
457  new_min_j++;
458  }
459  if (new_min_j < num_elements) {
460 
461  // find new smallest distance
462  for (size_t j=new_min_j+1;j < num_elements;j++) {
463  if (!used_entry[j] || clusters[j].size() == 1
464  || j == min_j || j == min_i) {
465  continue;
466  }
467 
468  if (dmat(min_i, j) < dmat(min_i, new_min_j)) {
469  new_min_j = j;
470  }
471 
472  }
473  // if the new pair cannot be found keep the old min_j
474  if (new_min_j < num_elements) {
475  min_j = new_min_j;
476  if (min_i > min_j) {
477  swap(min_i, min_j);
478  }
479  }
480  }
481  // TO DO: Generate a warning if new min_j is not found
482  }
483 
484 
485  _ASSERT(clusters[min_i].size() > 0 && clusters[min_j].size() > 0);
486  _ASSERT(min_i < num_elements && min_j < num_elements);
487 
488  // Joining tree nodes
489  // must be done before joining clusters
490  if (do_trees) {
491  TPhyTreeNode* new_root = new TPhyTreeNode();
492 
493  _ASSERT(nodes[min_i]);
494  _ASSERT(nodes[min_j]);
495 
496  // left and right are meaningless, only to make code readble
497  const size_t left = min_i;
498  const size_t right = min_j;
499 
500  new_root->AddNode(nodes[left]);
501  new_root->AddNode(nodes[right]);
502 
503  // set sub node distances
504  double dist = s_FindDistAsMean(clusters[left], clusters[right],
505  *m_DistMatrix);
506 
507  // find average distances too root in left and right subtrees
508  double mean_dist_to_root_left = s_FindMean(dists_to_root[left]);
509  double mean_dist_to_root_right = s_FindMean(dists_to_root[right]);
510 
511  // set edge length between new root and subtrees
512  double left_edge_length = dist - mean_dist_to_root_left;
513  double right_edge_length = dist - mean_dist_to_root_right;
514  left_edge_length = left_edge_length > 0.0 ? left_edge_length : 0.0;
515  right_edge_length
516  = right_edge_length > 0.0 ? right_edge_length : 0.0;
517 
518  nodes[left]->GetValue().SetDist(left_edge_length);
519  nodes[right]->GetValue().SetDist(right_edge_length);
520 
521  // compute distances from leaves to new root
522  if (dists_to_root[left].empty()) {
523  dists_to_root[left].push_back(dist);
524  }
525  else {
526  NON_CONST_ITERATE(vector<double>, it, dists_to_root[left]) {
527  *it += left_edge_length;
528  }
529  }
530 
531  if (dists_to_root[right].empty()) {
532  dists_to_root[right].push_back(dist);
533  }
534  else {
535  NON_CONST_ITERATE(vector<double>, it, dists_to_root[right]) {
536  *it += right_edge_length;
537  }
538  }
539 
540  // merge cluster related information
541  // Note: the extended and included cluster must correspond to
542  // Joining clusters procedure below
543  nodes[min_i] = new_root;
544  nodes[min_j] = NULL;
545  ITERATE(vector<double>, it, dists_to_root[min_j]) {
546  dists_to_root[min_i].push_back(*it);
547  }
548  dists_to_root[min_j].clear();
549  }
550 
551  // Joining clusters
552  TSingleCluster& extended_cluster = clusters[min_i];
553  TSingleCluster& included_cluster = clusters[min_j];
554  used_entry[min_j] = false;
555  num_clusters--;
556  _ASSERT(!do_trees || !nodes[min_j]);
557 
558  ITERATE(TSingleCluster, elem, included_cluster) {
559  extended_cluster.AddElement(*elem);
560  }
561 
562  // Updating distance matrix
563  for (size_t i=0;i < min_i && min_i > 0;i++) {
564  if (!used_entry[i]) {
565  continue;
566  }
567 
568  dmat(i, min_i) = s_FindDist(clusters[i], included_cluster,
569  extended_cluster, *m_DistMatrix,
570  dmat(i, min_i), dist_method);
571 
572  dmat(min_i, i) = dmat(i, min_i);
573 
574  }
575  for (size_t j=min_i+1;j < num_elements;j++) {
576  if (!used_entry[j]) {
577  continue;
578  }
579 
580  dmat(min_i, j) = s_FindDist(clusters[j], included_cluster,
581  extended_cluster, *m_DistMatrix,
582  dmat(min_i, j), dist_method);
583 
584  dmat(j, min_i) = dmat(min_i, j);
585 
586  }
587  clusters[min_j].clear();
588  }
589 
590  // Putting result in class attribute
591  for (size_t i=0;i < num_elements;i++) {
592  if (used_entry[i]) {
593 
594  _ASSERT(clusters[i].size() > 0);
595 
596  m_Clusters.push_back(TSingleCluster());
597  TClusters::iterator it = m_Clusters.end();
598  --it;
599  ITERATE(TSingleCluster, elem, clusters[i]) {
600  it->AddElement(*elem);
601  }
602 
603  if (do_trees) {
604  _ASSERT(nodes[i]);
605 
606  m_Trees.push_back((TPhyTreeNode*)nodes[i]);
607  }
608  }
609  else {
610 
611  _ASSERT(clusters[i].size() == 0);
612 
613  if (do_trees) {
614  _ASSERT(!nodes[i]);
615  _ASSERT(dists_to_root[i].empty());
616  }
617  }
618  }
619 }
620 
622 {
623  if (m_Links.Empty()) {
624  NCBI_THROW(CClustererException, eInvalidOptions, "Distance links not"
625  " set");
626  }
627 
628  // initialize table of cluster ids for elements to unassigned
629  m_ClusterId.resize(m_Links->GetNumElements(), -1);
630 
631  // if there are any pre-computed clusters, set their ids to elements
632  if (!m_Clusters.empty()) {
633  for (size_t i=0;i < m_Clusters.size();i++) {
634  ITERATE (TSingleCluster, elem, m_Clusters[i]) {
635  if (*elem >= (int)m_Links->GetNumElements()) {
636  NCBI_THROW(CClustererException, eInvalidInput,
637  "Element index in pre-set cluster larger than "
638  "number of elements provided with links");
639  }
640  m_ClusterId[*elem] = (int)i;
641  }
642  }
643  }
644 
645  // sort links
646  if (!m_Links->IsSorted()) {
647  m_Links->Sort();
648  }
649 
650  // for each link
652  for (;it != m_Links->end();++it) {
653 
654  _ASSERT(it->first != it->second);
655 
656  // if none of elements belongs to a cluster
657  if (m_ClusterId[it->first] < 0 && m_ClusterId[it->second] < 0) {
658 
659  // join these two elements
660  x_JoinElements(*it);
661  continue;
662  }
663 
664  // if cluster ids differ
665  if (m_ClusterId[it->first] != m_ClusterId[it->second]) {
666 
667  // if one of elements does not belong to any cluster
668  if (m_ClusterId[it->first] < 0 || m_ClusterId[it->second] < 0) {
669  int elem;
670  int cluster_id;
671 
672  if (m_ClusterId[it->first] < 0) {
673  elem = it->first;
674  cluster_id = m_ClusterId[it->second];
675  }
676  else {
677  elem = it->second;
678  cluster_id = m_ClusterId[it->first];
679  }
680 
681  // check if element can be added to the cluster the other
682  // element belongs
683  double distance;
684  if (x_CanAddElem(cluster_id, elem, distance)) {
685  // add element
686  x_JoinClustElem(cluster_id, elem, distance);
687  }
688 
689  }
690  // if both elements belong to clusters
691  else {
692  // if these clusters can be joined
693  double distance;
694  if (x_CanJoinClusters(m_ClusterId[it->first],
695  m_ClusterId[it->second],
696  distance)) {
697 
698  // join clusters
699  x_JoinClusters(m_ClusterId[it->first],
700  m_ClusterId[it->second],
701  distance);
702  }
703  }
704  }
705  // if both elements already belong to the same cluster
706  else {
707  // update value of cluster diameter
708  int cluster_id = m_ClusterId[it->first];
709  if (m_Clusters[cluster_id].GetMaxDistance() < it->weight) {
710  m_Clusters[cluster_id].SetMaxDistance(it->weight);
711  }
712  }
713  }
714 
715  // try merging clusters
716  for (int i=0;i < (int)m_Clusters.size() - 1;i++) {
717  if (m_Clusters[i].size() == 0) {
718  continue;
719  }
720  for (size_t k=i+1;k < m_Clusters.size();k++) {
721  if (m_Clusters[k].size() == 0) {
722  continue;
723  }
724 
725  double distance;
726  if (x_CanJoinClusters(m_ClusterId[*m_Clusters[i].begin()],
727  m_ClusterId[*m_Clusters[k].begin()],
728  distance)) {
729 
730  // the first cluster is the joined one
732  m_ClusterId[*m_Clusters[k].begin()],
733  distance);
734  }
735  }
736  }
737 
738  // find elements that were not assigned to any clusters and create
739  // one-element clusters for them
740  if (m_ReportSingletons) {
741  for (int i=0;i < (int)m_ClusterId.size();i++) {
742  if (m_ClusterId[i] < 0) {
744  }
745  }
746  }
747 
748  // TO DO: m_Clusters needs to be chenged to vector of pointers or list
749  // so that the operations below are more efficient
750 
751  // remove empty clusters from the back of the list of clusters
752  // the empty clusters appear in the list when clusters are joined
753  while (m_Clusters.back().size() == 0) {
754  m_Clusters.pop_back();
755  }
756 
757  // remove empty clusters from the list of clusters by moving the cluster
758  // from the back of the list to the position of the empty one
759  for (int i=0;i < (int)m_Clusters.size();i++) {
760  if (m_Clusters[i].size() == 0) {
761  const TSingleCluster& cluster = m_Clusters.back();
762  _ASSERT(cluster.size() > 0);
763  ITERATE (vector<int>, it, cluster) {
764  m_Clusters[i].AddElement(*it);
765  m_ClusterId[*it] = i;
766  m_Clusters[i].m_Tree = cluster.m_Tree;
767  _ASSERT(!m_MakeTrees || m_Clusters[i].m_Tree);
768  }
769  m_Clusters.pop_back();
770 
771  // remove empty clusters from the back of the list of clusters
772  // the empty clusters appear in the list when clusters are joined
773  while (m_Clusters.back().size() == 0) {
774  m_Clusters.pop_back();
775  }
776  }
777  }
778 
779  if (m_MakeTrees) {
780  m_Trees.reserve(m_Clusters.size());
781  ITERATE (vector<CSingleCluster>, it, m_Clusters) {
782  _ASSERT(it->m_Tree);
783  m_Trees.push_back(it->m_Tree);
784  }
785  }
786 }
787 
788 
790 {
791  int cluster_id;
792 
793  // if the list of unused entries in the cluster list is empty
794  if (m_UnusedEntries.empty()) {
795 
796  // add a new cluster to the list
797  m_Clusters.push_back(CSingleCluster());
798  CSingleCluster& cluster = m_Clusters.back();
799  cluster.AddElement(link.first);
800  cluster.AddElement(link.second);
801  cluster.SetMaxDistance(link.weight);
802  cluster_id = (int)m_Clusters.size() - 1;
803  }
804  else {
805 
806  // if there are any empty clusters in the list (created by joining two
807  // clusters)
808 
809  // use the empty cluster enrty in the list for joining the elements
810  cluster_id = m_UnusedEntries.front();
811  m_UnusedEntries.pop_front();
812  _ASSERT(m_Clusters[cluster_id].size() == 0);
813  CSingleCluster& cluster = m_Clusters[cluster_id];
814  cluster.AddElement(link.first);
815  cluster.AddElement(link.second);
816  cluster.SetMaxDistance(link.weight);
817  }
818 
819  // set cluster id for elements
820  m_ClusterId[link.first] = cluster_id;
821  m_ClusterId[link.second] = cluster_id;
822 
823  if (!m_MakeTrees) {
824  return;
825  }
826 
827  // join tree nodes
828  TPhyTreeNode* root = new TPhyTreeNode();
829  TPhyTreeNode* left = s_CreateTreeLeaf(link.first);
830  TPhyTreeNode* right = s_CreateTreeLeaf(link.second);
831  root->AddNode(left);
832  root->AddNode(right);
833  double dist = link.weight / 2.0;
834  left->GetValue().SetDist(dist);
835  right->GetValue().SetDist(dist);
836 
837  // set cluster tree and average leaf distance to root
838  _ASSERT(!m_Clusters[cluster_id].m_Tree);
839  m_Clusters[cluster_id].m_Tree = root;
840  m_Clusters[cluster_id].m_DistToRoot.push_back(dist);
841  m_Clusters[cluster_id].m_DistToRoot.push_back(dist);
842 }
843 
845 {
846  size_t cluster_id;
847 
848  // if the list of unused entries in the cluster list is empty
849  if (m_UnusedEntries.empty()) {
850 
851  // add a new cluster to the list
852  m_Clusters.push_back(CSingleCluster());
853  CSingleCluster& cluster = m_Clusters.back();
854  cluster.AddElement(elem);
855  cluster_id = m_Clusters.size() - 1;
856  }
857  else {
858 
859  // if there are any empty clusters in the list (created by joining two
860  // clusters)
861 
862  // use the empty cluster enrty in the list for joining the elements
863  cluster_id = m_UnusedEntries.front();
864  m_UnusedEntries.pop_front();
865  CSingleCluster& cluster = m_Clusters[cluster_id];
866  cluster.AddElement(elem);
867  }
868 
869  // set cluster id for the element
870  m_ClusterId[elem] = (int)cluster_id;
871 
872  if (m_MakeTrees) {
873  _ASSERT(!m_Clusters[cluster_id].m_Tree);
874  m_Clusters[cluster_id].m_Tree = s_CreateTreeLeaf(elem);
875  }
876 }
877 
878 void CClusterer::x_JoinClustElem(int cluster_id, int elem, double distance)
879 {
880  m_Clusters[cluster_id].AddElement(elem);
881  m_ClusterId[elem] = cluster_id;
882 
883  if (!m_MakeTrees) {
884  return;
885  }
886 
887  _ASSERT(m_Clusters[cluster_id].m_Tree);
888 
889  // attach the new node to cluster tree
890  TPhyTreeNode* root = new TPhyTreeNode();
891  TPhyTreeNode* old_root = m_Clusters[cluster_id].m_Tree;
892  TPhyTreeNode* node = s_CreateTreeLeaf(elem);
893 
894  root->AddNode(old_root);
895  root->AddNode(node);
896 
897  // find average leaf distance to root
898  double sum_dist = 0.0;
899  ITERATE (vector<double>, it, m_Clusters[cluster_id].m_DistToRoot) {
900  sum_dist += *it;
901  }
902  double ave_dist_to_root
903  = sum_dist / (double)m_Clusters[cluster_id].m_DistToRoot.size();
904 
905  // set edge lengths for subtrees
906  double d = (distance - ave_dist_to_root) / 2.0;
907 
908  old_root->GetValue().SetDist(d > 0.0 ? d : 0.0);
909  node->GetValue().SetDist(d > 0.0 ? d : 0.0);
910 
911  // set new root for cluster
912  m_Clusters[cluster_id].m_Tree = root;
913 
914  // update leaf distances to root
915  NON_CONST_ITERATE (vector<double>, it, m_Clusters[cluster_id].m_DistToRoot) {
916  *it += d;
917  }
918  m_Clusters[cluster_id].m_DistToRoot.push_back(d);
919 }
920 
921 
922 bool CClusterer::x_CanAddElem(int cluster_id, int elem, double& dist) const
923 {
924  // for the link method, clusters and elements can be joined as long
925  // as there exists at least one link between elements
926  if (m_LinkMethod == eDist) {
927  return true;
928  }
929 
930  // for the clique method there must be a link between all pairs of elements
931  if (m_MakeTrees) {
932  vector<int> el(1, elem);
933  return m_Links->IsLink(m_Clusters[cluster_id].GetElements(), el, dist);
934  }
935  else {
936 
937  ITERATE (vector<int>, it, m_Clusters[cluster_id]) {
938  if (!m_Links->IsLink(*it, elem)) {
939  return false;
940  }
941  }
942 
943  return true;
944  }
945 }
946 
947 
948 void CClusterer::x_JoinClusters(int cluster1_id, int cluster2_id, double dist)
949 {
950  CSingleCluster& cluster1 = m_Clusters[cluster1_id];
951  CSingleCluster& cluster2 = m_Clusters[cluster2_id];
952 
953  if (m_MakeTrees) {
954 
955  _ASSERT(cluster1.m_Tree);
956  _ASSERT(cluster2.m_Tree);
957 
958  // join cluster trees
959  TPhyTreeNode* root = new TPhyTreeNode();
960  TPhyTreeNode* left = cluster1.m_Tree;
961  TPhyTreeNode* right = cluster2.m_Tree;
962 
963  root->AddNode(left);
964  root->AddNode(right);
965 
966  // find edge lengths below new root
967  double left_dist_to_root, right_dist_to_root;
968  double sum = 0.0;
969  ITERATE (vector<double>, it, cluster1.m_DistToRoot) {
970  sum += *it;
971  }
972  left_dist_to_root = sum / (double)cluster1.size();
973 
974  sum = 0.0;
975  ITERATE (vector<double>, it, cluster2.m_DistToRoot) {
976  sum += *it;
977  }
978  right_dist_to_root = sum / (double)cluster2.size();
979 
980  double left_dist = dist - left_dist_to_root;
981  double right_dist = dist - right_dist_to_root;
982 
983  // set lengths lengths for new edges
984  left->GetValue().SetDist(left_dist > 0.0 ? left_dist : 0.0);
985  right->GetValue().SetDist(right_dist > 0.0 ? right_dist : 0.0);
986 
987  // update tree root, the clusters are joint into cluster1 see below
988  cluster1.m_Tree = root;
989 
990  // update leaf distances to root
991  NON_CONST_ITERATE (vector<double>, it, cluster1.m_DistToRoot) {
992  *it += left_dist;
993  }
994 
995  size_t num_elements
996  = cluster1.m_DistToRoot.size() + cluster2.m_DistToRoot.size();
997 
998  if (cluster1.m_DistToRoot.capacity() < num_elements) {
999 
1000  cluster1.m_DistToRoot.reserve(num_elements
1001  + (int)(0.3 * num_elements));
1002  }
1003  ITERATE (vector<double>, it, cluster2.m_DistToRoot) {
1004  cluster1.m_DistToRoot.push_back(*it + right_dist);
1005  }
1006  }
1007 
1008  // add elements of cluster2 to cluster1
1009  ITERATE (vector<int>, it, cluster2) {
1010  cluster1.AddElement(*it);
1011  m_ClusterId[*it] = cluster1_id;
1012  }
1013 
1014  // remove all elements and detouch tree of cluster 2
1015  cluster2.clear();
1016  cluster2.m_Tree = NULL;
1017  _ASSERT(!cluster2.m_Tree);
1018 
1019  // mark cluster2 as unused
1020  m_UnusedEntries.push_back(cluster2_id);
1021 }
1022 
1023 
1024 bool CClusterer::x_CanJoinClusters(int cluster1_id, int cluster2_id,
1025  double& dist) const
1026 {
1027  // for the link method, clusters can be joined as long
1028  // as there exists at least one link between elements
1029  if (m_LinkMethod == eDist) {
1030  return true;
1031  }
1032 
1033  // for the clique method there must be a link between all pairs of elements
1034  if (m_MakeTrees) {
1035  return m_Links->IsLink(m_Clusters[cluster1_id].GetElements(),
1036  m_Clusters[cluster2_id].GetElements(),
1037  dist);
1038  }
1039  else {
1040 
1041  ITERATE (vector<int>, it1, m_Clusters[cluster1_id]) {
1042  ITERATE (vector<int>, it2, m_Clusters[cluster2_id]) {
1043  if (!m_Links->IsLink(*it1, *it2)) {
1044  return false;
1045  }
1046  }
1047  }
1048 
1049  return true;
1050  }
1051 }
1052 
1053 int CClusterer::GetClusterId(int elem) const
1054 {
1055  if (elem < 0 || elem >= m_ClusterId.size()) {
1056  NCBI_THROW(CClustererException, eInvalidInput, "Element index out of "
1057  "range");
1058  }
1059 
1060  return m_ClusterId[elem];
1061 }
1062 
1063 
1064 void CClusterer::GetTrees(vector<TPhyTreeNode*>& trees) const
1065 {
1066  trees.clear();
1067  ITERATE(vector<TPhyTreeNode*>, it, m_Trees) {
1068  trees.push_back(*it);
1069  }
1070 }
1071 
1072 
1073 void CClusterer::ReleaseTrees(vector<TPhyTreeNode*>& trees)
1074 {
1075  trees.clear();
1076  ITERATE(vector<TPhyTreeNode*>, it, m_Trees) {
1077  trees.push_back(*it);
1078  }
1079  m_Trees.clear();
1080 }
1081 
1082 #ifdef NCBI_COMPILER_WORKSHOP
1083 // In some configurations, cobalt.o winds up with an otherwise
1084 // undefined reference to a slightly mismangled name! The compiler's
1085 // own README recommends this workaround for such incidents.
1086 #pragma weak "__1cEncbiGcobaltKCClustererMReleaseTrees6MrnDstdGvector4Cpn0AJCTreeNode4n0AMCPhyNodeData_n0AVCDefaultNodeKeyGetter4n0E_____n0DJallocator4Cp4_____v_" = "__1cEncbiGcobaltKCClustererMReleaseTrees6MrnDstdGvector4Cpn0AJCTreeNode4n0AMCPhyNodeData_n0AVCDefaultNodeKeyGetter4n0E_____n0DJallocator4C5_____v_"
1087 #endif
1088 
1089 
1090 const TPhyTreeNode* CClusterer::GetTree(int index) const
1091 {
1092  if (index < 0 || index >= (int)m_Trees.size()) {
1093  NCBI_THROW(CClustererException, eClusterIndexOutOfRange,
1094  "Tree index out of range");
1095  }
1096 
1097  return m_Trees[index];
1098 }
1099 
1100 
1102 {
1103  if (index < 0 || index >= (int)m_Trees.size()) {
1104  NCBI_THROW(CClustererException, eClusterIndexOutOfRange,
1105  "Tree index out of range");
1106  }
1107 
1108  TPhyTreeNode* result = m_Trees[index];
1109  m_Trees[index] = NULL;
1110 
1111  return result;
1112 }
1113 
1114 
1116 
1118  cluster->SetPrototype(cluster->FindCenterElement(*m_DistMatrix));
1119  }
1120 }
1121 
1122 
1124 {
1125  if (index >= (int)m_Clusters.size()) {
1126  NCBI_THROW(CClustererException, eClusterIndexOutOfRange,
1127  "Cluster index out of range");
1128  }
1129 
1130  const CSingleCluster& cluster = m_Clusters[index];
1131 
1132  mat.Resize(cluster.size(), cluster.size(), 0.0);
1133  for (size_t i=0;i < cluster.size() - 1;i++) {
1134  for (size_t j=i+1;j < cluster.size();j++) {
1135 
1136  if (cluster[i] >= (int)m_DistMatrix->GetRows()
1137  || cluster[j] >= (int)m_DistMatrix->GetRows()) {
1138  NCBI_THROW(CClustererException, eElementOutOfRange,
1139  "Distance matrix size is smaller than number of"
1140  " elements");
1141  }
1142 
1143  mat(i, j) = mat(j, i) = (*m_DistMatrix)(cluster[i], cluster[j]);
1144  }
1145  }
1146 }
1147 
1149 {
1151  m_Clusters.clear();
1152  PurgeDistMatrix();
1153  m_Links.Reset();
1154 }
1155 
1156 
1158 {
1159  if (!m_Links.Empty()) {
1161  }
1162  else if (m_DistMatrix.get()) {
1164  }
1165  else {
1166  NCBI_THROW(CClustererException, eInvalidInput, "Either distance matrix"
1167  " or distance links must be set");
1168  }
1169 }
1170 
1171 //------------------------------------------------------------
1172 
1174 {
1175  if (index >= m_Elements.size()) {
1176  NCBI_THROW(CClustererException, eElemIndexOutOfRange,
1177  "Cluster element index out of range");
1178  }
1179 
1180  return m_Elements[index];
1181 }
1182 
1183 
1185  dmatrix) const
1186 {
1187  if (m_Elements.empty()) {
1188  NCBI_THROW(CClustererException, eInvalidOptions, "Cluster is empty");
1189  }
1190 
1191 
1192  if (m_Elements.size() == 1) {
1193  return m_Elements[0];
1194  }
1195 
1196  vector<double> sum_distance;
1197  sum_distance.resize(m_Elements.size());
1198  for (size_t i=0;i < m_Elements.size(); i++) {
1199  double dist = 0.0;
1200  for (size_t j=0;j < m_Elements.size();j++) {
1201  if (i == j) {
1202  continue;
1203  }
1204 
1205  dist += dmatrix(m_Elements[i], m_Elements[j]);
1206  }
1207  sum_distance[i] = dist;
1208  }
1209 
1210  size_t min_index = 0;
1211  for (size_t i=1;i < sum_distance.size();i++) {
1212  if (sum_distance[i] < sum_distance[min_index]) {
1213  min_index = i;
1214  }
1215  }
1216 
1217  return m_Elements[min_index];
1218 }
1219 
1220 
Exceptions for CClusterer class.
Definition: clusterer.hpp:395
int FindCenterElement(const TDistMatrix &dmatrix) const
Find element that is closest to the center of the cluster.
Definition: clusterer.cpp:1184
void clear(void)
Removes all elements from the cluster.
Definition: clusterer.hpp:92
size_t size(void) const
Get cluster size.
Definition: clusterer.hpp:118
TPhyTreeNode * m_Tree
Cluster tree root.
Definition: clusterer.hpp:153
void AddElement(int el)
Add element to the cluster.
Definition: clusterer.hpp:88
vector< int > m_Elements
List of indeces of cluster elements.
Definition: clusterer.hpp:148
int operator[](size_t index) const
Get element.
Definition: clusterer.cpp:1173
vector< double > m_DistToRoot
Distances between cluster elements and tree root.
Definition: clusterer.hpp:156
void SetMaxDistance(double dist)
Set maximum distance in cluster.
Definition: clusterer.hpp:108
vector< TPhyTreeNode * > & GetTrees(void)
Get list of trees for clusters.
Definition: clusterer.hpp:297
list< int > m_UnusedEntries
Definition: clusterer.hpp:386
void ComputeClustersFromLinks(void)
Compute clusters using graph of distances between elements.
Definition: clusterer.cpp:621
int GetClusterId(int elem) const
Find id of cluster to which given element belongs.
Definition: clusterer.cpp:1053
bool x_CanAddElem(int cluster_id, int elem, double &dist) const
Check whether element can be added to the cluster.
Definition: clusterer.cpp:922
void ReleaseTrees(vector< TPhyTreeNode * > &trees)
Get list of trees for clusters and release ownership to caller.
Definition: clusterer.cpp:1073
@ eDist
Clusters can be joined if there is a link between at least one pair of elements.
Definition: clusterer.hpp:69
@ eClique
Clusters can be joined if there is a link between all pairs of their elements.
Definition: clusterer.hpp:67
bool m_MakeTrees
Definition: clusterer.hpp:388
EDistMethod
Method for computing distance between clusters.
Definition: clusterer.hpp:60
@ eCompleteLinkage
Maximum distance between elements.
Definition: clusterer.hpp:61
@ eAverageLinkage
Avegrae distance between elements.
Definition: clusterer.hpp:62
TPhyTreeNode * ReleaseTree(int index=0)
Get cluster tree and release ownership to caller.
Definition: clusterer.cpp:1101
bool m_ReportSingletons
Definition: clusterer.hpp:389
shared_ptr< TDistMatrix > m_DistMatrix
Definition: clusterer.hpp:378
void Reset(void)
Clear clusters and distance matrix.
Definition: clusterer.cpp:1148
const TSingleCluster & GetSingleCluster(size_t index) const
Get list of elements of a specified cluster.
Definition: clusterer.cpp:130
CNcbiMatrix< double > TDistMatrix
Definition: clusterer.hpp:56
CSingleCluster TSingleCluster
Definition: clusterer.hpp:159
const TPhyTreeNode * GetTree(int index=0) const
Get tree for specific cluster.
Definition: clusterer.cpp:1090
void x_JoinElements(const CLinks::SLink &link)
Join two elements and form a cluster.
Definition: clusterer.cpp:789
vector< int > m_ClusterId
Definition: clusterer.hpp:385
void SetDistMatrix(const TDistMatrix &dmat)
Set new distance matrix.
Definition: clusterer.cpp:97
const TDistMatrix & GetDistMatrix(void) const
Get distance matrix.
Definition: clusterer.cpp:118
void SetLinks(CRef< CLinks > links)
Set distance links.
Definition: clusterer.cpp:113
void x_JoinClustElem(int cluster_id, int elem, double dist)
Add element to a cluster.
Definition: clusterer.cpp:878
void Run(void)
Cluster elements.
Definition: clusterer.cpp:1157
~CClusterer()
Destructor.
Definition: clusterer.cpp:84
void x_JoinClusters(int cluster1_id, int cluster2_id, double dist)
Join two clusters.
Definition: clusterer.cpp:948
double m_MaxDiameter
Definition: clusterer.hpp:381
void ComputeClusters(double max_diam, EDistMethod dist_method=eCompleteLinkage, bool do_trees=true, double infinity=-1.0)
Compute clusters.
Definition: clusterer.cpp:272
void x_Init(void)
Initialize parameters.
Definition: clusterer.cpp:89
vector< TSingleCluster > TClusters
Definition: clusterer.hpp:160
bool x_CanJoinClusters(int cluster1_id, int cluster2_id, double &dist) const
Check whether two clusters can be joined.
Definition: clusterer.cpp:1024
void x_CreateCluster(int elem)
Create one-element cluster.
Definition: clusterer.cpp:844
vector< TPhyTreeNode * > m_Trees
Definition: clusterer.hpp:380
TClusters m_Clusters
Definition: clusterer.hpp:379
EClustMethod m_LinkMethod
Definition: clusterer.hpp:382
void SetPrototypes(void)
Set prototypes for all clusters as center elements.
Definition: clusterer.cpp:1115
void PurgeDistMatrix(void)
Delete distance matrix.
Definition: clusterer.hpp:323
CClusterer(void)
Create empty clusterer.
Definition: clusterer.cpp:51
CRef< CLinks > m_Links
Definition: clusterer.hpp:384
void GetClusterDistMatrix(int index, TDistMatrix &mat) const
Get distance matrix for elements of a selected cluster.
Definition: clusterer.cpp:1123
void Resize(size_t i, size_t j, T val=T())
resize this matrix, filling the empty cells with a known value
Definition: matrix.hpp:390
iterator end()
Definition: matrix.hpp:332
size_t GetRows() const
get the number of rows in this matrix
Definition: matrix.hpp:298
size_t GetCols() const
get the number of columns in this matrix
Definition: matrix.hpp:305
iterator begin()
iterators
Definition: matrix.hpp:325
definition of a Culling tree
Definition: ncbi_tree.hpp:100
USING_SCOPE(cobalt)
static void s_CheckDistMatrix(const CClusterer::TDistMatrix &dmat)
Definition: clusterer.cpp:43
static void s_PurgeTrees(vector< TPhyTreeNode * > &trees)
Definition: clusterer.cpp:75
TPhyTreeNode * s_CreateTreeLeaf(int id)
Definition: clusterer.cpp:259
static double s_FindDistAsMax(const CClusterer::TSingleCluster &cluster1, const CClusterer::TSingleCluster &cluster2, const CClusterer::TDistMatrix &dmat)
Definition: clusterer.cpp:163
static double s_FindMean(const vector< double > &vals)
Find mean.
Definition: clusterer.cpp:222
USING_NCBI_SCOPE
Definition: clusterer.cpp:40
static double s_FindDistAsMean(const CClusterer::TSingleCluster &cluster1, const CClusterer::TSingleCluster &cluster2, const CClusterer::TDistMatrix &dmat)
Find mean distance between cluster elements.
Definition: clusterer.cpp:202
static double s_FindDist(const CClusterer::CSingleCluster &cluster, const CClusterer::CSingleCluster &included_cluster, const CClusterer::CSingleCluster &extended_cluster, const CClusterer::TDistMatrix &dmat, double current_dist, CClusterer::EDistMethod dist_method)
Find distance between two clusters (cluster and included_cluster + extended_cluster) using given dist...
Definition: clusterer.cpp:237
static double s_FindSumDistFromElem(int elem, const CClusterer::TSingleCluster &cluster, const CClusterer::TDistMatrix &dmat)
Find sum of distances between given element and cluster elements.
Definition: clusterer.cpp:184
static double s_FindMaxDistFromElem(int elem, const CClusterer::TSingleCluster &cluster, const CClusterer::TDistMatrix &dmat)
Definition: clusterer.cpp:142
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5078
void AddNode(TTreeType *subnode)
Add new subnode.
Definition: ncbi_tree.hpp:743
const TValue & GetValue(void) const
Return node's value.
Definition: ncbi_tree.hpp:184
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
const int infinity
Definition: nucprot.cpp:52
int i
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::SIZE size
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
CTreeNode< CPhyNodeData > TPhyTreeNode
Definition: phy_node.hpp:81
#define _ASSERT
else result
Definition: token2.c:20
Modified on Fri Sep 20 14:58:04 2024 by modify_doxy.py rev. 669887