NCBI C++ ToolKit
clusterer.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef ALGO_COBALT___CLUSTER_METHODS__HPP
2 #define ALGO_COBALT___CLUSTER_METHODS__HPP
3 
4 /* $Id: clusterer.hpp 92103 2020-12-22 14:59:31Z boratyng $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's offical duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================*/
28 
29 /*****************************************************************************
30 
31 File name: clusterer.hpp
32 
33 Author: Greg Boratyn
34 
35 Contents: Interface for CClusterer class
36 
37 ******************************************************************************/
38 
39 
40 #include <corelib/ncbidbg.hpp>
41 #include <corelib/ncbiobj.hpp>
42 #include <util/math/matrix.hpp>
44 #include <algo/cobalt/links.hpp>
45 #include <vector>
46 
48 BEGIN_SCOPE(cobalt)
49 
50 /// Interface for CClusterer class used for clustering any type of data based
51 /// on distance matrix. The class operates on ideces in the distance matrix.
53 {
54 public:
55 
57 
58  /// Method for computing distance between clusters
59  ///
60  enum EDistMethod {
61  eCompleteLinkage = 0, ///< Maximum distance between elements
62  eAverageLinkage ///< Avegrae distance between elements
63  };
64 
65  /// Method for clustering from links
66  enum EClustMethod {
67  eClique = 0, ///< Clusters can be joined if there is a link
68  ///< between all pairs of their elements
69  eDist ///< Clusters can be joined if there is a link
70  ///< between at least one pair of elements
71  };
72 
73  /// Single cluster
75  {
76  public:
77  typedef vector<int>::const_iterator const_iterator;
78  typedef vector<int>::iterator iterator;
79 
80  public:
81  /// Create empty cluster
82  ///
83  CSingleCluster(void) : m_Prototype(-1), m_Diameter(0.0), m_Tree(NULL) {}
84 
85  /// Add element to the cluster
86  /// @param el Index of an element
87  ///
88  void AddElement(int el) {m_Elements.push_back(el);}
89 
90  /// Removes all elements from the cluster
91  ///
92  void clear(void) {m_Elements.clear();}
93 
94  /// Set cluster prototype
95  /// @param el Index of element to be used as cluster prototype
96  ///
97  /// Prototype does not have to belong to the cluster
98  void SetPrototype(int el) {m_Prototype = el;}
99 
100  /// Get cluster prototype
101  /// @return Index of element used as cluster prototype
102  ///
103  int GetPrototype(void) const {return m_Prototype;}
104 
105  /// Set maximum distance in cluster
106  /// @param dist Maximum distance
107  ///
108  void SetMaxDistance(double dist) {m_Diameter = dist;}
109 
110  /// Get maximum distance in cluster
111  /// @return Maximum distance in cluster
112  ///
113  double GetMaxDistance(void) const {return m_Diameter;}
114 
115  /// Get cluster size
116  /// @return Number of elements in the cluster
117  ///
118  size_t size(void) const {return m_Elements.size();}
119 
120  /// Find element that is closest to the center of the cluster
121  /// @param dmatrix Full distance matrix used to cluster data
122  /// @return Index of element that has smallest distance to all other
123  /// elements
124  ///
125  int FindCenterElement(const TDistMatrix& dmatrix) const;
126 
127  /// Get list of cluster elements
128  /// @return Cluster elements
129  ///
130  const vector<int>& GetElements(void) const {return m_Elements;}
131 
132  /// Get element
133  /// @param index Element index
134  /// @return Element
135  ///
136  int operator[](size_t index) const;
137 
138  // Iterators
139 
140  const_iterator begin(void) const {return m_Elements.begin();}
141  const_iterator end(void) const {return m_Elements.end();}
142  iterator begin(void) {return m_Elements.begin();}
143  iterator end(void) {return m_Elements.end();}
144 
145  protected:
146  int m_Prototype; ///< Index of cluster representative element
147  double m_Diameter; ///< Max distance between elements
148  vector<int> m_Elements; ///< List of indeces of cluster elements
149 
150  public:
151 
152  /// Cluster tree root
154 
155  /// Distances between cluster elements and tree root
156  vector<double> m_DistToRoot;
157  };
158 
160  typedef vector<TSingleCluster> TClusters;
161 
162 public:
163 
164  /// Create empty clusterer
165  ///
166  CClusterer(void);
167 
168  /// Create clusterer
169  /// @param dmat Distance matrix
170  ///
171  CClusterer(const TDistMatrix& dmat);
172 
173  /// Create clusterer
174  /// @param dmat Pointer to distance matrix
175  ///
176  CClusterer(shared_ptr<TDistMatrix>& dmat);
177 
178  /// Create clusterer
179  /// @param links Graph of distances between elements
180  ///
181  CClusterer(CRef<CLinks> links);
182 
183  /// Destructor
184  ///
185  ~CClusterer();
186 
187  /// Set new distance matrix
188  /// @param dmat Distance matrix
189  ///
190  void SetDistMatrix(const TDistMatrix& dmat);
191 
192  /// Set new distance matrix without copying
193  /// @param dmat Distance matrix
194  ///
195  void SetDistMatrix(shared_ptr<TDistMatrix>& dmat);
196 
197  /// Set distance links
198  /// @param links Distance links
199  ///
200  void SetLinks(CRef<CLinks> links);
201 
202  /// Get distance matrix
203  /// @return Distance matrix
204  ///
205  const TDistMatrix& GetDistMatrix(void) const;
206 
207  /// Set maximum diameter for single cluster
208  /// @param diam Maximum cluster diameter
209  ///
210  void SetMaxClusterDiameter(double diam) {m_MaxDiameter = diam;}
211 
212  /// Get maximum diameter for single cluster
213  /// @return Maximum cluster diameter
214  ///
215  double GetMaxClusterDiameter(void) const {return m_MaxDiameter;}
216 
217  /// Set clustering method for links
218  /// @param method Clustering method
219  ///
220  void SetClustMethod(EClustMethod method) {m_LinkMethod = method;}
221 
222  /// Get clustering method for links
223  /// @return Clustering method for links
224  ///
225  EClustMethod GetClustMethod(void) const {return m_LinkMethod;}
226 
227  /// Set make cluster tree/dendrogram option
228  /// @param trees If true cluster trees will be computed [in]
229  ///
230  void SetMakeTrees(bool trees) {m_MakeTrees = trees;}
231 
232  /// Set reporting of single element clusters
233  /// @param b If true, single element clusters will be reported [in]
234  ///
235  void SetReportSingletons(bool b) {m_ReportSingletons = b;}
236 
237  /// Get reporting mode for single element clusters
238  /// @return If true, single element clusters are reported
239  ///
240  bool GetReportSingletons(void) const {return m_ReportSingletons;}
241 
242  /// Compute clusters
243  ///
244  /// Computes complete linkage distance-based clustering with constrainted
245  /// maxium pairwise distance between cluster elements. Cluster dendrogram
246  /// can be computed for each such cluster indepenently.
247  /// @param max_dim Maximum distance between two elements in a cluster [in]
248  /// @param dist_method Method for computing distance between clusters [in]
249  /// @param do_trees If true, cluster dendrogram will be computed for each
250  /// cluster [in]
251  /// @param infinity Distance above which two single elements cannot be
252  /// joined in a cluster. They are added to exising clusters. [in]
253  ///
254  void ComputeClusters(double max_diam,
255  EDistMethod dist_method = eCompleteLinkage,
256  bool do_trees = true,
257  double infinity = -1.0);
258 
259  /// Compute clusters using graph of distances between elements
260  ///
261  void ComputeClustersFromLinks(void);
262 
263  /// Get list of elements of a specified cluster
264  /// @param index Cluster index
265  /// @return list of element indeces that belong to the cluster
266  ///
267  const TSingleCluster& GetSingleCluster(size_t index) const;
268 
269  /// Get clusters
270  /// @return list of clusters
271  ///
272  const TClusters& GetClusters(void) const {return m_Clusters;}
273 
274  /// Set clusters
275  /// @return Clusters
276  ///
277  TClusters& SetClusters(void) {return m_Clusters;}
278 
279  /// Find id of cluster to which given element belongs
280  /// @param elem Element [in]
281  /// @return Cluster numerical id
282  ///
283  int GetClusterId(int elem) const;
284 
285  /// Get list of trees for clusters
286  /// @param List of trees [out]
287  ///
288  void GetTrees(vector<TPhyTreeNode*>& trees) const;
289 
290  /// Get list of trees for clusters and release ownership to caller
291  /// @param List of trees [out]
292  ///
293  void ReleaseTrees(vector<TPhyTreeNode*>& trees);
294 
295  /// Get list of trees for clusters
296  /// @return List of trees
297  vector<TPhyTreeNode*>& GetTrees(void) {return m_Trees;}
298 
299  /// Get tree for specific cluster
300  /// @param index Cluster index [in]
301  /// @return Cluster tree
302  ///
303  const TPhyTreeNode* GetTree(int index = 0) const;
304 
305  /// Get cluster tree and release ownership to caller
306  /// @param index Cluster index [in]
307  /// @return Cluster Tree
308  ///
309  TPhyTreeNode* ReleaseTree(int index = 0);
310 
311  /// Set prototypes for all clusters as center elements
312  ///
313  void SetPrototypes(void);
314 
315  /// Get distance matrix for elements of a selected cluster
316  /// @param index Cluster index [in]
317  /// @param mat Distance matrix for cluster elements [out]
318  ///
319  void GetClusterDistMatrix(int index, TDistMatrix& mat) const;
320 
321  /// Delete distance matrix
322  ///
323  void PurgeDistMatrix(void) {m_DistMatrix.reset();}
324 
325  /// Clear clusters and distance matrix
326  ///
327  void Reset(void);
328 
329  /// Cluster elements. The clustering method is selected based on whether
330  /// distance matrix or distance links are set.
331  ///
332  void Run(void);
333 
334 protected:
335 
336  /// Forbid copy constructor
338 
339  /// Forbid assignment operator
341 
342  /// Initialize parameters
343  void x_Init(void);
344 
345  /// Join two elements and form a cluster
346  void x_JoinElements(const CLinks::SLink& link);
347 
348  /// Add element to a cluster
349  void x_JoinClustElem(int cluster_id, int elem, double dist);
350 
351  /// Join two clusters
352  void x_JoinClusters(int cluster1_id, int cluster2_id, double dist);
353 
354  /// Create one-element cluster
355  void x_CreateCluster(int elem);
356 
357  /// Check whether element can be added to the cluster. The function assumes
358  /// that there is a link between the element and at least one element
359  /// of the cluster.
360  /// @param cluster_id Cluster id [in]
361  /// @param elem Element [in]
362  /// @param dist Average distance between the element and cluster
363  /// elements [out]
364  bool x_CanAddElem(int cluster_id, int elem, double& dist) const;
365 
366  /// Check whether two clusters can be joined. The function assumes that
367  /// there is a link between at least one pair of elements from the two
368  /// clusters.
369  /// @param cluster1_id Id of the first cluster [in]
370  /// @param cluster2_id Id of the second cluster [in]
371  /// @param dist Average distance between all pairs of elements (x, y), such
372  /// that x belongs to cluster1 and y to cluster2 [out]
373  bool x_CanJoinClusters(int cluster1_id, int cluster2_id,
374  double& dist) const;
375 
376 
377 protected:
378  shared_ptr<TDistMatrix> m_DistMatrix;
380  vector<TPhyTreeNode*> m_Trees;
383 
385  vector<int> m_ClusterId;
386  list<int> m_UnusedEntries;
387 
390 };
391 
392 
393 /// Exceptions for CClusterer class
395 {
396 public:
397 
398  /// Error codes
399  enum EErrCode {
400  eClusterIndexOutOfRange, ///< Index of cluster out of range
401  eElemIndexOutOfRange, ///< Index of cluster element out of range
402  eElementOutOfRange, ///< Cluster element is larger than distance
403  ///< matrix size
404  eNoDistMatrix, ///< Distance matrix not assigned
407  };
408 
410 };
411 
412 
413 END_SCOPE(cobalt)
415 
416 #endif /* ALGO_COBALT___CLUSTER_METHODS__HPP */
417 
Exceptions for CClusterer class.
Definition: clusterer.hpp:395
NCBI_EXCEPTION_DEFAULT(CClustererException, CException)
EErrCode
Error codes.
Definition: clusterer.hpp:399
@ eElementOutOfRange
Cluster element is larger than distance matrix size.
Definition: clusterer.hpp:402
@ eElemIndexOutOfRange
Index of cluster element out of range.
Definition: clusterer.hpp:401
@ eClusterIndexOutOfRange
Index of cluster out of range.
Definition: clusterer.hpp:400
@ eNoDistMatrix
Distance matrix not assigned.
Definition: clusterer.hpp:404
CSingleCluster(void)
Create empty cluster.
Definition: clusterer.hpp:83
void SetPrototype(int el)
Set cluster prototype.
Definition: clusterer.hpp:98
const_iterator begin(void) const
Definition: clusterer.hpp:140
void clear(void)
Removes all elements from the cluster.
Definition: clusterer.hpp:92
int GetPrototype(void) const
Get cluster prototype.
Definition: clusterer.hpp:103
size_t size(void) const
Get cluster size.
Definition: clusterer.hpp:118
TPhyTreeNode * m_Tree
Cluster tree root.
Definition: clusterer.hpp:153
void AddElement(int el)
Add element to the cluster.
Definition: clusterer.hpp:88
double m_Diameter
Max distance between elements.
Definition: clusterer.hpp:147
vector< int > m_Elements
List of indeces of cluster elements.
Definition: clusterer.hpp:148
int m_Prototype
Index of cluster representative element.
Definition: clusterer.hpp:146
const vector< int > & GetElements(void) const
Get list of cluster elements.
Definition: clusterer.hpp:130
vector< int >::iterator iterator
Definition: clusterer.hpp:78
vector< int >::const_iterator const_iterator
Definition: clusterer.hpp:77
double GetMaxDistance(void) const
Get maximum distance in cluster.
Definition: clusterer.hpp:113
vector< double > m_DistToRoot
Distances between cluster elements and tree root.
Definition: clusterer.hpp:156
const_iterator end(void) const
Definition: clusterer.hpp:141
void SetMaxDistance(double dist)
Set maximum distance in cluster.
Definition: clusterer.hpp:108
Interface for CClusterer class used for clustering any type of data based on distance matrix.
Definition: clusterer.hpp:53
vector< TPhyTreeNode * > & GetTrees(void)
Get list of trees for clusters.
Definition: clusterer.hpp:297
list< int > m_UnusedEntries
Definition: clusterer.hpp:386
void SetMaxClusterDiameter(double diam)
Set maximum diameter for single cluster.
Definition: clusterer.hpp:210
double GetMaxClusterDiameter(void) const
Get maximum diameter for single cluster.
Definition: clusterer.hpp:215
EClustMethod
Method for clustering from links.
Definition: clusterer.hpp:66
bool m_MakeTrees
Definition: clusterer.hpp:388
EDistMethod
Method for computing distance between clusters.
Definition: clusterer.hpp:60
bool m_ReportSingletons
Definition: clusterer.hpp:389
shared_ptr< TDistMatrix > m_DistMatrix
Definition: clusterer.hpp:378
CClusterer & operator=(const CClusterer &)
Forbid assignment operator.
void SetReportSingletons(bool b)
Set reporting of single element clusters.
Definition: clusterer.hpp:235
CNcbiMatrix< double > TDistMatrix
Definition: clusterer.hpp:56
EClustMethod GetClustMethod(void) const
Get clustering method for links.
Definition: clusterer.hpp:225
CSingleCluster TSingleCluster
Definition: clusterer.hpp:159
bool GetReportSingletons(void) const
Get reporting mode for single element clusters.
Definition: clusterer.hpp:240
void SetMakeTrees(bool trees)
Set make cluster tree/dendrogram option.
Definition: clusterer.hpp:230
vector< int > m_ClusterId
Definition: clusterer.hpp:385
const TClusters & GetClusters(void) const
Get clusters.
Definition: clusterer.hpp:272
double m_MaxDiameter
Definition: clusterer.hpp:381
void SetClustMethod(EClustMethod method)
Set clustering method for links.
Definition: clusterer.hpp:220
CClusterer(const CClusterer &)
Forbid copy constructor.
vector< TSingleCluster > TClusters
Definition: clusterer.hpp:160
vector< TPhyTreeNode * > m_Trees
Definition: clusterer.hpp:380
TClusters & SetClusters(void)
Set clusters.
Definition: clusterer.hpp:277
TClusters m_Clusters
Definition: clusterer.hpp:379
CClusterer(shared_ptr< TDistMatrix > &dmat)
Create clusterer.
EClustMethod m_LinkMethod
Definition: clusterer.hpp:382
void PurgeDistMatrix(void)
Delete distance matrix.
Definition: clusterer.hpp:323
CRef< CLinks > m_Links
Definition: clusterer.hpp:384
CObject –.
Definition: ncbiobj.hpp:180
definition of a Culling tree
Definition: ncbi_tree.hpp:100
EDistMethod
Definition: cuDistmat.hpp:60
#define NULL
Definition: ncbistd.hpp:225
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
void Run(void)
Enter the main loop.
#define NCBI_COBALT_EXPORT
Definition: ncbi_export.h:977
const int infinity
Definition: nucprot.cpp:52
NCBI C++ auxiliary debug macros.
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
Modified on Wed Jul 24 17:17:29 2024 by modify_doxy.py rev. 669887