NCBI C++ ToolKit
vector_score.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef ALGO_TEXT___VECTOR_SCORE__HPP
2 #define ALGO_TEXT___VECTOR_SCORE__HPP
3 
4 /* $Id: vector_score.hpp 47451 2010-10-08 00:12:53Z dicuccio $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors: Mike DiCuccio
30  *
31  * File Description:
32  *
33  */
34 
35 #include <corelib/ncbistd.hpp>
36 #include <math.h>
37 
39 
40 
41 ///
42 /// Cosine similarity measure
43 ///
44 
45 template <class iterator1, class iterator2>
46 float Cosine(iterator1 iter1, iterator1 end1,
47  iterator2 iter2, iterator2 end2)
48 {
49  float cosine = 0;
50  float len_a = 0;
51  float len_b = 0;
52 
53  for ( ; iter1 != end1 && iter2 != end2; ) {
54  if (iter1->first == iter2->first) {
55  cosine += float(iter1->second) * float(iter2->second);
56  len_a += iter1->second * iter1->second;
57  len_b += iter2->second * iter2->second;
58  ++iter1;
59  ++iter2;
60  } else {
61  if (iter1->first < iter2->first) {
62  len_a += iter1->second * iter1->second;
63  ++iter1;
64  } else {
65  len_b += iter2->second * iter2->second;
66  ++iter2;
67  }
68  }
69  }
70 
71  for ( ; iter1 != end1; ++iter1) {
72  len_a += iter1->second * iter1->second;
73  }
74 
75  for ( ; iter2 != end2; ++iter2) {
76  len_b += iter2->second * iter2->second;
77  }
78 
79  cosine /= sqrt(len_a * len_b);
80 
81  return cosine;
82 }
83 
84 ///
85 /// Minkowski similarity measure
86 ///
87 
88 template <class iterator1, class iterator2>
89 float Minkowski(iterator1 iter1, iterator1 end1,
90  iterator2 iter2, iterator2 end2,
91  size_t power)
92 {
93  float mink = 0;
94  float len_a = 0;
95  float len_b = 0;
96 
97  for ( ; iter1 != end1 && iter2 != end2; ) {
98  if (iter1->first == iter2->first) {
99  mink += float(iter1->second) * float(iter2->second);
100  len_a += pow(iter1->second, power);
101  len_b += pow(iter2->second, power);
102  ++iter1;
103  ++iter2;
104  } else {
105  if (iter1->first < iter2->first) {
106  len_a += pow(iter1->second, power);
107  ++iter1;
108  } else {
109  len_b += pow(iter2->second, power);
110  ++iter2;
111  }
112  }
113  }
114 
115  for ( ; iter1 != end1; ++iter1) {
116  len_a += pow(iter1->second, power);
117  }
118 
119  for ( ; iter2 != end2; ++iter2) {
120  len_b += pow(iter2->second, power);
121  }
122 
123  mink /= pow(len_a * len_b, 1.0f / float(power));
124  return mink;
125 }
126 
127 
128 ///
129 /// Dot-product similarity
130 ///
131 template <class iterator1, class iterator2>
132 float Dot(iterator1 iter1, iterator1 end1,
133  iterator2 iter2, iterator2 end2)
134 {
135  float dot = 0;
136 
137  for ( ; iter1 != end1 && iter2 != end2; ) {
138  if (iter1->first == iter2->first) {
139  dot += float(iter1->second) * float(iter2->second);
140  ++iter1;
141  ++iter2;
142  } else {
143  if (iter1->first < iter2->first) {
144  ++iter1;
145  } else {
146  ++iter2;
147  }
148  }
149  }
150 
151  return dot;
152 }
153 
154 
155 ///
156 /// Euclidean distance measure
157 ///
158 
159 template <class iterator1, class iterator2>
160 float Distance(iterator1 iter1, iterator1 end1,
161  iterator2 iter2, iterator2 end2)
162 {
163  float dist = 0;
164  for ( ; iter1 != end1 && iter2 != end2; ) {
165  if (iter1->first == iter2->first) {
166  float diff = float(iter1->second) - iter2->second;
167  dist += diff * diff;
168  ++iter1;
169  ++iter2;
170  } else {
171  if (iter1->first < iter2->first) {
172  dist += iter1->second * iter1->second;
173  ++iter1;
174  } else {
175  dist += iter2->second * iter2->second;
176  ++iter2;
177  }
178  }
179  }
180 
181  for ( ; iter1 != end1; ++iter1) {
182  dist += iter1->second * iter1->second;
183  }
184 
185  for ( ; iter2 != end2; ++iter2) {
186  dist += iter2->second * iter2->second;
187  }
188 
189  return sqrt(dist);
190 }
191 
192 
193 ///
194 /// Dot and distance in one step
195 ///
196 
197 template <class iterator1, class iterator2>
198 void DotAndDistance(iterator1 iter1, iterator1 end1,
199  iterator2 iter2, iterator2 end2,
200  float* dot_in, float* dist_in)
201 {
202  float dot = 0;
203  float dist = 0;
204  for ( ; iter1 != end1 && iter2 != end2; ) {
205  if (iter1->first == iter2->first) {
206  float diff = iter1->second - iter2->second;
207  dist += diff * diff;
208  dot += iter1->second * iter2->second;
209 
210  ++iter1;
211  ++iter2;
212  } else {
213  if (iter1->first < iter2->first) {
214  dist += iter1->second * iter1->second;
215  ++iter1;
216  } else {
217  dist += iter2->second * iter2->second;
218  ++iter2;
219  }
220  }
221  }
222 
223  for ( ; iter1 != end1; ++iter1) {
224  dist += iter1->second * iter1->second;
225  }
226 
227  for ( ; iter2 != end2; ++iter2) {
228  dist += iter2->second * iter2->second;
229  }
230 
231  if (dot_in) {
232  *dot_in = dot;
233  }
234 
235  if (dist_in) {
236  *dist_in = sqrt(dist);
237  }
238 }
239 
240 
241 
242 
243 ///
244 /// Jaccard similarity
245 ///
246 
247 template <class iterator1, class iterator2>
248 float Jaccard(iterator1 iter1, iterator1 end1,
249  iterator2 iter2, iterator2 end2)
250 {
251  float dot = 0;
252  float score_a = 0;
253  float score_b = 0;
254 
255  for ( ; iter1 != end1 && iter2 != end2; ) {
256  if (iter1->first == iter2->first) {
257  float v1 = float(iter1->second);
258  float v2 = float(iter2->second);
259 
260  dot += v1 * v2;
261  score_a += v1 * v1;
262  score_b += v2 * v2;
263 
264  ++iter1;
265  ++iter2;
266  } else {
267  if (iter1->first < iter2->first) {
268  score_a += iter1->second * iter1->second;
269  ++iter1;
270  } else {
271  score_b += iter2->second * iter2->second;
272  ++iter2;
273  }
274  }
275  }
276 
277  for ( ; iter1 != end1; ++iter1) {
278  score_a += iter1->second * iter1->second;
279  }
280 
281  for ( ; iter2 != end2; ++iter2) {
282  score_b += iter2->second * iter2->second;
283  }
284 
285  return (dot / (score_a + score_b - dot));
286 }
287 
288 
289 ///
290 /// Dice coefficient
291 ///
292 
293 template <class iterator1, class iterator2>
294 float Dice(iterator1 iter1, iterator1 end1,
295  iterator2 iter2, iterator2 end2)
296 {
297  float dot = 0;
298  float score_a = 0;
299  float score_b = 0;
300 
301  for ( ; iter1 != end1 && iter2 != end2; ) {
302  if (iter1->first == iter2->first) {
303  float v1 = float(iter1->second);
304  float v2 = float(iter2->second);
305 
306  dot += v1 * v2;
307  score_a += iter1->second;
308  score_b += iter2->second;
309 
310  ++iter1;
311  ++iter2;
312  } else {
313  if (iter1->first < iter2->first) {
314  score_a += iter1->second;
315  ++iter1;
316  } else {
317  score_b += iter2->second;
318  ++iter2;
319  }
320  }
321  }
322 
323  for ( ; iter1 != end1; ++iter1) {
324  score_a += iter1->second;
325  }
326 
327  for ( ; iter2 != end2; ++iter2) {
328  score_b += iter2->second;
329  }
330 
331  return (dot / (score_a + score_b));
332 }
333 
334 
335 ///
336 /// Overlap measure
337 ///
338 
339 template <class iterator1, class iterator2>
340 float Overlap(iterator1 iter1, iterator1 end1,
341  iterator2 iter2, iterator2 end2)
342 {
343  float dot = 0;
344  float sum_a = 0;
345  float sum_b = 0;
346  for ( ; iter1 != end1 && iter2 != end2; ) {
347  if (iter1->first == iter2->first) {
348  dot += float(iter1->second) * float(iter2->second);
349  sum_a += iter1->second;
350  sum_b += iter2->second;
351  ++iter1;
352  ++iter2;
353  } else {
354  if (iter1->first < iter2->first) {
355  sum_a += iter1->second;
356  ++iter1;
357  } else {
358  sum_b += iter2->second;
359  ++iter2;
360  }
361  }
362  }
363 
364  for ( ; iter1 != end1; ++iter1) {
365  sum_a += iter1->second;
366  }
367 
368  for ( ; iter2 != end2; ++iter2) {
369  sum_b += iter2->second;
370  }
371 
372  return dot / min(sum_a, sum_b);
373 }
374 
375 
377 
378 #endif // ALGO_TEXT___VECTOR_SCORE__HPP
Include a standard set of the NCBI C++ Toolkit most basic headers.
const CVect2< U > & v2
Definition: globals.hpp:440
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
T min(T x_, T y_)
float Dice(iterator1 iter1, iterator1 end1, iterator2 iter2, iterator2 end2)
Dice coefficient.
float Distance(iterator1 iter1, iterator1 end1, iterator2 iter2, iterator2 end2)
Euclidean distance measure.
float Dot(iterator1 iter1, iterator1 end1, iterator2 iter2, iterator2 end2)
Dot-product similarity.
float Minkowski(iterator1 iter1, iterator1 end1, iterator2 iter2, iterator2 end2, size_t power)
Minkowski similarity measure.
float Jaccard(iterator1 iter1, iterator1 end1, iterator2 iter2, iterator2 end2)
Jaccard similarity.
float Cosine(iterator1 iter1, iterator1 end1, iterator2 iter2, iterator2 end2)
Cosine similarity measure.
float Overlap(iterator1 iter1, iterator1 end1, iterator2 iter2, iterator2 end2)
Overlap measure.
void DotAndDistance(iterator1 iter1, iterator1 end1, iterator2 iter2, iterator2 end2, float *dot_in, float *dist_in)
Dot and distance in one step.
Modified on Sat Jun 22 10:38:13 2024 by modify_doxy.py rev. 669887