NCBI C++ ToolKit
dictionary_util.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: dictionary_util.cpp 99056 2023-02-08 14:10:06Z ivanov $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Mike DiCuccio
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <util/static_map.hpp>
34 #include <util/dictionary_util.hpp>
35 
37 
38 
39 // maximum internal size for metaphone computation
40 // this is used to determine a heap vs stack allocation cutoff in the exact
41 // edit distance method; as CSimpleDictionary relies heavily on edit distance
42 // computations, the size of its internal metaphone keys is tuned by this
43 // parameter.
44 static const size_t kMaxMetaphoneStack = 10;
45 
46 
47 void CDictionaryUtil::GetMetaphone(const string& in, string* out,
48  size_t max_chars)
49 {
50  _ASSERT(out);
51  out->erase();
52  if (in.empty()) {
53  return;
54  }
55 
56  CTempString vowels("aeiou");
57 
58  ITERATE (string, iter, in) {
59  size_t prev_len = iter - in.begin();
60  size_t remaining = in.length() - prev_len - 1;
61 
62  if (prev_len &&
63  tolower((unsigned char)(*iter)) == tolower((unsigned char)(*(iter - 1))) &&
64  tolower((unsigned char)(*iter)) != 'c') {
65  continue;
66  }
67  switch (tolower((unsigned char)(*iter))) {
68  case 'a':
69  case 'e':
70  case 'i':
71  case 'o':
72  case 'u':
73  if ( !prev_len ) {
74  *out += 'a';
75  ++max_chars;
76  }
77  break;
78 
79  case 'b':
80  if (remaining != 0 ||
81  !prev_len ||
82  *(iter - 1) != 'm') {
83  *out += 'p';
84  }
85 
86  break;
87 
88  case 'f':
89  case 'j':
90  case 'l':
91  case 'n':
92  case 'r':
93  *out += (char)tolower((unsigned char)(*iter));
94  break;
95 
96  case 'c':
97  if (remaining > 2 &&
98  *(iter + 1) == 'i' &&
99  *(iter + 2) == 'a') {
100  *out += 'x';
101  iter += 2;
102  break;
103  }
104 
105  if (remaining > 1 && *(iter + 1) == 'h') {
106  *out += 'x';
107  ++iter;
108  break;
109  }
110 
111  if (remaining &&
112  ( *(iter + 1) == 'e' ||
113  *(iter + 1) == 'i' ||
114  *(iter + 1) == 'y' ) ) {
115  *out += 's';
116  ++iter;
117  break;
118  }
119 
120  if (remaining && *(iter + 1) == 'k') {
121  ++iter;
122  }
123  *out += 'k';
124  break;
125 
126  case 'd':
127  if (remaining >= 2 && prev_len) {
128  if ( *(iter + 1) == 'g' &&
129  ( *(iter + 2) == 'e' ||
130  *(iter + 2) == 'i' ||
131  *(iter + 2) == 'y' ) ) {
132  *out += 'j';
133  iter += 2;
134  break;
135  }
136  }
137  *out += 't';
138  break;
139 
140  case 'g':
141  if (remaining == 1 && *(iter + 1) == 'h') {
142  if (prev_len > 2 && ( *(iter - 3) == 'b' ||
143  *(iter - 3) == 'd') ) {
144  *out += 'k';
145  ++iter;
146  break;
147  }
148 
149  if (prev_len > 3 && *(iter - 3) == 'h') {
150  *out += 'k';
151  ++iter;
152  break;
153  }
154  if (prev_len > 4 && *(iter - 4) == 'h') {
155  *out += 'k';
156  ++iter;
157  break;
158  }
159 
160  *out += 'f';
161  ++iter;
162  break;
163  }
164 
165  if (remaining == 1 &&
166  (*(iter + 1) == 'n' || *(iter + 1) == 'm')) {
167  ++iter;
168  break;
169  }
170 
171  if (remaining && !prev_len && *(iter + 1) == 'n') {
172  ++iter;
173  *out += 'n';
174  break;
175  }
176 
177  if (remaining == 3 &&
178  *(iter + 1) == 'n' &&
179  *(iter + 1) == 'e' &&
180  *(iter + 1) == 'd') {
181  iter += 3;
182  break;
183  }
184 
185  if ( (remaining > 1 && *(iter + 1) == 'e') ||
186  (remaining && ( *(iter + 1) == 'i' ||
187  *(iter + 1) == 'y' ) ) ) {
188  *out += 'j';
189  ++iter;
190  break;
191  }
192 
193  *out += 'k';
194  break;
195 
196  case 'h':
197  if (remaining && prev_len &&
198  vowels.find(*(iter + 1)) != string::npos &&
199  CTempString("cgpst").find(*(iter - 1)) == string::npos) {
200  *out += (char)tolower((unsigned char)(*iter));
201  ++iter;
202  }
203  else if ( !prev_len ) {
204  *out += (char)tolower((unsigned char)(*iter));
205  }
206  break;
207 
208  case 'm':
209  case 'k':
210  if (!prev_len && remaining && *(iter + 1) == 'n') {
211  ++iter;
212  *out += 'n';
213  break;
214  }
215  *out += (char)tolower((unsigned char)(*iter));
216  break;
217 
218  case 'p':
219  if (prev_len == 0 && remaining && *(iter + 1) == 'n') {
220  ++iter;
221  *out += 'n';
222  break;
223  }
224  if (remaining && *(iter + 1) == 'h') {
225  *out += 'f';
226  break;
227  }
228  *out += (char)tolower((unsigned char)(*iter));
229  break;
230 
231  case 'q':
232  *out += 'k';
233  break;
234 
235  case 's':
236  if (remaining > 2 &&
237  *(iter + 1) == 'i' &&
238  ( *(iter + 2) == 'o' ||
239  *(iter + 2) == 'a' ) ) {
240  iter += 2;
241  *out += 'x';
242  break;
243  }
244  if (remaining && *(iter + 1) == 'h') {
245  *out += 'x';
246  ++iter;
247  break;
248  }
249  if (remaining > 2 &&
250  *(iter + 1) == 'c' &&
251  ( *(iter + 2) == 'e' ||
252  *(iter + 2) == 'i' ||
253  *(iter + 2) == 'y' ) ) {
254  iter += 2;
255  }
256  *out += 's';
257  break;
258 
259  case 't':
260  if (remaining > 2 &&
261  *(iter + 1) == 'i' &&
262  ( *(iter + 2) == 'o' ||
263  *(iter + 2) == 'a' ) ) {
264  iter += 2;
265  *out += 'x';
266  break;
267  }
268  if (remaining && *(iter + 1) == 'h') {
269  *out += 'o';
270  ++iter;
271  break;
272  }
273  *out += (char)tolower((unsigned char)(*iter));
274  break;
275 
276  case 'v':
277  *out += 'f';
278  break;
279 
280  case 'w':
281  if ( !prev_len ) {
282  if (remaining && ( *(iter + 1) == 'h' ||
283  *(iter + 1) == 'r') ) {
284  *out += *(iter + 1);
285  ++iter;
286  break;
287  }
288  *out += (char)tolower((unsigned char)(*iter));
289  break;
290  }
291 
292  if ( *(iter - 1) == 'a' ||
293  *(iter - 1) == 'e' ||
294  *(iter - 1) == 'i' ||
295  *(iter - 1) == 'o' ||
296  *(iter - 1) == 'u') {
297  *out += (char)tolower((unsigned char)(*iter));
298  }
299  break;
300 
301  case 'x':
302  *out += "ks";
303  break;
304 
305  case 'y':
306  if (remaining && prev_len &&
307  ( *(iter + 1) == 'a' ||
308  *(iter + 1) == 'e' ||
309  *(iter + 1) == 'i' ||
310  *(iter + 1) == 'o' ||
311  *(iter + 1) == 'u')) {
312  break;
313  }
314  *out += (char)tolower((unsigned char)(*iter));
315  break;
316 
317  case 'z':
318  *out += 's';
319  break;
320  }
321 
322  if (out->length() == max_chars) {
323  break;
324  }
325 
326  }
327 
328  //_TRACE("GetMetaphone(): " << in << " -> " << *out);
329 }
330 
331 
332 void CDictionaryUtil::GetSoundex(const string& in, string* out,
333  size_t max_chars, char pad_char)
334 {
335  static const char sc_SoundexLut[256] = {
336  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
337  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
338  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
339  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
340  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
341  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
342  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
343  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
344  0x00, 0x00, '1', '2', '3', 0x00, '1', '2',
345  0x00, 0x00, '2', '2', '4', '5', '5', 0x00,
346  '1', '2', '6', '2', '3', 0x00, '1', 0x00,
347  '2', 0x00, '2', 0x00, 0x00, 0x00, 0x00, 0x00,
348  0x00, 0x00, '1', '2', '3', 0x00, '1', '2',
349  0x00, 0x00, '2', '2', '4', '5', '5', 0x00,
350  '1', '2', '6', '2', '3', 0x00, '1', 0x00,
351  '2', 0x00, '2', 0x00, 0x00, 0x00, 0x00, 0x00,
352  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
353  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
354  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
355  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
356  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
357  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
358  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
359  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
360  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
361  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
362  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
363  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
364  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
365  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
366  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
367  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
368  };
369 
370  // basic sanity
371  out->erase();
372  if (in.empty()) {
373  return;
374  }
375 
376  // preserve the first character, in upper case
377  string::const_iterator iter = in.begin();
378  *out += (char)toupper((unsigned char)(*iter));
379 
380  // now, iterate substituting codes, using no more than four characters
381  // total
382  ITERATE (string, iter2, in) {
383  char c = sc_SoundexLut[(int)(unsigned char)*iter2];
384  if (c && *(out->end() - 1) != c) {
385  *out += c;
386  if (out->length() == max_chars) {
387  break;
388  }
389  }
390  }
391 
392  // pad with our pad character
393  if (out->length() < max_chars) {
394  *out += string(max_chars - out->length(), pad_char);
395  }
396 }
397 
398 
399 size_t CDictionaryUtil::GetEditDistance(const string& str1,
400  const string& str2,
401  EDistanceMethod method)
402 {
403  switch (method) {
405  {{
406  /// it lgically makes no difference which string
407  /// we look at as the master; we choose the shortest to make
408  /// some of the logic work better (also, it yields more accurate
409  /// results)
410  const string* pstr1 = &str1;
411  const string* pstr2 = &str2;
412  if (pstr1->length() > pstr2->length()) {
413  swap(pstr1, pstr2);
414  }
415  size_t dist = 0;
416  string::const_iterator iter1 = pstr1->begin();
417  string::const_iterator iter2 = pstr2->begin();
418  for ( ; iter1 != pstr1->end() && iter2 != pstr2->end(); ) {
419  char c1_0 = (char)tolower((unsigned char)(*iter1));
420  char c2_0 = (char)tolower((unsigned char)(*iter2));
421  if (c1_0 == c2_0) {
422  /// identity: easy out
423  ++iter1;
424  ++iter2;
425  continue;
426  }
427 
428  /// we scan for a match, starting from the corner formed
429  /// as we march forward a few letters. We use a maximum
430  /// of 3 letters as our limit
431  int max_radius = (int)min(pstr1->end() - iter1,
432  string::difference_type(3));
433 
434  string::const_iterator best_iter1 = iter1 + 1;
435  string::const_iterator best_iter2 = iter2 + 1;
436  size_t cost = 1;
437 
438  for (int radius = 1; radius <= max_radius; ++radius) {
439 
440  char corner1 = *(iter1 + radius);
441  char corner2 = *(iter2 + radius);
442  bool match = false;
443  for (int i = radius; i >= 0; --i) {
444  c1_0 = (char)tolower((unsigned char)(*(iter1 + i)));
445  c2_0 = (char)tolower((unsigned char)(*(iter2 + i)));
446  if (c1_0 == corner2) {
447  match = true;
448  cost = radius;
449  best_iter1 = iter1 + i;
450  best_iter2 = iter2 + radius;
451  break;
452  }
453  if (c2_0 == corner1) {
454  match = true;
455  cost = radius;
456  best_iter1 = iter1 + radius;
457  best_iter2 = iter2 + i;
458  break;
459  }
460  }
461  if (match) {
462  break;
463  }
464  }
465 
466  dist += cost;
467  iter1 = best_iter1;
468  iter2 = best_iter2;
469  }
470  dist += (pstr1->end() - iter1) + (pstr2->end() - iter2);
471  return dist;
472  }}
473 
474  case eEditDistance_Exact:
475  {{
476  size_t buf0[kMaxMetaphoneStack + 1];
477  size_t buf1[kMaxMetaphoneStack + 1];
478  vector<size_t> row0;
479  vector<size_t> row1;
480 
481  const string* short_str = &str1;
482  const string* long_str = &str2;
483  if (long_str->size() < short_str->size()) {
484  swap(short_str, long_str);
485  }
486  size_t short_size = short_str->size();
487  size_t long_size = long_str->size();
488 
489  size_t* row0_ptr = buf0;
490  size_t* row1_ptr = buf1;
491  if (short_size > kMaxMetaphoneStack) {
492  row0.resize(short_size + 1);
493  row1.resize(short_size + 1);
494  row0_ptr = &row0[0];
495  row1_ptr = &row1[0];
496  }
497 
498  size_t i;
499  size_t j;
500 
501  //cout << " ";
502  for (i = 0; i <= short_size; ++i) {
503  //cout << (*short_str)[i] << " ";
504  row0_ptr[i] = i;
505  row1_ptr[i] = i;
506  }
507  //cout << endl;
508 
509  for (i = 0; i < long_size; ++i) {
510  row1_ptr[0] = i + 1;
511  //cout << (*long_str)[i] << " ";
512  for (j = 0; j < short_size; ++j) {
513  int c0 = tolower((unsigned char) (*short_str)[j]);
514  int c1 = tolower((unsigned char) (*long_str)[i]);
515  size_t cost = (c0 == c1 ? 0 : 1);
516  row1_ptr[j + 1] =
517  min(row0_ptr[j] + cost,
518  min(row0_ptr[j + 1] + 1, row1_ptr[j] + 1));
519  //cout << setw(2) << row1_ptr[j + 1] << " ";
520  }
521 
522  //cout << endl;
523 
524  swap(row0_ptr, row1_ptr);
525  }
526 
527  return row0_ptr[short_size];
528  }}
529  }
530 
531  // undefined
532  return (size_t)-1;
533 }
534 
535 
536 /// Compute a nearness score for two different words or phrases
537 int CDictionaryUtil::Score(const string& word1, const string& word2,
538  size_t max_metaphone)
539 {
540  string meta1;
541  string meta2;
542  GetMetaphone(word1, &meta1, max_metaphone);
543  GetMetaphone(word2, &meta2, max_metaphone);
544  return Score(word1, meta1, word2, meta2);
545 }
546 
547 
548 /// Compute a nearness score based on metaphone as well as raw distance
549 int CDictionaryUtil::Score(const string& word1, const string& meta1,
550  const string& word2, const string& meta2,
551  EDistanceMethod method)
552 {
553  // score:
554  // start with edit distance
555  size_t score = CDictionaryUtil::GetEditDistance(word1, word2, method);
556 
557  // normalize to length of word
558  // this allows negative scores to be omittied
559  score = word1.length() - score;
560 
561  // down-weight for metaphone distance
562  score -= CDictionaryUtil::GetEditDistance(meta1, meta2, method);
563 
564  // one point for first letter of word being same
565  //score += (tolower((unsigned char) word1[0]) == tolower((unsigned char) word2[0]));
566 
567  // one point for identical lengths of words
568  //score += (word1.length() == word2.length());
569 
570  return (int)score;
571 }
572 
573 
574 /////////////////////////////////////////////////////////////////////////////
575 ///
576 /// Porter's Stemming Algorithm
577 ///
578 
579 enum ECharType {
582  eVowel
583 };
584 
586 {
587 public:
589  {
590  // This cycle is processed in backward order to avoid buggy
591  // optimization by ICC 9.1 on 64-bit platforms.
592  // The optimizer calls buggy intel_fast_mem(cpy|set) even with
593  // -fno-builtin-memset -fno-builtin-memcpy.
594  for (int i = 256; i--; ) {
595  s_char_type[i] = eOther;
596  }
597 
598  for (int i = 0; i < 26; ++i) {
599  s_char_type[i + 'a'] = eConsonant;
600  s_char_type[i + 'A'] = eConsonant;
601  }
602 
603  s_char_type[(int)'a'] = eVowel;
604  s_char_type[(int)'e'] = eVowel;
605  s_char_type[(int)'i'] = eVowel;
606  s_char_type[(int)'o'] = eVowel;
607  s_char_type[(int)'u'] = eVowel;
608  }
609 
611  return s_char_type[c];
612  }
613 
614 private:
616 };
617 
618 
619 static inline ECharType s_GetCharType(int c)
620 {
621  static CSafeStatic<CFillTypes> fill_types;
622  _ASSERT(c < 256 && c >= 0);
623  return fill_types->GetChar(c);
624 }
625 
626 static inline int s_MeasureWord(string::const_iterator iter,
627  string::const_iterator end)
628 {
629  int m = 0;
630 
631  /**
632  {{
633  ECharType first_char_type = s_GetCharType(*iter);
634  while (iter != end && s_GetCharType(*iter) == first_char_type) {
635  ++iter;
636  }
637  }}
638  **/
639 
640 
641  // skip leading entities
642  ECharType prev_type = s_GetCharType(*iter);
643  for ( ; iter != end; ++iter) {
644  ECharType type = s_GetCharType(*iter);
645  if (type != prev_type) {
646  prev_type = type;
647  break;
648  }
649  }
650 
651  //prev_type = s_GetCharType(*iter);
652  for ( ; iter != end; ++iter) {
653  ECharType type = s_GetCharType(*iter);
654  if (type != prev_type) {
655  prev_type = type;
656  ++m;
657  }
658  }
659  /**
660  for ( ; iter != end; ++m) {
661  string::const_iterator prev(iter);
662  ECharType prev_type = s_GetCharType(*prev);
663  for (++iter; iter != end; ) {
664  ECharType type = s_GetCharType(*iter);
665  if (type != prev_type) {
666  break;
667  }
668  prev_type = type;
669  prev = iter++;
670  }
671  if (iter != end) {
672  prev = iter;
673  prev_type = s_GetCharType(*prev);
674  for (++iter; iter != end; ) {
675  ECharType type = s_GetCharType(*iter);
676  if (type != prev_type) {
677  break;
678  }
679  prev_type = type;
680  prev = iter++;
681  }
682  }
683  }
684  **/
685 
686  return m;
687 }
688 
689 
690 static inline bool s_EndsWith(const string& str1, const string& str2)
691 {
692  string::const_reverse_iterator iter1(str1.end());
693  string::const_reverse_iterator end1 (str1.begin());
694  string::const_reverse_iterator iter2(str2.end());
695  string::const_reverse_iterator end2 (str2.begin());
696  for ( ; iter1 != end1 && iter2 != end2; ++iter1, ++iter2) {
697  if (*iter1 != *iter2) {
698  return false;
699  }
700  }
701  return true;
702 }
703 
704 static inline bool s_EndsWith(const string& str1, const char* p)
705 {
706  string::const_reverse_iterator iter1(str1.end());
707  string::const_reverse_iterator end1 (str1.begin());
708  const char* iter2 = p + strlen(p) - 1;
709  for ( ; iter1 != end1; ++iter1, --iter2) {
710  if (*iter1 != *iter2) {
711  return false;
712  }
713  if (iter2 == p) {
714  break;
715  }
716  }
717  return true;
718 }
719 
720 static string::size_type s_FindFirstVowel(const string& str)
721 {
722  for (string::size_type i = 0; i < str.size(); ++i) {
723  if (s_GetCharType(str[i]) == eVowel) {
724  return i;
725  }
726  }
727  return string::npos;
728 }
729 
731 static inline bool s_ReplaceEnding(string& word,
732  const string& match,
733  const string& substitute,
734  int min_measure = 0)
735 {
736  if (word.length() < match.length()) {
737  return false;
738  }
739 
740  if ( !s_EndsWith(word, match) ) {
741  return false;
742  }
743 
744  if (s_MeasureWord(word.begin(),
745  word.end() - match.length()) <= min_measure) {
746  return false;
747  }
748 
749  word.erase(word.length() - match.length());
750  word += substitute;
751  return true;
752 }
753 
754 
755 static inline bool s_ReplaceEnding(string& word,
756  const char* match,
757  const char* substitute,
758  int min_measure = 0)
759 {
760  size_t match_len = strlen(match);
761  if (word.length() < match_len) {
762  return false;
763  }
764 
765  if ( !s_EndsWith(word, match) ) {
766  return false;
767  }
768 
769  if (s_MeasureWord(word.begin(),
770  word.end() - match_len) <= min_measure) {
771  return false;
772  }
773 
774  word.erase(word.length() - match_len);
775  word += substitute;
776  return true;
777 }
778 
779 
780 static inline bool s_TruncateEnding(string& word,
781  const char* match,
782  size_t new_ending_size,
783  int min_measure = 0)
784 {
785  size_t match_len = strlen(match);
786  if (word.length() < match_len) {
787  return false;
788  }
789 
790  if ( !s_EndsWith(word, match) ) {
791  return false;
792  }
793 
794  if (s_MeasureWord(word.begin(),
795  word.end() - match_len) <= min_measure) {
796  return false;
797  }
798 
799  word.erase(word.length() - match_len + new_ending_size);
800  return true;
801 }
802 
803 
804 void CDictionaryUtil::Stem(const string& in_str, string* out_str)
805 {
806  *out_str = in_str;
807  string& str = *out_str;
808 
809  // the steps outlined below follow the general scheme at:
810  //
811  // http://snowball.tartarus.org/algorithms/porter/stemmer.html
812  //
813 
814  // step 1a: common 's' endings
815  //
816  // sses -> ss
817  // ies -> i
818  // ss -> ss
819  // s ->
820  if (str[ str.length()-1 ] == 's') {
821  do {
822  //if (s_ReplaceEnding(str, "sses", "ss")) {
823  if (s_TruncateEnding(str, "sses", 2)) {
824  break;
825  }
826 
827  //if (s_ReplaceEnding(str, "ies", "i")) {
828  if (s_TruncateEnding(str, "ies", 1)) {
829  break;
830  }
831 
832  if ( !s_EndsWith(str, "ss") ) {
833  //s_ReplaceEnding(str, "s", "");
834  s_TruncateEnding(str, "s", 0);
835  }
836  }
837  while (false);
838  }
839 
840  // step 1b: ed/ing
841  //
842  // eed -> ed (not .eed)
843  // ed -> (*v*)
844  // ing -> (*v*)
845  if (s_EndsWith(str, "eed") && str.length() > 4) {
846  str.erase(str.length() - 1);
847  } else {
848  bool extra = false;
849  if (s_EndsWith(str, "ed") &&
850  s_FindFirstVowel(str) < str.length() - 3) {
851  str.erase(str.length() - 2);
852  extra = true;
853  } else if (s_EndsWith(str, "ing") &&
854  s_FindFirstVowel(str) < str.length() - 3) {
855  str.erase(str.length() - 3);
856  extra = true;
857  }
858 
859  if (extra) {
860  if (s_EndsWith(str, "at") ||
861  s_EndsWith(str, "bl") ||
862  s_EndsWith(str, "iz")) {
863  str += 'e';
864  } else if (str[str.length() - 1] != 'l' &&
865  str[str.length() - 1] != 's' &&
866  str[str.length() - 1] != 'z' &&
867  str[str.length() - 1] == str[str.length() - 2]) {
868  str.erase(str.length() - 1);
869  } else if (str.length() == 3 &&
870  s_GetCharType(str[0]) == eConsonant &&
871  s_GetCharType(str[1]) == eVowel &&
872  s_GetCharType(str[2]) == eConsonant) {
873  str += 'e';
874  }
875  }
876  }
877 
878  // step 1c: y -> i
879  if (str[str.length() - 1] == 'y' &&
880  s_FindFirstVowel(str) < str.length() - 1) {
881  str[str.length() - 1] = 'i';
882  }
883 
884  // step 2
885 
886  if (str.length() > 3) {
887  switch (str[ str.length() - 2 ]) {
888  case 'a':
889  if ( !s_ReplaceEnding(str, "ational", "ate") ) {
890  s_ReplaceEnding(str, "tional", "tion");
891  //s_TruncateEnding(str, "tional", 4);
892  }
893  break;
894 
895  case 'c':
896  if ( !s_ReplaceEnding(str, "enci", "ence") ) {
897  s_ReplaceEnding(str, "anci", "ance");
898  }
899  break;
900 
901  case 'e':
902  s_ReplaceEnding(str, "izer", "ize");
903  //s_TruncateEnding(str, "izer", 3);
904  break;
905 
906  case 'l':
907  if (str[ str.length()-1 ] == 'i' &&
908  !s_ReplaceEnding(str, "abli", "able") &&
909  !s_ReplaceEnding(str, "alli", "al") &&
910  !s_ReplaceEnding(str, "entli", "ent") &&
911  !s_ReplaceEnding(str, "eli", "e") ) {
912  s_ReplaceEnding(str, "ousli", "ous");
913  }
914  /**
915  if (str[ str.length()-1 ] == 'i' &&
916  !s_ReplaceEnding(str, "abli", "able") &&
917  !s_TruncateEnding(str, "alli", 2) &&
918  !s_TruncateEnding(str, "entli", 3) &&
919  !s_TruncateEnding(str, "eli", 1) ) {
920  s_TruncateEnding(str, "ousli", 3);
921  }
922  **/
923  break;
924 
925  case 'o':
926  if ( !s_ReplaceEnding(str, "ization", "ize") &&
927  !s_ReplaceEnding(str, "ation", "ate") ) {
928  s_ReplaceEnding(str, "ator", "ate");
929  }
930  break;
931 
932  case 's':
933  if ( !s_ReplaceEnding(str, "alism", "al") &&
934  !s_ReplaceEnding(str, "iveness", "ive") &&
935  !s_ReplaceEnding(str, "fulness", "ful") ) {
936  s_ReplaceEnding(str, "ousness", "ous");
937  }
938  /**
939  if ( !s_TruncateEnding(str, "alism", 2) &&
940  !s_TruncateEnding(str, "iveness", 3) &&
941  !s_TruncateEnding(str, "fulness", 3) ) {
942  s_TruncateEnding(str, "ousness", 3);
943  }
944  **/
945  break;
946 
947  case 't':
948  if ( !s_ReplaceEnding(str, "aliti", "al") &&
949  !s_ReplaceEnding(str, "iviti", "ive") ) {
950  s_ReplaceEnding(str, "biliti", "ble");
951  }
952  break;
953 
954  default:
955  break;
956  }
957  }
958 
959  // 'us' endings
960  //s_ReplaceEnding(str, "u", "us");
961 
962  // step 3
963  typedef SStaticPair<const char*, const char*> TReplace;
964  static const TReplace rep_step3[] = {
965  { "icate", "ic" },
966  { "ative", "" },
967  { "alize", "al" },
968  { "iciti", "ic" },
969  { "ical", "ic" },
970  { "ful", "" },
971  { "ness", "" },
972  { NULL, NULL } /// end
973  };
974  {{
975  static const char* s_Step3_Endings("eils");
976  if (CTempString(s_Step3_Endings).find(str[str.length()-1]) != string::npos) {
977  for (const TReplace* p = rep_step3; p->first; ++p) {
978  if (s_ReplaceEnding(str, p->first, p->second)) {
979  break;
980  }
981  }
982  }
983  }}
984 
985  // step 4
986  if (str.length() > 2) {
987  switch (str[ str.length() - 2]) {
988  case 'a':
989  if (str[ str.length()-1 ] == 'l') {
990  if (s_ReplaceEnding(str, "ual", "", 1)) {
991  break;
992  }
993  if (s_ReplaceEnding(str, "ial", "", 1)) {
994  break;
995  }
996  s_ReplaceEnding(str, "al", "", 1);
997  }
998  break;
999 
1000  case 'c':
1001  if (str[ str.length()-1 ] == 'e') {
1002  if ( !s_ReplaceEnding(str, "ance", "", 1) ) {
1003  s_ReplaceEnding(str, "ence", "", 1);
1004  }
1005  }
1006  break;
1007 
1008  case 'e':
1009  s_ReplaceEnding(str, "er", "", 1);
1010  break;
1011 
1012  case 'i':
1013  if (s_ReplaceEnding(str, "ix", "ic", 0)) {
1014  break;
1015  }
1016  s_ReplaceEnding(str, "ic", "", 1);
1017  break;
1018 
1019  case 'l':
1020  if ( !s_ReplaceEnding(str, "able", "", 1) ) {
1021  s_ReplaceEnding(str, "ible", "", 1);
1022  }
1023  break;
1024 
1025  case 'n':
1026  if ( !s_ReplaceEnding(str, "ant", "", 1) ) {
1027  if ( !s_ReplaceEnding(str, "ement", "", 1) ) {
1028  if ( !s_ReplaceEnding(str, "ment", "", 1) ) {
1029  s_ReplaceEnding(str, "ent", "", 1);
1030  }
1031  }
1032  }
1033  break;
1034 
1035  case 'o':
1036  if ( !s_ReplaceEnding(str, "sion", "s", 1) ) {
1037  if ( !s_ReplaceEnding(str, "tion", "t", 1) ) {
1038  s_ReplaceEnding(str, "ou", "", 1);
1039  }
1040  }
1041  break;
1042 
1043  case 's':
1044  s_ReplaceEnding(str, "ism", "", 1);
1045  break;
1046 
1047  case 't':
1048  if ( !s_ReplaceEnding(str, "ate", "", 1) ) {
1049  s_ReplaceEnding(str, "iti", "", 1);
1050  }
1051  break;
1052 
1053  case 'u':
1054  s_ReplaceEnding(str, "ous", "", 1);
1055  break;
1056 
1057  case 'v':
1058  s_ReplaceEnding(str, "ive", "", 1);
1059  break;
1060 
1061  case 'z':
1062  s_ReplaceEnding(str, "ize", "", 1);
1063  break;
1064  }
1065  }
1066 
1067  // step 5a
1068  //s_ReplaceEnding(str, "e", "", 1);
1069  s_TruncateEnding(str, "e", 0, 1);
1070 
1071  // step 5b
1072  if (s_MeasureWord(str.begin(), str.end()) > 1 &&
1073  str[str.length() - 1] == 'l' &&
1074  str[str.length() - 2] == 'l') {
1075  str.erase(str.length() - 1);
1076  }
1077 
1078 }
1079 
1080 
static void Stem(const string &in_str, string *out_str)
Compute the Porter stem for a given word.
static void GetSoundex(const string &in, string *out, size_t max_chars=eMaxSoundex, char pad_char='0')
Compute the Soundex key for a given word The Soundex key is defined as:
static size_t GetEditDistance(const string &str1, const string &str2, EDistanceMethod method=eEditDistance_Exact)
static int Score(const string &word1, const string &word2, size_t max_metaphone=eMaxMetaphone)
Compute a nearness score for two different words or phrases.
EDistanceMethod
Return the Levenshtein edit distance between two words.
@ eEditDistance_Exact
This method performs an exhausive search, and has an algorithmic complexity of O(n x m),...
@ eEditDistance_Similar
This method performs a simpler search, looking for the distance between similar words.
static void GetMetaphone(const string &in, string *out, size_t max_chars=eMaxMetaphone)
Compute the Metaphone key for a given word Metaphone is a more advanced algorithm than Soundex; inste...
ECharType GetChar(int c)
ECharType s_char_type[256]
CSafeStatic<>::
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
ECharType
Porter's Stemming Algorithm.
@ eVowel
@ eConsonant
@ eOther
static const size_t kMaxMetaphoneStack
static bool s_EndsWith(const string &str1, const string &str2)
static NCBI_UNUSED bool s_ReplaceEnding(string &word, const string &match, const string &substitute, int min_measure=0)
static ECharType s_GetCharType(int c)
static bool s_TruncateEnding(string &word, const char *match, size_t new_ending_size, int min_measure=0)
static string::size_type s_FindFirstVowel(const string &str)
static int s_MeasureWord(string::const_iterator iter, string::const_iterator end)
std::ofstream out("events_result.xml")
main entry point for tests
static int type
Definition: getdata.c:31
static const char * str(char *buf, int n)
Definition: stats.c:84
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_UNUSED
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
size_type find(const CTempString match, size_type pos=0) const
Find the first instance of the entire matching string within the current string, beginning at an opti...
Definition: tempstr.hpp:655
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
int tolower(Uchar c)
Definition: ncbictype.hpp:72
int toupper(Uchar c)
Definition: ncbictype.hpp:73
T min(T x_, T y_)
std::istream & in(std::istream &in_, double &x_)
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
Definition: pcre_exec.c:513
Template structure SStaticPair is simlified replacement of STL pair<> Main reason of introducing this...
Definition: static_set.hpp:60
Definition: type.c:6
#define _ASSERT
Modified on Sun Apr 14 05:28:48 2024 by modify_doxy.py rev. 669887