NCBI C++ ToolKit
dictionary.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: dictionary.cpp 33815 2007-05-04 17:18:18Z kazimird $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Mike DiCuccio
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <util/dictionary.hpp>
34 #include <util/dictionary_util.hpp>
35 #include <algorithm>
36 
37 
39 
40 
41 
43  : m_MetaphoneKeySize(meta_key_size)
44 {
45 }
46 
47 
49  size_t meta_key_size)
50  : m_MetaphoneKeySize(meta_key_size)
51 {
52  CNcbiIfstream istr(file.c_str());
53  Read(istr);
54 }
55 
56 
58  size_t meta_key_size)
59  : m_MetaphoneKeySize(meta_key_size)
60 {
61  Read(istr);
62 }
63 
64 
66 {
67  string line;
68  string metaphone;
69  string word;
70  while (NcbiGetlineEOL(istr, line)) {
71  string::size_type pos = line.find_first_of("|");
72  if (pos == string::npos) {
73  word = line;
75  } else {
76  metaphone = line.substr(0, pos);
77  word = line.substr(pos + 1, line.length() - pos - 1);
78  }
79 
80  // insert forward and reverse dictionary pairings
82  TStringSet& word_set = m_ReverseDict[metaphone];
83  word_set.insert(word_set.end(), word);
84  }
85 }
86 
88 {
90  ITERATE (TStringSet, word_iter, iter->second) {
91  ostr << iter->first << "|" << *word_iter << endl;
92  }
93  }
94 }
95 
96 
97 void CSimpleDictionary::AddWord(const string& word)
98 {
99  if (word.empty()) {
100  return;
101  }
102 
103  // compute the metaphone code
104  string metaphone;
106 
107  // insert forward and reverse dictionary pairings
108  m_ForwardDict.insert(word);
109  m_ReverseDict[metaphone].insert(word);
110 }
111 
112 
113 bool CSimpleDictionary::CheckWord(const string& word) const
114 {
116  return (iter != m_ForwardDict.end());
117 }
118 
119 
120 void CSimpleDictionary::SuggestAlternates(const string& word,
121  TAlternates& alternates,
122  size_t max_alts) const
123 {
124  string metaphone;
126  list<TReverseDict::const_iterator> keys;
127  x_GetMetaphoneKeys(metaphone, keys);
128 
129  typedef set<SAlternate, SAlternatesByScore> TAltSet;
130  TAltSet words;
131 
132  SAlternate alt;
133  size_t count = 0;
134  ITERATE (list<TReverseDict::const_iterator>, key_iter, keys) {
135 
136  bool used = false;
137  ITERATE (TStringSet, set_iter, (*key_iter)->second) {
138  // score:
139  // start with edit distance
140  alt.score =
141  CDictionaryUtil::Score(word, metaphone,
142  *set_iter, (*key_iter)->first);
143  if (alt.score <= 0) {
144  continue;
145  }
146 
147  _TRACE(" hit: "
148  << metaphone << " <-> " << (*key_iter)->first
149  << ", " << word << " <-> " << *set_iter
150  << " (" << alt.score << ")");
151  used = true;
152  alt.alternate = *set_iter;
153  words.insert(alt);
154  }
155  count += used ? 1 : 0;
156  }
157 
158  _TRACE(word << ": "
159  << keys.size() << " keys searched "
160  << count << " keys used");
161 
162  if ( !words.empty() ) {
163  TAlternates alts;
164  TAltSet::const_iterator iter = words.begin();
165  alts.push_back(*iter);
166  TAltSet::const_iterator prev = iter;
167  for (++iter;
168  iter != words.end() &&
169  (alts.size() < max_alts || prev->score == iter->score);
170  ++iter) {
171  alts.push_back(*iter);
172  prev = iter;
173  }
174 
175  alternates.insert(alternates.end(), alts.begin(), alts.end());
176  }
177 }
178 
179 
180 void CSimpleDictionary::x_GetMetaphoneKeys(const string& metaphone,
181  list<TReverseDict::const_iterator>& keys) const
182 {
183  if ( !metaphone.length() ) {
184  return;
185  }
186 
187  const size_t max_meta_edit_dist = 1;
188  const CDictionaryUtil::EDistanceMethod method =
190 
191  string::const_iterator iter = metaphone.begin();
192  string::const_iterator end = iter + max_meta_edit_dist + 1;
193 
194  size_t count = 0;
195  _TRACE("meta key: " << metaphone);
196  for ( ; iter != end; ++iter) {
197  string seed(1, *iter);
198  _TRACE(" meta key seed: " << seed);
200  for ( ;
201  lower != m_ReverseDict.end() && lower->first[0] == *iter;
202  ++lower, ++count) {
203 
204  size_t dist =
205  CDictionaryUtil::GetEditDistance(lower->first, metaphone,
206  method);
207  if (dist > max_meta_edit_dist) {
208  continue;
209  }
210 
211  keys.push_back(lower);
212  }
213  }
214 
215  _TRACE("exmained " << count << " keys, returning " << keys.size());
216 }
217 
218 
219 /////////////////////////////////////////////////////////////////////////////
220 ///
221 /// CMultiDictionary
222 ///
223 
226  const CMultiDictionary::SDictionary& d2) const
227  {
228  return (d1.priority < d2.priority);
229  }
230 };
231 
233 {
234  SDictionary d;
235  d.dict.Reset(&dict);
236  d.priority = priority;
237 
238  m_Dictionaries.push_back(d);
240 }
241 
242 
243 bool CMultiDictionary::CheckWord(const string& word) const
244 {
246  if ( iter->dict->CheckWord(word) ) {
247  return true;
248  }
249  }
250 
251  return false;
252 }
253 
254 
255 void CMultiDictionary::SuggestAlternates(const string& word,
256  TAlternates& alternates,
257  size_t max_alts) const
258 {
259  TAlternates alts;
261  iter->dict->SuggestAlternates(word, alts, max_alts);
262  }
263 
264  std::sort(alts.begin(), alts.end(), SAlternatesByScore());
265  if (alts.size() > max_alts) {
266  TAlternates::iterator prev = alts.begin() + max_alts;
267  TAlternates::iterator iter = prev;
268  ++iter;
269  for ( ; iter != alts.end() && iter->score == prev->score; ++iter) {
270  prev = iter;
271  }
272  alts.erase(iter, alts.end());
273  }
274 
275  alternates.swap(alts);
276 }
277 
278 
280  : m_Dict(&dict)
281 {
282 }
283 
284 
285 bool CCachedDictionary::CheckWord(const string& word) const
286 {
287  return m_Dict->CheckWord(word);
288 }
289 
290 
291 void CCachedDictionary::SuggestAlternates(const string& word,
292  TAlternates& alternates,
293  size_t max_alts) const
294 {
295  TAltCache::iterator iter = m_Misses.find(word);
296  if (iter != m_Misses.end()) {
297  alternates = iter->second;
298  return;
299  }
300 
301  m_Dict->SuggestAlternates(word, m_Misses[word], max_alts);
302  alternates = m_Misses[word];
303 }
304 
305 
bool CheckWord(const string &word) const
Virtual requirement: check a word for existence in the dictionary.
Definition: dictionary.cpp:285
TAltCache m_Misses
Definition: dictionary.hpp:212
void SuggestAlternates(const string &word, TAlternates &alternates, size_t max_alternates=20) const
Scan for a list of words similar to the indicated word.
Definition: dictionary.cpp:291
CCachedDictionary(IDictionary &dict)
Definition: dictionary.cpp:279
CRef< IDictionary > m_Dict
Definition: dictionary.hpp:209
static size_t GetEditDistance(const string &str1, const string &str2, EDistanceMethod method=eEditDistance_Exact)
static int Score(const string &word1, const string &word2, size_t max_metaphone=eMaxMetaphone)
Compute a nearness score for two different words or phrases.
EDistanceMethod
Return the Levenshtein edit distance between two words.
@ eEditDistance_Similar
This method performs a simpler search, looking for the distance between similar words.
static void GetMetaphone(const string &in, string *out, size_t max_chars=eMaxMetaphone)
Compute the Metaphone key for a given word Metaphone is a more advanced algorithm than Soundex; inste...
TDictionaries m_Dictionaries
Definition: dictionary.hpp:187
void RegisterDictionary(IDictionary &dict, int priority=ePriority_Default)
Definition: dictionary.cpp:232
void SuggestAlternates(const string &word, TAlternates &alternates, size_t max_alternates=20) const
Scan for a list of words similar to the indicated word.
Definition: dictionary.cpp:255
bool CheckWord(const string &word) const
Virtual requirement: check a word for existence in the dictionary.
Definition: dictionary.cpp:243
vector< SDictionary > TDictionaries
Definition: dictionary.hpp:173
void x_GetMetaphoneKeys(const string &metaphone, list< TReverseDict::const_iterator > &keys) const
Definition: dictionary.cpp:180
void SuggestAlternates(const string &word, TAlternates &alternates, size_t max_alternates=20) const
Scan for a list of words similar to the indicated word.
Definition: dictionary.cpp:120
void Read(CNcbiIstream &istr)
Definition: dictionary.cpp:65
bool CheckWord(const string &word) const
Virtual requirement: check a word for existence in the dictionary.
Definition: dictionary.cpp:113
CSimpleDictionary(size_t metaphone_key_size=5)
Definition: dictionary.cpp:42
void Write(CNcbiOstream &ostr) const
Definition: dictionary.cpp:87
void AddWord(const string &str)
Add a word to the dictionary.
Definition: dictionary.cpp:97
TReverseDict m_ReverseDict
Definition: dictionary.hpp:143
TForwardDict m_ForwardDict
Definition: dictionary.hpp:138
const size_t m_MetaphoneKeySize
the size of our metaphone keys
Definition: dictionary.hpp:146
class IDictionary defines an abstract interface for dictionaries.
Definition: dictionary.hpp:57
vector< SAlternate > TAlternates
Definition: dictionary.hpp:70
virtual bool CheckWord(const string &word) const =0
Virtual requirement: check a word for existence in the dictionary.
virtual void SuggestAlternates(const string &word, TAlternates &alternates, size_t max_alternates=20) const =0
Scan for a list of words similar to the indicated word.
const_iterator end() const
Definition: map.hpp:152
const_iterator lower_bound(const key_type &key) const
Definition: map.hpp:154
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
const_iterator find(const key_type &key) const
Definition: map.hpp:153
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
parent_type::const_iterator const_iterator
Definition: set.hpp:79
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define _TRACE(message)
Definition: ncbidbg.hpp:122
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
FILE * file
constexpr auto sort(_Init &&init)
CRef< IDictionary > dict
Definition: dictionary.hpp:169
SAlternate wraps a word and its score.
Definition: dictionary.hpp:63
functor for sorting alternates list by score
Definition: dictionary.hpp:73
CMultiDictionary.
Definition: dictionary.cpp:224
bool operator()(const CMultiDictionary::SDictionary &d1, const CMultiDictionary::SDictionary &d2) const
Definition: dictionary.cpp:225
static int seed
Definition: test_table.cpp:132
Modified on Mon May 20 05:06:12 2024 by modify_doxy.py rev. 669887