NCBI C++ ToolKit
text_util.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef ALGO_TEXT___QUERY_UTIL__HPP
2 #define ALGO_TEXT___QUERY_UTIL__HPP
3 
4 /* $Id: text_util.hpp 91306 2020-10-08 11:57:15Z gouriano $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors: Mike DiCuccio
30  *
31  * File Description:
32  *
33  */
34 
35 #include <corelib/ncbiobj.hpp>
36 #include <util/simple_buffer.hpp>
37 
38 #include <algo/text/vector.hpp>
39 
40 
42 
43 
44 
45 ///
46 /// string comparison using integers instead of const char*
47 /// this is coded in C++ and beats an assembler-based memcmp()
48 /// this algorithm makes no adjustment for memory alignment and may be either
49 /// slower or illegal on some platforms.
50 ///
51 template <typename TComp>
53 {
/// Order two strings by comparing them one TComp at a time, then finishing
/// byte by byte.
///
/// @param s1  left-hand string
/// @param s2  right-hand string
/// @return    true if s1 orders before s2 under this comparison
///
/// NOTE(review): the first loop compares with TComp's operator< while the
/// second compares individual chars, and the number of TComp units walked
/// depends on the shorter string. For multi-byte TComp the resulting order
/// presumably differs from plain byte-wise order and may vary with byte
/// order - confirm callers only need a self-consistent ordering.
54  bool operator() (const string& s1, const string& s2) const
55  {
    // View each string's bytes as an array of TComp. Only whole
    // TComp-sized units (size() / sizeof(TComp)) are covered by these
    // ranges; any remaining bytes are handled by the char loop below.
56  const TComp* p1 = (const TComp*)s1.data();
57  const TComp* p1_end = p1 + s1.size() / sizeof(TComp);
58  const TComp* p2 = (const TComp*)s2.data();
59  const TComp* p2_end = p2 + s2.size() / sizeof(TComp);
    // Walk both strings in lock-step until a unit differs or either
    // string runs out of whole units.
60  for ( ; p1 != p1_end && p2 != p2_end; ++p1, ++p2) {
61  if (*p1 < *p2) {
62  return true;
63  }
64  if (*p2 < *p1) {
65  return false;
66  }
67  }
68 
    // Resume byte by byte from wherever the TComp loop stopped, this time
    // running up to the true end of each string.
69  const char* pc1 = (const char*)p1;
70  const char* pc1_end = s1.data() + s1.size();
71  const char* pc2 = (const char*)p2;
72  const char* pc2_end = s2.data() + s2.size();
73  for ( ; pc1 != pc1_end && pc2 != pc2_end; ++pc1, ++pc2) {
74  if (*pc1 < *pc2) {
75  return true;
76  }
77  if (*pc2 < *pc1) {
78  return false;
79  }
80  }
81 
    // No difference was found before one string ran out: s1 is "less"
    // only if s2 still has bytes left.
82  return (pc2 != pc2_end);
83  }
84 };
85 
86 
87 ///
88 /// This appears to be the best overall hash function for use in English
89 /// dictionaries. It amounts to an iterative string walk in which the hash is
90 /// computed as
91 ///
92 /// hval[n] = (hval[n-1] * 17) + c
93 ///
94 /// for each c in the string
95 ///
/// @param start  iterator to the first element to hash
/// @param end    iterator one past the last element to hash
/// @return       the accumulated hash value; 0 when start == end
96 template<class IT>
97 size_t StringHash17(IT start, IT end)
98 {
99  size_t hval = 0;
100  for (; start != end; ++start) {
    // (hval << 4) + hval is hval * 17 written as a shift and an add;
    // hval is a size_t, so the arithmetic wraps rather than overflowing.
101  hval = ((hval << 4) + hval) + *start;
102  }
103  return hval;
104 }
105 
106 /// Functor-adaptor for StringHash17
108 {
/// Hash an entire string by passing its character range to StringHash17.
///
/// @param s  string to hash
/// @return   StringHash17(s.begin(), s.end())
109  size_t operator()(const string& s) const
110  {
111  return StringHash17(s.begin(), s.end());
112  }
113 };
114 
115 
116 
118 {
119 public:
120 
121  enum EOptions {
123  fPorterStem = 0x02,
124  fTrimStops = 0x04,
125  fNoNumeric = 0x08,
129 
132  };
133  typedef int TFlags;
134 
135  /// typedef for word frequencies
137 
138  /// retrieve word frequencies for a given piece of text
139  static void GetWordFrequencies(const string& text, TWordFreq& freq,
141 
142  /// retrieve word frequencies from a file
143  static void GetWordFrequencies(CNcbiIstream& istr, TWordFreq& freq,
145 
146  /// convert an integer into a set of word frequencies
147  /// this maps ints to smaller strings to compress the dictionary
148  static void GetWordFrequencies(Int4 i, TWordFreq& freq);
149  static void GetWordFrequencies(Uint4 i, TWordFreq& freq);
150  static void GetWordFrequencies(Int8 i, TWordFreq& freq);
151  static void GetWordFrequencies(Uint8 i, TWordFreq& freq);
152  static void GetWordFrequencies(float i, TWordFreq& freq);
153  static void GetWordFrequencies(double i, TWordFreq& freq);
154 
155  /// split a set of word frequencies into phrase and non-phrase frequencies
156  /// this is done to treat the two separately
157  static void SplitWordFrequencies(const TWordFreq& wf_in,
158  TWordFreq& wf_out, TWordFreq& phrase_out);
159 
160  /// retrieve stem frequencies from a set of word frequencies
161  static void GetStemFrequencies(const TWordFreq& freq,
162  TWordFreq& stems,
164 
165  /// add a set of frequencies into another set
166  static void AddWordFrequencies(TWordFreq& freq,
167  const TWordFreq& wf,
168  TFlags flags = 0);
169  static void AddWordFrequencies(TWordFreq& freq,
170  const TWordFreq& wf,
171  const string& prefix,
172  TFlags flags = 0);
173 
174  /// return true if the provided word is a stop word
175  static bool IsStopWord(const string& str);
176 
177  /// eliminate the stop words from a set of word frequencies
178  static void TrimStopWords(TWordFreq& freq);
179 
180  static void EncodeFreqs(const TWordFreq& freq,
181  vector<char>& data);
182  static void EncodeFreqs(const TWordFreq& freq,
183  vector<unsigned char>& data);
184  static void EncodeFreqs(const TWordFreq& freq,
185  CSimpleBuffer& data);
186  static void DecodeFreqs(TWordFreq& freq,
187  const vector<char>& data);
188  static void DecodeFreqs(TWordFreq& freq,
189  const vector<unsigned char>& data);
190  static void DecodeFreqs(TWordFreq& freq,
191  const CSimpleBuffer& data);
192  static void DecodeFreqs(TWordFreq& freq,
193  const void* data, size_t data_len);
194 
195  /// perform a set of punctuational clean-ups on a string suitable for a
196  /// journal or book title
197  static void CleanJournalTitle(string& title);
198 };
199 
200 
201 
203 
204 #endif // ALGO_TEXT___QUERY_UTIL__HPP
pair< TPtrPair, SComparison > TComp
Reallocable memory buffer (no memory copy overhead) Mimics vector<>, without the overhead of explicit...
static void GetStemFrequencies(const TWordFreq &freq, TWordFreq &stems, TFlags flags=fDefaults)
retrieve stem frequencies from a set of word frequencies
Definition: text_util.cpp:508
static bool IsStopWord(const string &str)
return true if the provided word is a stop word
Definition: text_util.cpp:652
static void TrimStopWords(TWordFreq &freq)
eliminate the stop words from a set of word frequencies
Definition: text_util.cpp:659
static void CleanJournalTitle(string &title)
perform a set of punctuational clean-ups on a string suitable for a journal or book title
Definition: text_util.cpp:683
static void SplitWordFrequencies(const TWordFreq &wf_in, TWordFreq &wf_out, TWordFreq &phrase_out)
split a set of word frequencies into phrase and non-phrase frequencies this is done to treat the two ...
Definition: text_util.cpp:197
@ fPhrase_NoStems
Definition: text_util.hpp:127
@ fDiphthongReplace
Definition: text_util.hpp:122
@ fPhrase_NoPrefix
Definition: text_util.hpp:128
@ fIncludePhrases
Definition: text_util.hpp:126
CScoreVector< string, float > TWordFreq
typedef for word frequencies
Definition: text_util.hpp:136
static void EncodeFreqs(const TWordFreq &freq, vector< char > &data)
Definition: text_util.cpp:700
static void AddWordFrequencies(TWordFreq &freq, const TWordFreq &wf, TFlags flags=0)
add a set of frequencies into another set
Definition: text_util.cpp:211
static void GetWordFrequencies(const string &text, TWordFreq &freq, TFlags flags=fDefaults)
retrieve word frequencies for a given piece of text
Definition: text_util.cpp:336
static void DecodeFreqs(TWordFreq &freq, const vector< char > &data)
Definition: text_util.cpp:722
static uch flags
static const char pc1[]
Definition: des.c:116
static const char pc2[]
Definition: des.c:134
double wf(double lambda, double D_LR, double D_LU, double D_LD, double D_RU, double D_RD, double D_DU)
Definition: gme.cpp:78
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
int i
static void text(MDB_val *v)
Definition: mdb_dump.c:62
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
static const char * prefix[]
Definition: pcregrep.c:405
static const char * str(char *buf, int n)
Definition: stats.c:84
Functor-adaptor for StringHash17.
Definition: text_util.hpp:108
size_t operator()(const string &s) const
Definition: text_util.hpp:109
string comparison using integers instead of const char* this is coded in C++ and beats an assembler-b...
Definition: text_util.hpp:53
bool operator()(const string &s1, const string &s2) const
Definition: text_util.hpp:54
size_t StringHash17(IT start, IT end)
This appears to be the best overall hash function for use in English dictionaries.
Definition: text_util.hpp:97
Modified on Wed Feb 21 09:54:14 2024 by modify_doxy.py rev. 669887