NCBI C++ ToolKit
strsearch.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: strsearch.cpp 70627 2016-01-08 13:02:41Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Mati Shomrat
27 *
28 * File Description:
29 * String search utilities.
30 *
31 * Currently there are two search utilities:
32 * 1. An implementation of the Boyer-Moore algorithm.
33 * 2. A finite state automaton.
34 *
35 */
36 
37 #include <ncbi_pch.hpp>
38 #include <util/strsearch.hpp>
39 #include <algorithm>
40 #include <vector>
41 
42 
44 
45 
46 //==============================================================================
47 // CBoyerMooreMatcher
48 //==============================================================================
49 
50 // Public:
51 // =======
52 
54  NStr::ECase case_sensitive,
55  unsigned int whole_word)
56 : m_Pattern(pattern),
57  m_PatLen(pattern.length()),
58  m_CaseSensitive(case_sensitive),
59  m_WholeWord(whole_word),
60  m_LastOccurrence(sm_AlphabetSize),
61  m_WordDelimiters(sm_AlphabetSize)
62 {
63  x_InitPattern();
64  // Init the word deimiting alphabet
65  if (m_WholeWord) {
66  for (int i = 0; i < sm_AlphabetSize; ++i) {
67  m_WordDelimiters[i] = (isspace((unsigned char) i) != 0);
68  }
69  }
70 }
71 
73  const string& word_delimeters,
74  NStr::ECase case_sensitive,
75  bool invert_delimiters)
76 : m_Pattern(pattern),
77  m_PatLen(pattern.length()),
78  m_CaseSensitive(case_sensitive),
79  m_WholeWord(true),
80  m_LastOccurrence(sm_AlphabetSize),
81  m_WordDelimiters(sm_AlphabetSize)
82 {
83  x_InitPattern();
84  SetWordDelimiters(word_delimeters, invert_delimiters);
85 }
86 
87 void CBoyerMooreMatcher::SetWordDelimiters(const string& word_delimeters,
88  bool invert_delimiters)
89 {
91 
92  string word_d = word_delimeters;
94  NStr::ToUpper(word_d);
95  }
96 
97  // Init the word delimiting alphabet
98  for (int i = 0; i < sm_AlphabetSize; ++i) {
99  char ch = (char)(m_CaseSensitive ? i : toupper((unsigned char)i));
100  string::size_type n = word_d.find_first_of(ch);
101  m_WordDelimiters[i] = (!invert_delimiters) == (n != string::npos);
102  }
103 }
104 
105 void CBoyerMooreMatcher::AddDelimiters(const string& word_delimeters)
106 {
107  if (m_WholeWord == 0) {
109  }
110 
111  string word_d = word_delimeters;
113  NStr::ToUpper(word_d);
114  }
115 
116  for (int i = 0; i < sm_AlphabetSize; ++i) {
117  char ch = (char)(m_CaseSensitive ? i : toupper((unsigned char)i));
118  string::size_type n = word_d.find_first_of(ch);
119  if (n != NPOS) {
120  m_WordDelimiters[i] = true;
121  }
122  }
123 }
124 
126 {
127  if (m_WholeWord == 0) {
129  }
130  m_WordDelimiters[(unsigned char)ch] = true;
131 
133  m_WordDelimiters[toupper((unsigned char) ch)] = true;
134  }
135 }
136 
138 {
139  if (m_WholeWord == 0) {
141  }
142 
143  for (int i = 0; i < sm_AlphabetSize; ++i) {
144  char ch = (char)(m_CaseSensitive ? i : toupper((unsigned char)i));
145  if ((ch >= 'A' && ch <= 'Z') ||
146  (ch >= '0' && ch <= '9') ||
147  (ch == '_')){
148  } else {
149  m_WordDelimiters[i] = true;
150  }
151  }
152 }
153 
155 {
156  if ( m_CaseSensitive == NStr::eNocase) {
158  }
159 
160  // For each character in the alpahbet compute its last occurrence in
161  // the pattern.
162 
163  // Initilalize vector
164  size_t size = m_LastOccurrence.size();
165  for ( size_t i = 0; i < size; ++i ) {
167  }
168 
169  // compute right-most occurrence
170  for ( int j = 0; j < (int)m_PatLen - 1; ++j ) {
171  /* int lo = */
172  m_LastOccurrence[(unsigned char)m_Pattern[j]] = m_PatLen - j - 1;
173  }
174 }
175 
176 
178  SIZE_TYPE shift,
179  SIZE_TYPE text_len) const
180 {
181  // Implementation note.
182  // Case sensitivity check has been taken out of loop.
183  // Code size for performance optimization. (We generally choose speed).
184  // (Anatoliy)
185  if (m_CaseSensitive == NStr::eCase) {
186  while (shift + m_PatLen <= text_len) {
187  int j = (int)m_PatLen - 1;
188 
189  for( ; j >= 0 && m_Pattern[j]==text[shift + j]; --j )
190  {}
191 
192  if ( (j == -1) && IsWholeWord(text, shift, text_len) ) {
193  return shift;
194  } else {
195  shift += (unsigned int)m_LastOccurrence[(unsigned char)text[shift + (int)m_PatLen - 1]];
196  }
197  }
198  } else { // case insensitive NStr::eNocase
199  while (shift + m_PatLen <= text_len) {
200  int j = (int)m_PatLen - 1;
201 
202  for( ; j >= 0 && m_Pattern[j]==char(toupper((unsigned char) text[shift + j])); --j )
203  {}
204 
205  if ( (j == -1) && IsWholeWord(text, shift, text_len) ) {
206  return shift;
207  } else {
208  shift +=
209  (unsigned int)m_LastOccurrence[toupper((unsigned char) text[shift + (int)m_PatLen - 1])];
210  }
211  }
212  }
213  return (SIZE_TYPE)-1;
214 }
215 
216 
217 // Private:
218 // ========
219 
220 // Constants
221 const int CBoyerMooreMatcher::sm_AlphabetSize = 256; // assuming ASCII
222 
223 
224 // Member Functions
226  SIZE_TYPE pos,
227  SIZE_TYPE text_len) const
228 {
229  SIZE_TYPE left, right;
230  left = right = 1;
231 
232  // Words at the beginning and end of text are also considered "whole"
233 
234  // check on the left
235  if (m_WholeWord & ePrefixMatch) {
236  left = (pos == 0) ||
237  ((pos > 0) && m_WordDelimiters[(unsigned char)text[pos - 1]]);
238  }
239 
240  // check on the right
241  if (m_WholeWord & eSuffixMatch) {
242  pos += (unsigned int)m_PatLen;
243  right = (pos == text_len) ||
244  ((pos < text_len) && m_WordDelimiters[(unsigned char)text[pos]]);
245  }
246 
247 
248  return (left && right);
249 }
250 
251 
#define true
Definition: bool.h:35
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
CBoyerMooreMatcher(const string &pattern, NStr::ECase case_sensitive=NStr::eNocase, unsigned int whole_word=eSubstrMatch)
Initialize a matcher with the pattern to be matched.
Definition: strsearch.cpp:53
NStr::ECase m_CaseSensitive
Definition: strsearch.hpp:198
bool IsWholeWord(const char *text, SIZE_TYPE pos, SIZE_TYPE text_len) const
Check if the pattern at position pos in the text lies on a whole word boundry.
Definition: strsearch.cpp:225
void x_InitPattern(void)
Definition: strsearch.cpp:154
unsigned int m_WholeWord
Definition: strsearch.hpp:199
void SetWordDelimiters(const string &word_delimeters, bool invert_delimiters=false)
Set word delimiting characters.
Definition: strsearch.cpp:87
void InitCommonDelimiters()
Init delimiters most common for the English language, (whitespaces, punctuations, etc)
Definition: strsearch.cpp:137
void AddDelimiters(const string &word_delimeters)
Add new word delimiters.
Definition: strsearch.cpp:105
vector< size_t > m_LastOccurrence
Definition: strsearch.hpp:200
SIZE_TYPE m_PatLen
Definition: strsearch.hpp:197
vector< unsigned char > m_WordDelimiters
Definition: strsearch.hpp:201
static const int sm_AlphabetSize
Definition: strsearch.hpp:184
size_t Search(const string &text, size_t pos=0) const
Search for the pattern over text starting at position pos.
Definition: strsearch.hpp:161
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
#define NPOS
Definition: ncbistr.hpp:133
ECase
Which type of string comparison.
Definition: ncbistr.hpp:1204
static string & ToUpper(string &str)
Convert string to upper case – string& version.
Definition: ncbistr.cpp:424
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
@ eCase
Case sensitive compare.
Definition: ncbistr.hpp:1205
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
yy_size_t n
static void text(MDB_val *v)
Definition: mdb_dump.c:62
const struct ncbi::grid::netcache::search::fields::SIZE size
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int toupper(Uchar c)
Definition: ncbictype.hpp:73
String search utilities.
Modified on Sat Dec 02 09:22:25 2023 by modify_doxy.py rev. 669887