NCBI C++ ToolKit
strsearch.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef UTIL___STRSEARCH__HPP
2 #define UTIL___STRSEARCH__HPP
3 
4 /* $Id: strsearch.hpp 81721 2018-03-28 12:46:32Z ivanov $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author: Mati Shomrat
30 * Anatoliy Kuznetsov
31 *
32 * File Description:
33 * String search utilities.
34 *
35 */
36 
37 /// @file strsearch.hpp
38 /// String search utilities.
39 ///
40 /// Currently there are two search utilities:
41 /// 1. An implementation of the Boyer-Moore algorithm.
42 /// 2. A finite state automata.
43 
44 
45 #include <corelib/ncbistd.hpp>
46 #include <corelib/ncbistr.hpp>
47 #include <string>
48 #include <vector>
49 #include <iterator>
50 
51 
52 /** @addtogroup StringSearch
53  *
54  * @{
55  */
56 
57 
59 
60 
61 //============================================================================//
62 // CBoyerMooreMatcher //
63 //============================================================================//
64 
65 
66 /// This implemetation uses the Boyer-Moore alg. in order to search a single
67 /// pattern over varying texts.
68 
69 
71 {
72 public:
74  {
75  eSubstrMatch = 0,
76  ePrefixMatch = (1 << 0),
77  eSuffixMatch = (1 << 1),
78  eWholeWordMatch = (ePrefixMatch | eSuffixMatch)
79  };
80 public:
81  /// Initialize a matcher with the pattern to be matched.
82  ///
83  /// @param pattern
84  /// Pattern to be matched
85  /// @param case_sensitive
86  /// should the search be case sensitive (false by default).
87  /// @param whole_word
88  /// a match is found ony if the pattern was found to
89  /// be between delimiting characters (whitespaces)
90  CBoyerMooreMatcher(const string& pattern,
91  NStr::ECase case_sensitive = NStr::eNocase,
92  unsigned int whole_word = eSubstrMatch);
93 
94 
95  /// Initialize a matcher with the pattern to be matched.
96  ///
97  /// @param pattern
98  /// Pattern to be matched
99  /// @param word_delimeters
100  /// a match is found ony if the pattern was found to
101  /// be between delimiting characters like whitespaces
102  /// and punctuation marks
103  /// @param case_sensitive
104  /// should the search be case sensitive (false by default).
105  /// @param invert_delimiters
106  /// when TRUE means all characters NOT belonging to
107  /// word_delimeters are to be used
108  CBoyerMooreMatcher(const string& pattern,
109  const string& word_delimeters,
110  NStr::ECase case_sensitive = NStr::eNocase,
111  bool invert_delimiters = false);
112 
113 
114  /// Set word delimiting characters
115  ///
116  /// @param word_delimeters
117  /// string of characters used as word delimiters
118  /// @param invert_delimiters
119  /// when TRUE means all characters NOT belonging to
120  /// word_delimeters are to be used
121  void SetWordDelimiters(const string& word_delimeters,
122  bool invert_delimiters = false);
123 
124  /// Add new word delimiters
125  ///
126  /// @param word_delimeters
127  /// string of characters used as word delimiters
128  ///
129  void AddDelimiters(const string& word_delimeters);
130 
131  /// Add new word delimiter charracter
132  ///
133  /// @param word_delimeters
134  /// string of characters used as word delimiters
135  ///
136  void AddDelimiters(char ch);
137 
138  /// Init delimiters most common for the English language,
139  /// (whitespaces, punctuations, etc)
140  void InitCommonDelimiters();
141 
142 
143  /// Set word matching mode
144  ///
145  /// @param whole_word
146  /// word matching mode. Can be OR combination of EWordMatch values
147  void SetWordMatching(unsigned int whole_word = eWholeWordMatch)
148  {
149  m_WholeWord = whole_word;
150  }
151 
152  /// Search for the pattern over text starting at position pos.
153  ///
154  /// @param text
155  /// Text to search in.
156  /// @param pos
157  /// Position shift in text
158  /// @return
159  /// the position at which the pattern was found,
160  /// size_t(-1) (NPOS) otherwise.
161  size_t Search(const string& text, size_t pos = 0) const
162  {
163  return Search(text.c_str(), pos, text.length());
164  }
165 
166 
167  /// Search for the pattern over text starting at position pos.
168  ///
169  /// @param text
170  /// Text memory taken as NON ASCIIZ string.
171  /// @param pos
172  /// String offset position from the start
173  /// @param text_len
174  /// Length of text
175  /// @return
176  /// the position at which the pattern was found,
177  /// SIZE_TYPE(-1) (NPOS) otherwise.
178  SIZE_TYPE Search(const char* text,
179  SIZE_TYPE pos,
180  SIZE_TYPE text_len) const;
181 
182 private:
183  // Constants
184  static const int sm_AlphabetSize;
185 
186  // Member Functions:
187 
188  /// Check if the pattern at position pos in the text lies on a
189  /// whole word boundry.
190  bool IsWholeWord(const char* text,
191  SIZE_TYPE pos,
192  SIZE_TYPE text_len) const;
193 
194  void x_InitPattern(void);
195 private:
196  string m_Pattern;
199  unsigned int m_WholeWord;
200  vector<size_t> m_LastOccurrence;
201  vector<unsigned char> m_WordDelimiters;
202 
203 };
204 
205 
206 //============================================================================//
207 // Finite State Automata //
208 //============================================================================//
209 
210 
211 
212 template <typename MatchType>
213 class CTextFsm
214 {
215 public:
216  // Constants (done as an enum to avoid link errors on Darwin)
218  eFailState = -1
219  };
220 
221  // Constructors and Destructors:
222  CTextFsm(bool case_sensitive = false);
223  ~CTextFsm(void);
224 
225  // Add a word to the Fsm. Store a match for later retreival.
226  void AddWord(const string& word, const MatchType& match);
227 
228  // Prime the FSM.
229  // After finishing adding all the words to the FSM it needs to be
230  // primed to enable usage.
231  bool IsPrimed(void) const { return m_Primed; }
232  void Prime(void);
233 
234  // Retreive the FSM's initial state.
235  int GetInitialState(void) const { return 0; }
236 
237  // Get the next state, based on the currect state and a transition
238  // character.
239  int GetNextState(int state, char letter) const;
240 
241  // Are there any matches stored in state?
242  bool IsMatchFound(int state) const;
243 
244  // Retrieve the stored matches in state.
245  const vector<MatchType>& GetMatches(int state) const;
246 
247 private:
248  // Internal representation of states.
249  class CState
250  {
251  public:
252  // Ad-hoc definitions
254 
255  // Constructors and Destructors
256  CState(void) : m_OnFailure(0) {}
257  ~CState(void) {};
258 
259  // Add a transition to the table.
260  void AddTransition(char letter, int to) { m_Transitions[letter] = to; }
261 
262  // Retreive the transition state, give the transition character.
263  int GetNextState(char letter) const {
265  return it != m_Transitions.end() ? it->second : eFailState;
266  }
267 
268 
269  // Are there any matches stored in this state?
270  bool IsMatchFound(void) const { return !m_Matches.empty(); }
271 
272  // Retreive the stored matches
273  vector<MatchType>& GetMatches(void) { return m_Matches; }
274  const vector<MatchType>& GetMatches(void) const { return m_Matches; }
275 
276  // Store a match.
277  void AddMatch(const MatchType& match) { m_Matches.push_back(match); }
278 
279  const TMapCharInt& GetTransitions(void) const { return m_Transitions; };
280 
281  // Getter and Setter for failure transition.
283  int GetOnFailure(void) const { return m_OnFailure; }
284 
285  private:
286 
287  // Member Variables:
288  TMapCharInt m_Transitions; // Transition table
289  vector<MatchType> m_Matches; // Stored matches
290  int m_OnFailure; // Transition state in failure.
291 
292  }; // end of class CState
293 
294  // Private Methods:
295  CState *GetState(int state);
296  int GetNextState(const CState& from, char letter) const;
297 
298  // Compute the transition to be performed on failure.
299  void ComputeFail(void);
300  void FindFail(int state, int new_state, char ch);
301  void QueueAdd(vector<int>& in_queue, int qbeg, int val);
302 
303  // Member Variables
304  bool m_Primed;
305  vector< CState > m_States;
307 
308 }; // end of class CTextFsm
309 
310 
311 // Convenience class when the MatchType is of string type (most cases)
312 class NCBI_XUTIL_EXPORT CTextFsa : public CTextFsm<string>
313 {
314 public:
315  CTextFsa(bool case_sensitive = false) :
316  CTextFsm<string>(case_sensitive)
317  {}
318 
319  void AddWord(const string& word) {
320  CTextFsm<string>::AddWord(word, word);
321  }
322 };
323 
324 
325 //============================================================================//
326 // Finite State Automata Implementation //
327 //============================================================================//
328 
329 
330 // Public:
331 // =======
332 
333 template <typename MatchType>
334 CTextFsm<MatchType>::CTextFsm(bool case_sensitive) :
335 m_Primed(false), m_CaseSensitive(case_sensitive)
336 {
337  CState initial;
338  m_States.push_back(initial);
339 }
340 
341 
342 template <typename MatchType>
343 void CTextFsm<MatchType>::AddWord(const string& word, const MatchType& match)
344 {
345  string temp = word;
346  if ( !m_CaseSensitive ) {
347  NStr::ToUpper(temp);
348  }
349 
350  int next, i,
351  state = 0,
352  word_len = (int)temp.length();
353 
354  // try to overlay beginning of word onto existing table
355  for ( i = 0; i < word_len; ++i ) {
356  next = m_States[state].GetNextState(temp[i]);
357  if ( next == eFailState ) break;
358  state = next;
359  }
360 
361  // now create new states for remaining characters in word
362  for ( ; i < word_len; ++ i ) {
363  CState new_state;
364 
365  m_States.push_back(new_state);
366  m_States[state].AddTransition(temp[i], (int)m_States.size() - 1);
367  state = m_States[state].GetNextState(temp[i]);
368  }
369 
370  // add match information
371  m_States[state].AddMatch(match);
372 }
373 
374 
375 template <typename MatchType>
377 {
378  if ( m_Primed ) return;
379 
380  ComputeFail();
381 
382  m_Primed = true;
383 }
384 
385 
386 template <typename MatchType>
388 {
389  if ( size_t(state) >= m_States.size() ) {
390  return 0;
391  }
392  return &(m_States[state]);
393 }
394 
395 
396 template <typename MatchType>
397 int CTextFsm<MatchType>::GetNextState(const CState& from, char letter) const {
398  char ch = m_CaseSensitive ? letter : (char)toupper((unsigned char) letter);
399  return from.GetNextState(ch);
400 }
401 
402 
403 template <typename MatchType>
405 {
406  if ( size_t(state) >= m_States.size() ) {
407  return eFailState;
408  }
409 
410  int next;
411  int initial = GetInitialState();
412  while ( (next = GetNextState(m_States[state], letter)) == eFailState ) {
413  if ( state == initial ) {
414  next = initial;
415  break;
416  }
417  state = m_States[state].GetOnFailure();
418  }
419 
420  return next;
421 }
422 
423 
424 template <typename MatchType>
425 void CTextFsm<MatchType>::QueueAdd(vector<int>& in_queue, int qbeg, int val)
426 {
427  int q;
428 
429  q = in_queue [qbeg];
430  if (q == 0) {
431  in_queue [qbeg] = val;
432  } else {
433  for ( ; in_queue [q] != 0; q = in_queue [q]) continue;
434  in_queue [q] = val;
435  }
436  in_queue [val] = 0;
437 }
438 
439 
440 template <typename MatchType>
442 {
443  int qbeg, r, s, state;
444  vector<int> state_queue(m_States.size());
445 
446  qbeg = 0;
447  state_queue [0] = 0;
448 
449  // queue up states reached directly from state 0 (depth 1)
450 
451  ITERATE ( typename CState::TMapCharInt,
452  it,
453  m_States[GetInitialState()].GetTransitions() ) {
454  s = it->second;
455  m_States[s].SetOnFailure(0);
456  QueueAdd(state_queue, qbeg, s);
457  }
458 
459  while (state_queue [qbeg] != 0) {
460  r = state_queue [qbeg];
461  qbeg = r;
462 
463  // depth 1 states beget depth 2 states, etc.
464 
465  ITERATE ( typename CState::TMapCharInt, it,
466  m_States[r].GetTransitions() ) {
467  s = it->second;
468  QueueAdd(state_queue, qbeg, s);
469 
470  // State Substring Transitions Failure
471  // 2 st a -> 3 6
472  // 3 sta l -> 4
473  // 6 t a -> 7 0
474  // 7 ta p -> 8
475  //
476  // For example, r = 2 (st), if 'a' would go to s = 3 (sta).
477  // From previous computation, 2 (st) fails to 6 (t).
478  // Thus, check state 6 (t) for any transitions using 'a'.
479  // Since 6 (t) 'a' -> 7 (ta), therefore set fail [3] -> 7.
480 
481  state = m_States[r].GetOnFailure();
482  FindFail(state, s, it->first);
483  }
484  }
485 }
486 
487 
488 template <typename MatchType>
489 void CTextFsm<MatchType>::FindFail(int state, int new_state, char ch)
490 {
491  int next;
492 
493  // traverse existing failure path
494 
495  while ( (next = GetNextState(state, ch)) == eFailState) {
496  if ( state == 0 ) {
497  next = 0; break;
498  }
499  state = m_States[state].GetOnFailure();
500  }
501 
502  // add new failure state
503 
504  m_States[new_state].SetOnFailure(next);
505 
506  // add matches of substring at new state
507 
508  copy( m_States[next].GetMatches().begin(),
509  m_States[next].GetMatches().end(),
510  back_inserter(m_States[new_state].GetMatches()) );
511 }
512 
513 
514 template <typename MatchType>
515 const vector<MatchType>& CTextFsm<MatchType>::GetMatches(int state) const {
516  return m_States[state].GetMatches();
517 }
518 
519 
520 template <typename MatchType>
522 {
523  return m_States[state].IsMatchFound();
524 }
525 
526 
527 template <typename MatchType>
529 {}
530 
531 
533 
534 
535 /* @} */
536 
537 #endif // UTIL___STRSEARCH__HPP
This implemetation uses the Boyer-Moore alg.
Definition: strsearch.hpp:71
container_type::const_iterator const_iterator
Definition: map.hpp:53
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Include a standard set of the NCBI C++ Toolkit most basic headers.
The NCBI C++ standard methods for dealing with std::string.
#define false
Definition: bool.h:36
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
string
Definition: cgiapp.hpp:687
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
void ComputeFail(void)
Definition: strsearch.hpp:441
void FindFail(int state, int new_state, char ch)
Definition: strsearch.hpp:489
void AddWord(const string &word, const MatchType &match)
Definition: strsearch.hpp:343
NStr::ECase m_CaseSensitive
Definition: strsearch.hpp:198
vector< MatchType > & GetMatches(void)
Definition: strsearch.hpp:273
bool m_CaseSensitive
Definition: strsearch.hpp:306
vector< CState > m_States
Definition: strsearch.hpp:305
void QueueAdd(vector< int > &in_queue, int qbeg, int val)
Definition: strsearch.hpp:425
int GetNextState(int state, char letter) const
Definition: strsearch.hpp:404
TMapCharInt m_Transitions
Definition: strsearch.hpp:288
int GetOnFailure(void) const
Definition: strsearch.hpp:283
const TMapCharInt & GetTransitions(void) const
Definition: strsearch.hpp:279
unsigned int m_WholeWord
Definition: strsearch.hpp:199
void AddWord(const string &word)
Definition: strsearch.hpp:319
const vector< MatchType > & GetMatches(void) const
Definition: strsearch.hpp:274
void SetOnFailure(int state)
Definition: strsearch.hpp:282
CTextFsa(bool case_sensitive=false)
Definition: strsearch.hpp:315
const vector< MatchType > & GetMatches(int state) const
Definition: strsearch.hpp:515
CTextFsm(bool case_sensitive=false)
Definition: strsearch.hpp:334
void SetWordMatching(unsigned int whole_word=eWholeWordMatch)
Set word matching mode.
Definition: strsearch.hpp:147
int GetNextState(char letter) const
Definition: strsearch.hpp:263
bool IsMatchFound(void) const
Definition: strsearch.hpp:270
bool IsMatchFound(int state) const
Definition: strsearch.hpp:521
map< char, int > TMapCharInt
Definition: strsearch.hpp:253
void AddMatch(const MatchType &match)
Definition: strsearch.hpp:277
void Prime(void)
Definition: strsearch.hpp:376
vector< MatchType > m_Matches
Definition: strsearch.hpp:289
void AddTransition(char letter, int to)
Definition: strsearch.hpp:260
CState * GetState(int state)
Definition: strsearch.hpp:387
bool m_Primed
Definition: strsearch.hpp:304
bool IsPrimed(void) const
Definition: strsearch.hpp:231
int GetInitialState(void) const
Definition: strsearch.hpp:235
vector< size_t > m_LastOccurrence
Definition: strsearch.hpp:200
SIZE_TYPE m_PatLen
Definition: strsearch.hpp:197
vector< unsigned char > m_WordDelimiters
Definition: strsearch.hpp:201
static const int sm_AlphabetSize
Definition: strsearch.hpp:184
int GetNextState(const CState &from, char letter) const
Definition: strsearch.hpp:397
~CTextFsm(void)
Definition: strsearch.hpp:528
size_t Search(const string &text, size_t pos=0) const
Search for the pattern over text starting at position pos.
Definition: strsearch.hpp:161
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
ECase
Which type of string comparison.
Definition: ncbistr.hpp:1204
static string & ToUpper(string &str)
Convert string to upper case – string& version.
Definition: ncbistr.cpp:424
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
static void text(MDB_val *v)
Definition: mdb_dump.c:62
int toupper(Uchar c)
Definition: ncbictype.hpp:73
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
Definition: pcre_exec.c:513
NCBI_XUTIL_EXPORT
Parameter to control printing diagnostic message about conversion of static array data from a differe...
Definition: static_set.hpp:72
static Uint4 letter(char c)
Modified on Wed Jun 12 11:17:11 2024 by modify_doxy.py rev. 669887