NCBI C++ ToolKit
multipattern_search.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef UTIL___MULTIPATTERN_SEARCH__HPP
2 #define UTIL___MULTIPATTERN_SEARCH__HPP
3 
4 /* $Id: multipattern_search.hpp 98553 2022-12-05 12:50:08Z gotvyans $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author: Sema Kachalo, NCBI
30 *
31 */
32 
33 /// @file multipattern_search.hpp
34 /// Simultaneous search of multiple RegEx patterns in the input string
35 
36 #include <corelib/ncbistd.hpp>
37 #include <functional>
38 
40 
41 class CRegExFSA;
42 
43 namespace FSM
44 {
45  class CCompiledFSM;
46 }
47 
48 ///////////////////////////////////////////////////////////////////////
49 ///
50 /// CMultipatternSearch
51 ///
52 /// Simultaneous search of multiple string or RegEx patterns in the input string
53 ///
54 /// CMultipatternSearch builds a Finite State Machine (FSM)
55 /// from a list of search strings or regular expression patterns.
56 /// It can then search all patterns simultaneously,
57 /// requiring only a single traversal of the input string.
58 /// Use this class to increase the search performance
59 /// when the number of search patterns is large (10 and more)
60 /// If the patterns are known in advance, FSM can be exported as C code
61 /// and compiled for the further performance improvement.
62 
64 {
65 public:
68 
69  /// Search flags (for non-RegEx patterns only!)
70  enum EFlags {
71  fNoCase = 1 << 0,
72  fBeginString = 1 << 1,
73  fEndString = 1 << 2,
74  fWholeString = fBeginString | fEndString,
75  fBeginWord = 1 << 3,
76  fEndWord = 1 << 4,
77  fWholeWord = fBeginWord | fEndWord
78  };
80 
81 
82  ///@{
83  /// Add search pattern to the FSM
84  ///
85  /// @param pattern
86  /// A search pattern to add to the FSM.
87  /// If begins with a slash (/), then it is considered a RegEx.
88  /// @param flags
89  /// Additional search conditions.
90  /// If the first argument is a RegEx, then the flags are ignored.
91  void AddPattern(const char* pattern, TFlags flags = 0);
92  void AddPattern(const string& pattern, TFlags flags = 0) { AddPattern(pattern.c_str(), flags); }
93  void AddPatterns(const vector<string>& patterns);
94  void AddPatterns(const vector<pair<string, TFlags>>& patterns);
95  ///@}
96 
97  /// Quote special characters to insert string into regular expression
98  static string QuoteString(const string& str);
99 
100  /// Generate graphical representation of the FSM in DOT format.
101  /// For more details, see http://www.graphviz.org/
102  ///
103  /// @param out
104  /// A stream to receive the output.
105  void GenerateDotGraph(ostream& out) const;
106 
107  /// Generate C++ array/map data.
108  ///
109  /// @param out
110  /// A stream to receive the output.
111  void GenerateArrayMapData(ostream& out) const;
112 
113  /// Generate C code for the FSM search.
114  ///
115  /// @param out
116  /// A stream to receive the output.
117  void GenerateSourceCode(ostream& out) const;
118 
119  /// When the pattern is found, the search can be stopped or continued
120  enum EOnFind {
122  eContinueSearch
123  };
124 
125  ///@{
126  /// Run the FSM search on the input string
127  ///
128  /// @param input
129  /// Input string.
130  /// @param found_callback
131  /// Function or function-like object to call when the pattern is found.
132  /// It can accept one or two parameters and return void or EOnFind.
133  /// If it returns eStopSearch, the search terminates.
134  /// If it returns eContinueSearch or void, the search continues.
135  /// @sa CFoundCallback
136  ///
137  /// @code
138  /// Search(str, [](size_t pattern) { cout << "Found " << pattern << "\n"; });
139  /// Search(str, [](size_t pattern) -> CMultipatternSearch::EOnFind { cout << "Found " << pattern << "\n"; return CMultipatternSearch::eContinueSearch; });
140  /// Search(str, [](size_t pattern, size_t position) { cout << "Found " << pattern << " " << position << "\n"; });
141  /// Search(str, [](size_t pattern, size_t position) -> CMultipatternSearch::EOnFind { cout << "Found " << pattern << " " << position << "\n"; return CMultipatternSearch::eContinueSearch; });
142  /// @endcode
143 
144  typedef std::function<void(size_t)> VoidCall1;
145  typedef std::function<void(size_t, size_t)> VoidCall2;
146  typedef std::function<bool(size_t)> BoolCall1;
147  typedef std::function<bool(size_t, size_t)> BoolCall2;
148 
149  void Search(const char* input, VoidCall1 found_callback) const;
150  void Search(const string& input, VoidCall1 found_callback) const { Search(input.c_str(), found_callback); }
151  void Search(const char* input, VoidCall2 found_callback) const;
152  void Search(const string& input, VoidCall2 found_callback) const { Search(input.c_str(), found_callback); }
153  void Search(const char* input, BoolCall1 found_callback) const;
154  void Search(const string& input, BoolCall1 found_callback) const { Search(input.c_str(), found_callback); }
155  void Search(const char* input, BoolCall2 found_callback) const;
156  void Search(const string& input, BoolCall2 found_callback) const { Search(input.c_str(), found_callback); }
157  ///@}
158 
159  ///@{
160  /// Run the FSM search on the input string using the strucutre prebuilt by multipattern -A
161  ///
162  /// @param input
163  /// Input string.
164  /// @param states
165  /// generated by multipattern -A
166  /// @param emit
167  /// generated by multipattern -A
168  /// @param hits
169  /// generated by multipattern -A
170  /// @param found_callback
171  /// Function or function-like object to call when the pattern is found.
172  /// It can accept one or two parameters and return void or EOnFind.
173  /// If it returns eStopSearch, the search terminates.
174  /// If it returns eContinueSearch or void, the search continues.
175  /// @sa CFoundCallback
176  ///
177  /// @code
178  /// Search(str, [](size_t pattern) { cout << "Found " << pattern << "\n"; });
179  /// Search(str, [](size_t pattern) -> CMultipatternSearch::EOnFind { cout << "Found " << pattern << "\n"; return CMultipatternSearch::eContinueSearch; });
180  /// Search(str, [](size_t pattern, size_t position) { cout << "Found " << pattern << " " << position << "\n"; });
181  /// Search(str, [](size_t pattern, size_t position) -> CMultipatternSearch::EOnFind { cout << "Found " << pattern << " " << position << "\n"; return CMultipatternSearch::eContinueSearch; });
182  /// @endcode
183  ///
184  /// Example:
185  ///
186  /// static void Screen(const char* str, bool *result)
187  /// {
188  /// #define _FSM_EMIT static bool emit[]
189  /// #define _FSM_HITS static map<size_t, vector<size_t>> hits
190  /// #define _FSM_STATES static size_t states[]
191  /// #include "GENERATED_FILE.inc"
192  /// #undef _FSM_EMIT
193  /// #undef _FSM_HITS
194  /// #undef _FSM_STATES
195  /// CMultipatternSearch::Search(str, states, emit, hits, [result](size_t n) { result[n] = true; });
196  /// }
197 
198  static void Search(const char* input, const FSM::CCompiledFSM& fsm, VoidCall1 found_callback);
199  static void Search(const string& input, const FSM::CCompiledFSM& fsm, VoidCall1 found_callback) { Search(input.c_str(), fsm, found_callback); }
200  static void Search(const char* input, const FSM::CCompiledFSM& fsm, VoidCall2 found_callback);
201  static void Search(const string& input, const FSM::CCompiledFSM& fsm, VoidCall2 found_callback) { Search(input.c_str(), fsm, found_callback); }
202  static void Search(const char* input, const FSM::CCompiledFSM& fsm, BoolCall1 found_callback);
203  static void Search(const string& input, const FSM::CCompiledFSM& fsm, BoolCall1 found_callback) { Search(input.c_str(), fsm, found_callback); }
204  static void Search(const char* input, const FSM::CCompiledFSM& fsm, BoolCall2 found_callback);
205  static void Search(const string& input, const FSM::CCompiledFSM& fsm, BoolCall2 found_callback) { Search(input.c_str(), fsm, found_callback); }
206  /// @endcode
207 
208 private:
209  /// Finit State Machine that does all work
210  unique_ptr<CRegExFSA> m_FSM;
211 };
212 
213 
215 
216 #endif /* UTIL___MULTIPATTERN_SEARCH__HPP */
CMultipatternSearch.
static void Search(const string &input, const FSM::CCompiledFSM &fsm, VoidCall2 found_callback)
unique_ptr< CRegExFSA > m_FSM
std::function< void(size_t, size_t)> VoidCall2
void AddPattern(const string &pattern, TFlags flags=0)
static void Search(const string &input, const FSM::CCompiledFSM &fsm, VoidCall1 found_callback)
std::function< bool(size_t, size_t)> BoolCall2
static void Search(const char *input, const FSM::CCompiledFSM &fsm, VoidCall2 found_callback)
static void Search(const char *input, const FSM::CCompiledFSM &fsm, BoolCall1 found_callback)
void Search(const string &input, BoolCall1 found_callback) const
static void Search(const string &input, const FSM::CCompiledFSM &fsm, BoolCall1 found_callback)
EOnFind
When the pattern is found, the search can be stopped or continued.
static void Search(const char *input, const FSM::CCompiledFSM &fsm, VoidCall1 found_callback)
void Search(const string &input, VoidCall1 found_callback) const
std::function< void(size_t)> VoidCall1
static void Search(const string &input, const FSM::CCompiledFSM &fsm, BoolCall2 found_callback)
void Search(const string &input, BoolCall2 found_callback) const
EFlags
Search flags (for non-RegEx patterns only!)
std::function< bool(size_t)> BoolCall1
void Search(const string &input, VoidCall2 found_callback) const
DECLARE_SAFE_FLAGS_TYPE(EFlags, TFlags)
static void Search(const char *input, const FSM::CCompiledFSM &fsm, BoolCall2 found_callback)
Include a standard set of the NCBI C++ Toolkit most basic headers.
static uch flags
std::ofstream out("events_result.xml")
main entry point for tests
#define bool
Definition: bool.h:34
static const char * str(char *buf, int n)
Definition: stats.c:84
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int input()
static patstr * patterns
Definition: pcregrep.c:259
string QuoteString(const string &s)
Definition: prt2fsm.cpp:70
NCBI_XUTIL_EXPORT
Parameter to control printing diagnostic message about conversion of static array data from a differe...
Definition: static_set.hpp:72
Modified on Wed Jun 19 17:08:00 2024 by modify_doxy.py rev. 669887