NCBI C++ ToolKit
multipattern_search_impl.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef UTIL___MULTIPATTERN_SEARCH_IMPL__HPP
2 #define UTIL___MULTIPATTERN_SEARCH_IMPL__HPP
3 
4 /* $Id: multipattern_search_impl.hpp 100700 2023-08-31 19:20:11Z lavr $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author: Sema
30 *
31 * File Description:
32 *
33 */
34 
36 
37 #include <iostream>
38 #include <array>
39 #include <sstream>
40 
42 
44 class CRegExFSA;
45 
46 class CRegEx
47 {
48 public:
49  enum EType {
50  eTypeNone = 0,
53  eTypeWord = 4,
54  eTypeStop = 8,
60  };
61  // if string atarts with "/", treat it as RefEx and ignore flags
62  // otherwise, treat it as plain text search with flags
63  CRegEx(const char* s, CMultipatternSearch::TFlags f = 0) : m_Str(s), m_Flag(f) { x_Parse(); }
64  CRegEx(const string& s, CMultipatternSearch::TFlags f = 0) : m_Str(s), m_Flag(f) { x_Parse(); }
65  operator bool() const { return m_RegX != 0; }
66  static bool IsWordCharacter(unsigned char c) { return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_'; }
67 
68 protected:
69 
70  // inner classes
71  struct CRegX // root for other RegX types; also an empty RegX
72  {
73  virtual ~CRegX() {}
74  virtual operator bool() { return true; }
75  virtual void SetCaseInsensitive() {}
76  virtual bool IsCaseInsensitive() const = 0;
77  virtual bool IsAssert() const { return false; }
78  virtual void Print(ostream& out, size_t off) const = 0;
79  virtual void Render(CRegExFSA& fsa, size_t from, size_t to) const = 0;
80  static void PrintOffset(ostream& out, size_t off) { for (size_t n = 0; n < off; n++) out << ' '; }
81  static void DummyTrans(CRegExFSA& fsa, size_t x, unsigned char t);
82  };
83 
84  struct CRegXEmpty : public CRegX // /()/
85  {
86  operator bool() { return false; }
87  bool IsCaseInsensitive() const { return true; }
88  void Print(ostream& out, size_t off) const { PrintOffset(out, off); out << "<empty>\n"; }
89  void Render(CRegExFSA& fsa, size_t from, size_t to) const;
90  };
91 
92  struct CRegXChar : public CRegX // /a/
93  {
94  CRegXChar(char c, bool neg = false) : m_Neg(neg) { m_Set.insert(c); }
95  CRegXChar(const set<unsigned char>& t, bool neg = false) : m_Neg(neg), m_Set(t) {}
96  void Set(const set<unsigned char>& t) { m_Set = t; }
97  void SetCaseInsensitive();
98  bool IsCaseInsensitive() const;
99  void Print(ostream& out, size_t off) const;
100  void Render(CRegExFSA& fsa, size_t from, size_t to) const;
101  bool m_Neg;
103  };
104 
105  struct CRegXTerm : public CRegX // /a?/ /a+/ /a*/ /a{1,2}/
106  {
107  CRegXTerm(unique_ptr<CRegX>& x, unsigned int min, unsigned int max, bool lazy = false) : m_RegX(std::move(x)), m_Min(min), m_Max(max), m_Lazy(lazy) {}
108  void SetCaseInsensitive() { m_RegX->SetCaseInsensitive(); }
109  bool IsCaseInsensitive() const { return m_RegX->IsCaseInsensitive(); }
110  void Print(ostream& out, size_t off) const;
111  void Render(CRegExFSA& fsa, size_t from, size_t to) const;
112  unique_ptr<CRegX> m_RegX;
113  unsigned int m_Min;
114  unsigned int m_Max;
115  bool m_Lazy;
116  };
117 
118  struct CRegXConcat : public CRegX // /abc/
119  {
121  CRegXConcat(vector<unique_ptr<CRegX> >& v) : m_Vec(std::move(v)) {}
122  void SetCaseInsensitive() { for (size_t n = 0; n < m_Vec.size(); n++) m_Vec[n]->SetCaseInsensitive(); }
123  bool IsCaseInsensitive() const { for (size_t n = 0; n < m_Vec.size(); n++) if (!m_Vec[n]->IsCaseInsensitive()) return false; return true; }
124  void Print(ostream& out, size_t off) const { PrintOffset(out, off); out << "<concat>\n"; for (size_t n = 0; n < m_Vec.size(); n++) m_Vec[n]->Print(out, off + 2); }
125  void Render(CRegExFSA& fsa, size_t from, size_t to) const;
126  vector<unique_ptr<CRegX> > m_Vec;
127  };
128 
129  struct CRegXSelect : public CRegX // /a|b|c/
130  {
132  CRegXSelect(vector<unique_ptr<CRegX> >& v) : m_Vec(std::move(v)) {}
133  void SetCaseInsensitive() { for (size_t n = 0; n < m_Vec.size(); n++) m_Vec[n]->SetCaseInsensitive(); }
134  bool IsCaseInsensitive() const { for (size_t n = 0; n < m_Vec.size(); n++) if (!m_Vec[n]->IsCaseInsensitive()) return false; return true; }
135  void Print(ostream& out, size_t off) const { PrintOffset(out, off); out << "<select>\n"; for (size_t n = 0; n < m_Vec.size(); n++) m_Vec[n]->Print(out, off + 2); }
136  void Render(CRegExFSA& fsa, size_t from, size_t to) const;
137  vector<unique_ptr<CRegX> > m_Vec;
138  };
139 
140  struct CRegXAssert : public CRegX // /^$\b\B(?=)(?!)/
141  {
142  enum EAssert {
146  eAssertWord, // \b
148  eAssertLookAhead, // (?=...) no FSM
149  eAssertLookAheadNeg, // (?!...) no FSM
150  eAssertLookBack, // (?<=...) will not support
151  eAssertLookBackNeg // (?<!...) will not support
152  };
154  CRegXAssert(EAssert a, unique_ptr<CRegX>& x) : m_Assert(a), m_RegX(std::move(x)) {}
155  void SetCaseInsensitive() { if (m_RegX) m_RegX->SetCaseInsensitive(); }
156  bool IsCaseInsensitive() const { return m_RegX ? m_RegX->IsCaseInsensitive() : true; }
157  virtual bool IsAssert() const { return true; }
158  void Print(ostream& out, size_t off) const;
159  void Render(CRegExFSA& fsa, size_t from, size_t to) const;
161  unique_ptr<CRegX> m_RegX;
162  };
163 
164  struct CRegXBackRef : public CRegX // /\1/
165  {
166  CRegXBackRef(unsigned int n) : m_Num(n) {}
167  bool IsCaseInsensitive() const override
168  { return false; }
169  void Print(ostream& out, size_t off) const override
170  { PrintOffset(out, off); out << "<bkref>\t" << m_Num << "\n"; }
171  void Render(CRegExFSA& /*fsa*/, size_t /*from*/, size_t /*to*/) const override
172  { throw string("back reference"); }
173  unsigned int m_Num;
174  };
175 
176  void x_Print(ostream& out) const;
177  void x_Parse();
178  void x_ParseOptions();
179  unique_ptr<CRegX> x_ParseSelect();
180  unique_ptr<CRegX> x_ParseConcat();
181  unique_ptr<CRegX> x_ParseTerm();
182  unique_ptr<CRegX> x_ParseAtom();
183  unique_ptr<CRegX> x_ParsePlain();
184  bool x_ParseRepeat(int& from, int& to, bool& lazy);
186  unsigned char x_ParseEscape();
187  int x_ParseHex(size_t len = 0);
188  int x_ParseDec(size_t len = 0);
191  void x_ThrowError(const string msg, size_t pos, size_t len);
192  void x_Consume(char c);
193 
194  string m_Str;
195  string m_Err;
196  size_t m_Cur;
197  CMultipatternSearch::TFlags m_Flag;
198  bool m_Unsupported; // RegEx is syntatically correct, but not supported: lookahead, back reference
199  unique_ptr<CRegX> m_RegX;
200  friend ostream& operator<<(ostream&, const CRegEx&);
201  friend class CRegExFSA;
202 };
203 
204 
206 {
207 public:
208  struct CRegExState
209  {
210  int m_Type;
217  CRegExState(unsigned char t = CRegEx::eTypePass) : m_Type(t), m_Trans({ { // may be faster than array::fill()
218  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
219  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
220  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
221  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
222  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
223  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
224  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
225  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
226  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
227  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
228  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
229  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
230  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
231  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
232  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
233  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
234  } }) {}
235  void Trans(unsigned char c, size_t n) { m_Trans[c] = n; };
236  void Short(size_t n) { m_Short.insert(n); };
237  void Emit(size_t n) { m_Emit.insert(n); };
238  };
239  struct THasher {
240  size_t operator()(const vector<size_t>& s) {
241  size_t ret = 0;
242  for (auto e : s) ret ^= hash<size_t>()((e));
243  return ret;
244  }
245  };
246  typedef vector<unique_ptr<CRegExState>> TStates;
247  typedef pair<size_t, CRegEx::EType> TNode;
248  typedef vector<TNode> TNodeSet;
250  typedef vector<TNodeSet> TNodeSetList;
252 
253  CRegExFSA();
254  size_t AddState(unsigned char t = CRegEx::eTypePass) { size_t n = m_States.size(); m_States.push_back(unique_ptr<CRegExState>(new CRegExState(t))); return n; }
255  void Trans(size_t x, unsigned char c, size_t y) { m_States[x]->Trans(c, y); };
256  void Short(size_t x, size_t y) { m_States[x]->Short(y); }
257  void Emit(size_t x, size_t n) { m_States[x]->Emit(n); }
258  void Create(const CRegEx& rx, size_t emit);
259  void Add(const CRegEx& rx);
260  void Add(const vector<unique_ptr<CRegEx>>& v);
261  void Merge(unique_ptr<CRegExFSA> fsa); // fsa will be consumed
262  void GenerateDotGraph(ostream& out) const;
263  void GenerateSourceCode(ostream& out) const;
264  void GenerateArrayMapData(ostream& out) const;
265  void Refine(); // build DSA from NSA
266  static size_t Collect(TScratch& VV, CRegEx::EType t, TStates& src, TStates& dest, TNodeSetMap& NM, TNodeSetList& NL, TNodeSet& NS, TScratch& HH);
267  static void Extend(size_t x, unsigned char c, TStates& src, TStates& dest, TNodeSetMap& NM, TNodeSetList& NL, TNodeSet& NS, TScratch& VV, TScratch& HH);
268  static void Push(size_t x, vector<size_t>& v, vector<size_t>& h) { // performance critical, keep it inline
269  size_t i;
270  for (i = 0; i < h.size(); ++i) {
271  if (h[i] == x) return;
272  if (h[i] > x) break;
273  }
274  v.push_back(x);
275  h.push_back(x);
276  for (size_t j = h.size() - 1; j > i; --j) h[j] = h[j - 1];
277  h[i] = x;
278  }
279  static bool In(size_t x, vector<size_t>& h) { // performance critical, keep it inline
280  size_t i;
281  for (i = 0; i < h.size(); ++i) {
282  if (h[i] == x) return true;
283  if (h[i] > x) break;
284  }
285  return false;
286  }
288  vector<string> m_Str;
289 };
290 
291 
293 
294 #endif /* UTIL___MULTIPATTERN_SEARCH_IMPL__HPP */
CMultipatternSearch.
pair< size_t, CRegEx::EType > TNode
vector< string > m_Str
vector< TNode > TNodeSet
static void Push(size_t x, vector< size_t > &v, vector< size_t > &h)
void GenerateArrayMapData(ostream &out) const
void Trans(size_t x, unsigned char c, size_t y)
void Add(const CRegEx &rx)
void Emit(size_t x, size_t n)
void GenerateSourceCode(ostream &out) const
static bool In(size_t x, vector< size_t > &h)
void Create(const CRegEx &rx, size_t emit)
map< TNodeSet, size_t > TNodeSetMap
static size_t Collect(TScratch &VV, CRegEx::EType t, TStates &src, TStates &dest, TNodeSetMap &NM, TNodeSetList &NL, TNodeSet &NS, TScratch &HH)
void GenerateDotGraph(ostream &out) const
void Short(size_t x, size_t y)
static void Extend(size_t x, unsigned char c, TStates &src, TStates &dest, TNodeSetMap &NM, TNodeSetList &NL, TNodeSet &NS, TScratch &VV, TScratch &HH)
array< vector< size_t >, 4 > TScratch
size_t AddState(unsigned char t=CRegEx::eTypePass)
void Merge(unique_ptr< CRegExFSA > fsa)
vector< TNodeSet > TNodeSetList
vector< unique_ptr< CRegExState > > TStates
CRegEx(const string &s, CMultipatternSearch::TFlags f=0)
unique_ptr< CRegX > x_ParseAtom()
unique_ptr< CRegX > x_ParsePlain()
unique_ptr< CRegX > x_ParseConcat()
void x_Consume(char c)
void x_ParseSquare(set< unsigned char > &t)
bool x_ParseRepeat(int &from, int &to, bool &lazy)
void x_Print(ostream &out) const
void x_ThrowError(const string msg, size_t pos, size_t len)
unsigned char x_ParseEscape()
CRegEx(const char *s, CMultipatternSearch::TFlags f=0)
unique_ptr< CRegX > x_ParseTerm()
int x_ParseHex(size_t len=0)
CMultipatternSearch::TFlags m_Flag
void x_ThrowUnexpectedCharacter()
int x_ParseDec(size_t len=0)
friend ostream & operator<<(ostream &, const CRegEx &)
void x_ThrowUnexpectedEndOfLine()
unique_ptr< CRegX > m_RegX
static bool IsWordCharacter(unsigned char c)
void x_ParseOptions()
unique_ptr< CRegX > x_ParseSelect()
Definition: map.hpp:338
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
std::ofstream out("events_result.xml")
main entry point for tests
#define bool
Definition: bool.h:34
#define HH(a, b, c, d, x, s)
Definition: md4.c:190
string
Definition: cgiapp.hpp:687
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
int i
yy_size_t n
int len
Simultaneous search of multiple RegEx patterns in the input string.
unsigned int a
Definition: ncbi_localip.c:102
EIPRangeType t
Definition: ncbi_localip.c:101
T max(T x_, T y_)
T min(T x_, T y_)
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
CRegExState(unsigned char t=CRegEx::eTypePass)
void Trans(unsigned char c, size_t n)
size_t operator()(const vector< size_t > &s)
virtual bool IsAssert() const
void Print(ostream &out, size_t off) const
CRegXAssert(EAssert a, unique_ptr< CRegX > &x)
void Render(CRegExFSA &fsa, size_t from, size_t to) const
void Render(CRegExFSA &, size_t, size_t) const override
void Print(ostream &out, size_t off) const override
bool IsCaseInsensitive() const override
CRegXChar(char c, bool neg=false)
void Set(const set< unsigned char > &t)
CRegXChar(const set< unsigned char > &t, bool neg=false)
void Render(CRegExFSA &fsa, size_t from, size_t to) const
void Print(ostream &out, size_t off) const
set< unsigned char > m_Set
bool IsCaseInsensitive() const
void Print(ostream &out, size_t off) const
void Render(CRegExFSA &fsa, size_t from, size_t to) const
vector< unique_ptr< CRegX > > m_Vec
CRegXConcat(vector< unique_ptr< CRegX > > &v)
void Render(CRegExFSA &fsa, size_t from, size_t to) const
void Print(ostream &out, size_t off) const
void Print(ostream &out, size_t off) const
CRegXSelect(vector< unique_ptr< CRegX > > &v)
void Render(CRegExFSA &fsa, size_t from, size_t to) const
vector< unique_ptr< CRegX > > m_Vec
unique_ptr< CRegX > m_RegX
void Print(ostream &out, size_t off) const
CRegXTerm(unique_ptr< CRegX > &x, unsigned int min, unsigned int max, bool lazy=false)
void Render(CRegExFSA &fsa, size_t from, size_t to) const
static void DummyTrans(CRegExFSA &fsa, size_t x, unsigned char t)
virtual void SetCaseInsensitive()
static void PrintOffset(ostream &out, size_t off)
virtual bool IsCaseInsensitive() const =0
virtual void Render(CRegExFSA &fsa, size_t from, size_t to) const =0
virtual bool IsAssert() const
virtual void Print(ostream &out, size_t off) const =0
Definition: _hash_fun.h:40
Modified on Wed May 29 18:42:58 2024 by modify_doxy.py rev. 669887