NCBI C++ ToolKit
multipattern_search.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: multipattern_search.cpp 100700 2023-08-31 19:20:11Z lavr $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Sema
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
35 
37 
39 
42 
43 void CMultipatternSearch::AddPattern(const char* s, TFlags f) { m_FSM->Add(CRegEx(s, f)); }
44 
45 void CMultipatternSearch::AddPatterns(const vector<string>& patterns)
46 {
47  vector<unique_ptr<CRegEx>> v;
48  for (const string& s : patterns) {
49  v.push_back(unique_ptr<CRegEx>(new CRegEx(s)));
50  }
51  m_FSM->Add(v);
52 }
53 
54 void CMultipatternSearch::AddPatterns(const vector<pair<string, TFlags>>& patterns)
55 {
56  vector<unique_ptr<CRegEx>> v;
57  for (auto& p : patterns) {
58  v.push_back(unique_ptr<CRegEx>(new CRegEx(p.first, p.second)));
59  }
60  m_FSM->Add(v);
61 }
62 
63 void CMultipatternSearch::GenerateDotGraph(ostream& out) const { m_FSM->GenerateDotGraph(out); }
64 
65 void CMultipatternSearch::GenerateArrayMapData(ostream& out) const { m_FSM->GenerateArrayMapData(out); }
66 
67 void CMultipatternSearch::GenerateSourceCode(ostream& out) const { m_FSM->GenerateSourceCode(out); }
68 
70 {
71  string out;
72  for (auto c : str) {
73  switch (c) {
74  case ' ':
75  out += "\\s+";
76  break;
77  case '^':
78  case '$':
79  case '(':
80  case ')':
81  case '[':
82  case ']':
83  case '\\':
84  case '/':
85  case '?':
86  case '*':
87  case '+':
88  case '.':
89  case '|':
90  out += '\\';
91  out += c;
92  break;
93  default:
94  out += c;
95  }
96  }
97  return out;
98 }
99 
100 ////////////////////////////////////////////////////////////////
101 
102 static void xMultiPatternSearch(const char* input, const CRegExFSA& fsa, CMultipatternSearch::BoolCall2 report)
103 {
104  const unsigned char* p = reinterpret_cast<const unsigned char*>(input);
105  size_t state = 1;
106 
107  const auto& emit1 = fsa.m_States[state]->m_Emit;
108  for (auto e : emit1) {
109  if (report(e, p - (const unsigned char*)input)) {
110  return;
111  }
112  }
113  while (true) {
114  state = fsa.m_States[state]->m_Trans[*p];
115  const auto& emit2 = fsa.m_States[state]->m_Emit;
116  for (auto e : emit2) {
117  if (report(e, p - (const unsigned char*)input)) {
118  return;
119  }
120  }
121  if (!*p) {
122  return;
123  }
124  ++p;
125  }
126 }
127 
128 #if 0
129  // original code, left here for reference
130  const unsigned char* p = reinterpret_cast<const unsigned char*>(input);
131  size_t state = 0;
132 
133  if (emit[state]) {
134  for (auto e : hits.at(state)) {
135  report(e);
136  }
137  }
138  while (true) {
139  state = states[state * 256 + *p];
140  if (emit[state]) {
141  for (auto e : hits.at(state)) {
142  report(e);
143  }
144  }
145  if (!*p) {
146  return;
147  }
148  ++p;
149  }
150 #endif
151 
152 static void xMultiPatternSearch(const char* input, const CCompiledFSM& fsm, CMultipatternSearch::BoolCall2 report)
153 {
154  const unsigned char* p = reinterpret_cast<const unsigned char*>(input);
155  CCompiledFSM::index_type state = 0;
156 
157  if (fsm.m_emit.test(state)) {
158  for (auto e : fsm.get_hits(state)) {
159  if (report(e, p - (const unsigned char*)input))
160  return;
161  }
162  }
163  while (true) {
164  state = fsm.m_states.at(uintptr_t(state) * 256 + *p);
165  if (fsm.m_emit.test(state)) {
166  for (auto e : fsm.get_hits(state)) {
167  if (report(e, p - (const unsigned char*)input))
168  return;
169  }
170  }
171  if (!*p) {
172  return;
173  }
174  ++p;
175  }
176 }
177 
178 void CMultipatternSearch::Search(const char* input, VoidCall1 report) const
179 {
180  BoolCall2 call = [report](size_t p1, size_t /*p2*/) { report(p1); return false; };
182 }
183 
184 
185 void CMultipatternSearch::Search(const char* input, VoidCall2 report) const
186 {
187  BoolCall2 call = [report](size_t p1, size_t p2) { report(p1, p2); return false; };
189 }
190 
191 
192 void CMultipatternSearch::Search(const char* input, BoolCall1 report) const
193 {
194  BoolCall2 call = [report](size_t p1, size_t /*p2*/) { return report(p1); };
196 }
197 
198 
199 void CMultipatternSearch::Search(const char* input, BoolCall2 report) const
200 {
201  BoolCall2 call = [report](size_t p1, size_t p2) { return report(p1, p2); };
203 }
204 
205 
206 void CMultipatternSearch::Search(const char* input, const CCompiledFSM& fsm, VoidCall1 report)
207 {
208  BoolCall2 call = [report](size_t p1, size_t /*p2*/) { report(p1); return false; };
209  xMultiPatternSearch(input, fsm, call);
210 }
211 
212 
213 void CMultipatternSearch::Search(const char* input, const CCompiledFSM& fsm, VoidCall2 report)
214 {
215  BoolCall2 call = [report](size_t p1, size_t p2) { report(p1, p2); return false; };
216  xMultiPatternSearch(input, fsm, call);
217 }
218 
219 
220 void CMultipatternSearch::Search(const char* input, const CCompiledFSM& fsm, BoolCall1 report)
221 {
222  BoolCall2 call = [report](size_t p1, size_t /*p2*/) { return report(p1); };
223  xMultiPatternSearch(input, fsm, call);
224 }
225 
226 
227 void CMultipatternSearch::Search(const char* input, const CCompiledFSM& fsm, BoolCall2 report)
228 {
229  BoolCall2 call = [report](size_t p1, size_t p2) { return report(p1, p2); };
230  xMultiPatternSearch(input, fsm, call);
231 }
232 
233 
234 ////////////////////////////////////////////////////////////////
235 
237 {
238  throw (string) "unexpected end of string";
239 }
240 
241 
243 {
244  ostringstream oss;
245  oss << "unexpected character "
246  << (m_Str[m_Cur] == '\'' ? '\"' : '\'') << m_Str[m_Cur] << (m_Str[m_Cur] == '\'' ? '\"' : '\'')
247  << " in position " << (m_Cur + 1);
248  throw oss.str();
249 }
250 
251 
252 void CRegEx::x_ThrowError(const string msg, size_t pos, size_t len)
253 {
254  ostringstream oss;
255  oss << msg << " \'" << m_Str.substr(pos, len) << "\' in position " << (pos + 1);
256  throw oss.str();
257 }
258 
259 
260 void CRegEx::x_Consume(char c)
261 {
262  if (m_Cur >= m_Str.length()) {
264  }
265  if (m_Str[m_Cur] != c) {
267  }
268  m_Cur++;
269 }
270 
271 
273 {
274  m_Cur = 0;
275  try {
276  if (m_Cur < m_Str.length() && m_Str[m_Cur] == '/') {
277  m_Cur++;
278  m_RegX = x_ParseSelect();
279  x_Consume('/');
280  x_ParseOptions();
281  }
282  else {
283  m_RegX = x_ParsePlain();
285  m_RegX->SetCaseInsensitive();
286  }
287  }
288  }
289  catch (const string& s) {
290  m_Err = s;
291  m_RegX.reset();
292  }
293 }
294 
295 
296 unique_ptr<CRegEx::CRegX> CRegEx::x_ParsePlain() // plain string
297 {
298  vector<unique_ptr<CRegX> > V;
300  V.push_back(unique_ptr<CRegX>(new CRegXAssert(CRegXAssert::eAssertBegin)));
301  }
303  V.push_back(unique_ptr<CRegX>(new CRegXAssert(CRegXAssert::eAssertWord)));
304  }
305  for (size_t i = 0; i < m_Str.length(); i++) {
306  V.push_back(unique_ptr<CRegX>(new CRegXChar(m_Str[i])));
307  }
309  V.push_back(unique_ptr<CRegX>(new CRegXAssert(CRegXAssert::eAssertEnd)));
310  }
312  V.push_back(unique_ptr<CRegX>(new CRegXAssert(CRegXAssert::eAssertWord)));
313  }
314  if (V.empty()) {
315  return unique_ptr<CRegX>(new CRegXEmpty());
316  }
317  if (V.size() == 1) {
318  return std::move(V[0]);
319  }
320  return unique_ptr<CRegX>(new CRegXConcat(V));
321 }
322 
323 
324 unique_ptr<CRegEx::CRegX> CRegEx::x_ParseSelect() // e.g.: /a|b|c/
325 {
326  vector<unique_ptr<CRegX> > V;
327  while (unique_ptr<CRegX> x = x_ParseConcat()) {
328  V.push_back(std::move(x));
329  if (m_Cur >= m_Str.length() || m_Str[m_Cur] != '|') {
330  break;
331  }
332  m_Cur++;
333  }
334  if (V.empty()) {
335  return unique_ptr<CRegX>(new CRegXEmpty());
336  }
337  if (V.size() == 1) {
338  return std::move(V[0]);
339  }
340  return unique_ptr<CRegX>(new CRegXSelect(V));
341 }
342 
343 
344 unique_ptr<CRegEx::CRegX> CRegEx::x_ParseConcat() // e.g.: /abc/
345 {
346  vector<unique_ptr<CRegX> > V;
347  while (unique_ptr<CRegX> x = x_ParseTerm()) {
348  if (*x) {
349  V.push_back(std::move(x));
350  }
351  if (m_Cur >= m_Str.length() || m_Str[m_Cur] == '|' || m_Str[m_Cur] == ')' || m_Str[m_Cur] == '/') {
352  break;
353  }
354  }
355  if (V.empty()) {
356  return unique_ptr<CRegX>(new CRegXEmpty());
357  }
358  if (V.size() == 1) {
359  return std::move(V[0]);
360  }
361  return unique_ptr<CRegX>(new CRegXConcat(V));
362 }
363 
364 
365 unique_ptr<CRegEx::CRegX> CRegEx::x_ParseTerm() // e.g.: /a+/ /a?/ /a{2,3}/
366 {
367  if (m_Cur >= m_Str.length()) {
368  return 0;
369  }
370  int from, to;
371  bool lazy;
372  size_t k = m_Cur;
373  if (x_ParseRepeat(from, to, lazy)) {
374  x_ThrowError("nothing to repeat:", k, m_Cur - k);
375  }
376  unique_ptr<CRegX> x = x_ParseAtom();
377  k = m_Cur;
378  if (x && !x->IsAssert() && x_ParseRepeat(from, to, lazy)) {
379  if (to && from > to) {
380  x_ThrowError("numbers out of order:", k, m_Cur - k);
381  }
382  return unique_ptr<CRegX>(new CRegXTerm(x, from, to, lazy));
383  }
384  return x;
385 }
386 
387 
388 bool CRegEx::x_ParseRepeat(int& from, int& to, bool& lazy)
389 {
390  if (m_Cur >= m_Str.length()) {
391  return false;
392  }
393  switch (m_Str[m_Cur]) {
394  case '?':
395  m_Cur++;
396  from = 0;
397  to = 1;
398  break;
399  case '+':
400  m_Cur++;
401  from = 1;
402  to = 0;
403  break;
404  case '*':
405  m_Cur++;
406  from = 0;
407  to = 0;
408  break;
409  case '{': // {3} {3,} {,5} {3,5}, but not {} {,}
410  {
411  size_t k = m_Cur;
412  m_Cur++;
413  from = x_ParseDec();
414  if (from >= 0 && m_Cur < m_Str.length() && m_Str[m_Cur] == '}') {
415  m_Cur++;
416  to = from;
417  break;
418  }
419  else if (m_Cur < m_Str.length() && m_Str[m_Cur] == ',') {
420  m_Cur++;
421  to = x_ParseDec();
422  if ((from >= 0 || to >= 0) && m_Cur < m_Str.length() && m_Str[m_Cur] == '}') {
423  m_Cur++;
424  from = from < 0 ? 0 : from;
425  to = to < 0 ? 0 : to;
426  break;
427  }
428  }
429  m_Cur = k;
430  return false;
431  }
432  default:
433  return false;
434  }
435  lazy = false;
436  if (m_Cur < m_Str.length() && m_Str[m_Cur] == '?') {
437  m_Cur++;
438  lazy = true;
439  }
440  return true;
441 }
442 
443 
444 static inline void InsertDigit(set<unsigned char>& t)
445 {
446  for (unsigned char c = '0'; c <= '9'; c++) {
447  t.insert(c);
448  }
449 }
450 
451 
452 static inline void InsertNoDigit(set<unsigned char>& t)
453 {
454  for (unsigned c = 1; c <= 255; c++) {
455  if (c < '0' || c > '9') {
456  t.insert((unsigned char)c);
457  }
458  }
459 }
460 
461 
462 static inline void InsertAlpha(set<unsigned char>& t)
463 {
464  for (unsigned char c = '0'; c <= '9'; c++) {
465  t.insert(c);
466  }
467  for (unsigned char c = 'A'; c <= 'Z'; c++) {
468  t.insert(c);
469  }
470  for (unsigned char c = 'a'; c <= 'z'; c++) {
471  t.insert(c);
472  }
473  t.insert('_');
474 }
475 
476 
477 static inline void InsertNoAlpha(set<unsigned char>& t)
478 {
479  for (unsigned c = 1; c <= 255; c++) {
480  if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_') {
481  continue;
482  }
483  t.insert((unsigned char)c);
484  }
485 }
486 
487 
488 static inline void InsertSpace(set<unsigned char>& t)
489 {
490  t.insert(' ');
491  t.insert('\f');
492  t.insert('\n');
493  t.insert('\r');
494  t.insert('\t');
495  t.insert('\v');
496 }
497 
498 
499 static inline void InsertNoSpace(set<unsigned char>& t)
500 {
501  for (unsigned c = 1; c <= 255; c++) {
502  if (c != ' ' && c != '\f' && c != '\n' && c != '\r' && c != '\t' && c != '\v') {
503  t.insert((unsigned char)c);
504  }
505  }
506 }
507 
508 
509 unique_ptr<CRegEx::CRegX> CRegEx::x_ParseAtom() // single character or an expression in braces
510 {
511  switch (m_Str[m_Cur]) {
512  case '|':
513  case ')':
514  case '/':
515  return 0;
516  case '.':
517  {
518  m_Cur++;
520  for (int c = 1; c < 256; c++) {
521  t.insert((unsigned char)c);
522  }
523  return unique_ptr<CRegX>(new CRegXChar(t));
524  }
525  case '^':
526  m_Cur++;
527  return unique_ptr<CRegX>(new CRegXAssert(CRegXAssert::eAssertBegin));
528  case '$':
529  m_Cur++;
530  return unique_ptr<CRegX>(new CRegXAssert(CRegXAssert::eAssertEnd));
531  case '(':
532  {
533  m_Cur++;
535  if (m_Cur + 2 < m_Str.length() && m_Str[m_Cur] == '?') {
536  if (m_Str[m_Cur + 1] == ':') { // non-capturing parentheses: (?:...)
537  m_Cur += 2;
538  }
539  else if (m_Str[m_Cur + 1] == '=') { // lookahead is unsupported: (?=...)
541  m_Cur += 2;
542  }
543  else if (m_Str[m_Cur + 1] == '!') { // lookahead is unsupported: (?!...)
545  m_Cur += 2;
546  }
547  else if (m_Str[m_Cur + 1] == '<' && m_Cur + 2 < m_Str.length()) {
548  if (m_Str[m_Cur + 2] == '=') { // lookahead is unsupported: (?<=...)
550  m_Cur += 3;
551  }
552  else if (m_Str[m_Cur + 2] == '!') { // lookahead is unsupported: (?<!...)
554  m_Cur += 3;
555  }
556  }
557  }
558  unique_ptr<CRegX> x = x_ParseSelect();
559  x_Consume(')');
561  m_Unsupported = true;
562  return unique_ptr<CRegX>(new CRegXAssert(assert, x));
563  }
564  return x;
565  }
566  case '[':
567  {
568  m_Cur++;
569  bool neg = false;
571  if (m_Cur + 1 < m_Str.length() && m_Str[m_Cur] == '^') {
572  neg = true;
573  m_Cur++;
574  }
575  x_ParseSquare(t);
576  x_Consume(']');
577  return unique_ptr<CRegX>(new CRegXChar(t, neg));
578  }
579  case '\\':
580  m_Cur++;
581  if (m_Cur >= m_Str.length()) {
583  }
584  switch (m_Str[m_Cur]) {
585  case 'b':
586  m_Cur++;
587  return unique_ptr<CRegX>(new CRegXAssert(CRegXAssert::eAssertWord));
588  case 'B':
589  m_Cur++;
590  return unique_ptr<CRegX>(new CRegXAssert(CRegXAssert::eAssertWordNeg));
591  case 'd':
592  case 'D':
593  {
595  InsertDigit(t);
596  return unique_ptr<CRegX>(new CRegXChar(t, m_Str[m_Cur++] == 'D'));
597  }
598  case 'w':
599  case 'W':
600  {
602  InsertAlpha(t);
603  return unique_ptr<CRegX>(new CRegXChar(t, m_Str[m_Cur++] == 'W'));
604  }
605  case 's':
606  case 'S':
607  {
609  InsertSpace(t);
610  return unique_ptr<CRegX>(new CRegXChar(t, m_Str[m_Cur++] == 'S'));
611  }
612  case '1':
613  case '2':
614  case '3':
615  case '4':
616  case '5':
617  case '6':
618  case '7':
619  case '8':
620  case '9':
621  m_Unsupported = true;
622  return unique_ptr<CRegX>(new CRegXBackRef(m_Str[m_Cur++] - '0'));
623  default:
624  {
625  unsigned char c = x_ParseEscape();
626  return unique_ptr<CRegX>(new CRegXChar(c));
627  }
628  }
629  default:
630  return unique_ptr<CRegX>(new CRegXChar(m_Str[m_Cur++]));
631  }
632 }
633 
634 
636 {
637  unsigned char range = 0; // 0: initial; 1: valid begining; 2: dash; 3: valid ending;
638  unsigned char curr = 0;
639  unsigned char prev = 0;
640  size_t pos = 0;
641  size_t prev_pos = 0;
642 
643  // Commented out Perl syntax: closing square bracket is treated as regular character if in the first position
644  //if (m_Cur < m_Str.length() && m_Str[m_Cur] == ']') {
645  // range = 1;
646  // prev = m_Str[m_Cur++];
647  // t.insert(prev);
648  //}
649  while (m_Cur < m_Str.length()) {
650  pos = m_Cur;
651  switch (m_Str[m_Cur]) {
652  case ']':
653  if (range == 2) {
654  t.insert('-');
655  }
656  return;
657  case '-':
658  curr = '-';
659  range++;
660  break;
661  case '\\':
662  m_Cur++;
663  if (m_Cur >= m_Str.length()) {
665  }
666  switch (m_Str[m_Cur]) {
667  case 'd':
668  InsertDigit(t);
669  if (range == 2) {
670  t.insert('-');
671  }
672  range = 0;
673  break;
674  case 'D':
675  InsertNoDigit(t);
676  if (range == 2) {
677  t.insert('-');
678  }
679  range = 0;
680  break;
681  case 'w':
682  InsertAlpha(t);
683  if (range == 2) {
684  t.insert('-');
685  }
686  range = 0;
687  break;
688  case 'W':
689  InsertNoAlpha(t);
690  if (range == 2) {
691  t.insert('-');
692  }
693  range = 0;
694  break;
695  case 's':
696  InsertSpace(t);
697  if (range == 2) {
698  t.insert('-');
699  }
700  range = 0;
701  break;
702  case 'S':
703  InsertNoSpace(t);
704  if (range == 2) {
705  t.insert('-');
706  }
707  range = 0;
708  break;
709  default:
710  curr = x_ParseEscape();
711  m_Cur--;
712  if (range != 1) {
713  range++;
714  }
715  }
716  break;
717  default:
718  curr = m_Str[m_Cur];
719  if (range != 1) {
720  range++;
721  }
722  }
723  if (range == 3) {
724  if (prev > curr) {
725  x_ThrowError("invalid range:", prev_pos, m_Cur - prev_pos + 1);
726  }
727  for (unsigned int n = prev; n <= curr; n++) {
728  t.insert((unsigned char)n);
729  }
730  range = 0;
731  }
732  if (range == 1) {
733  t.insert(curr);
734  prev = curr;
735  prev_pos = pos;
736  }
737  m_Cur++;
738  }
739 }
740 
741 
742 unsigned char CRegEx::x_ParseEscape()
743 {
744  switch (m_Str[m_Cur]) {
745  case '0':
746  m_Cur++;
747  return 0;
748  case 'b':
749  m_Cur++;
750  return '\b';
751  case 'c': // control letter if followed by letter, otherwise just 'c'
752  m_Cur++;
753  if (m_Cur < m_Str.length()) {
754  if (m_Str[m_Cur] >= 'A' && m_Str[m_Cur] <= 'Z') {
755  return (unsigned char)(m_Str[m_Cur++] - 'A' + 1);
756  }
757  if (m_Str[m_Cur] >= 'a' && m_Str[m_Cur] <= 'z') {
758  return (unsigned char)(m_Str[m_Cur++] - 'a' + 1);
759  }
760  }
761  return 'c';
762  case 'f':
763  m_Cur++;
764  return '\f';
765  case 'n':
766  m_Cur++;
767  return '\n';
768  case 'r':
769  m_Cur++;
770  return '\r';
771  case 't':
772  m_Cur++;
773  return '\t';
774  case 'v':
775  m_Cur++;
776  return '\v';
777  case 'x':
778  m_Cur++;
779  if (m_Cur < m_Str.length()) {
780  int n = x_ParseHex(2);
781  if (n >= 0) {
782  return (unsigned char)n;
783  }
784  }
785  return 'x';
786  case 'u':
787  m_Cur++;
788  if (m_Cur + 1 < m_Str.length() && m_Str[m_Cur] == '{') { // \u{XXXX}
789  size_t k = m_Cur;
790  m_Cur++;
791  int n = x_ParseHex(4);
792  if (n >= 0 && m_Cur < m_Str.length() && m_Str[m_Cur] == '}') {
793  m_Cur++;
794  if (n > 255) {
795  m_Unsupported = true;
796  return 0;
797  }
798  return (unsigned char)n;
799  }
800  m_Cur = k;
801  }
802  else if (m_Cur < m_Str.length()) { // \uXXXX
803  int n = x_ParseHex(4);
804  if (n >= 0) {
805  if (n > 255) {
806  m_Unsupported = true;
807  return 0;
808  }
809  return (unsigned char)n;
810  }
811  }
812  return 'u';
813  default:
814  return m_Str[m_Cur++];
815  }
816 }
817 
818 
820 {
821  int x = 0;
822  for (size_t n = 0; n < len || !len; n++) {
823  if (m_Cur < m_Str.length()) {
824  char& c = m_Str[m_Cur];
825  if (c >= '0' && c <= '9') {
826  x = x * 16 + c - '0';
827  }
828  else if (c >= 'A' && c <= 'F') {
829  x = x * 16 + 10 + c - 'A';
830  }
831  else if (c >= 'a' && c <= 'f') {
832  x = x * 16 + 10 + c - 'a';
833  }
834  else {
835  return n ? x : -1;
836  }
837  m_Cur++;
838  }
839  else {
840  return n ? x : -1;
841  }
842  }
843  return x;
844 }
845 
846 
848 {
849  int x = 0;
850  for (size_t n = 0; n < len || !len; n++) {
851  if (m_Cur < m_Str.length()) {
852  char& c = m_Str[m_Cur];
853  if (c >= '0' && c <= '9') {
854  x = x * 10 + c - '0';
855  }
856  else {
857  return n ? x : -1;
858  }
859  m_Cur++;
860  }
861  else {
862  return n ? x : -1;
863  }
864  }
865  return x;
866 }
867 
868 
869 void CRegEx::x_ParseOptions() // options after the final slash, e.g.: /***/i
870 {
871  for (; m_Cur < m_Str.length(); m_Cur++) {
872  switch (m_Str[m_Cur]) {
873  case 'i':
874  m_RegX->SetCaseInsensitive();
875  break;
876  case 'g': // [ignored] global
877  case 'm': // [ignored] multi-line
878  case 'u': // [ignored] unicode
879  case 'y': // [ignored] sticky
880  break;
881  default:
883  }
884  }
885 }
886 
887 // Printing
888 
889 ostream& operator<<(ostream& ostr, const CRegEx& regex)
890 {
891  regex.x_Print(ostr);
892  return ostr;
893 }
894 
895 
896 void CRegEx::x_Print(ostream& ostr) const
897 {
898  ostr << "<<RegEx>> " << m_Str << "\n";
899  if (m_Err.length()) {
900  ostr << " <ERROR>\t" << m_Err << "\n";
901  }
902  else {
903  m_RegX->Print(ostr, 2);
904  }
905 }
906 
907 
908 void CRegEx::CRegXChar::Print(ostream& out, size_t off) const
909 {
910  PrintOffset(out, off);
911  out << (m_Neg ? "<char>!\t" : "<char>\t");
912  if (m_Set.empty()) {
913  out << "<empty>";
914  }
915  for (auto it = m_Set.begin(); it != m_Set.end(); it++) {
916  switch (*it) {
917  case 0:
918  out << "\\0";
919  break;
920  case '\b':
921  out << "\\b";
922  break;
923  case '\f':
924  out << "\\f";
925  break;
926  case '\n':
927  out << "\\n";
928  break;
929  case '\r':
930  out << "\\r";
931  break;
932  case '\t':
933  out << "\\t";
934  break;
935  case '\v':
936  out << "\\v";
937  break;
938  default:
939  out << *it;
940  }
941  }
942  out << "\n";
943 }
944 
945 
946 void CRegEx::CRegXTerm::Print(ostream& out, size_t off) const
947 {
948  PrintOffset(out, off);
949  out << "<repeat>\t" << m_Min << " : ";
950  if (m_Max) {
951  out << m_Max;
952  }
953  else {
954  out << "inf";
955  }
956  out << (m_Lazy ? " : lazy\n" : "\n");
957  m_RegX->Print(out, off + 2);
958 }
959 
960 
961 void CRegEx::CRegXAssert::Print(ostream& out, size_t off) const
962 {
963  static const string A[] = { "error", "beginning of string", "end of string", "word boundary", "not word boundary", "look ahead", "look ahead negative", "look back", "look back negative" };
964  PrintOffset(out, off);
965  out << "<assert>\t" << A[m_Assert] << "\n";
966  if (m_RegX) {
967  m_RegX->Print(out, off + 2);
968  }
969 }
970 
971 
973 {
974  for (unsigned char c = 'A'; c <= 'Z'; c++) {
975  if (m_Set.find(c) != m_Set.end()) {
976  m_Set.insert((unsigned char)(c + 32));
977  }
978  else if (m_Set.find((unsigned char)(c + 32)) != m_Set.end()) {
979  m_Set.insert(c);
980  }
981  }
982 }
983 
984 
986 {
987  for (unsigned char c = 'A'; c <= 'Z'; c++) {
988  if ((m_Set.find(c) == m_Set.end()) != (m_Set.find((unsigned char)(c + 32)) == m_Set.end())) {
989  return false;
990  }
991  }
992  return true;
993 }
994 
995 
996 // Render
997 
998 void CRegEx::CRegXEmpty::Render(CRegExFSA& fsa, size_t from, size_t to) const
999 {
1000  fsa.Short(from, to);
1001 }
1002 
1003 
1004 void CRegEx::CRegXChar::Render(CRegExFSA& fsa, size_t from, size_t to) const
1005 {
1006  size_t x = fsa.AddState(eTypeStop | eTypeWord | eTypeNoWord);
1007  for (unsigned n = 1; n < 256; n++) {
1008  unsigned char c = (unsigned char)n;
1009  if ((m_Set.find(c) == m_Set.end()) == m_Neg) {
1010  fsa.Trans(from, c, x);
1011  fsa.Short(x, to);
1012  }
1013  }
1014 }
1015 
1016 
1017 void CRegEx::CRegXConcat::Render(CRegExFSA& fsa, size_t from, size_t to) const
1018 {
1019  if (m_Vec.empty()) {
1020  fsa.Short(from, to);
1021  return;
1022  }
1023  size_t current = from;
1024  for (size_t n = 0; n < m_Vec.size(); n++) {
1025  size_t next = to;
1026  if (n < m_Vec.size() - 1) {
1027  next = fsa.AddState();
1028  }
1029  m_Vec[n]->Render(fsa, current, next);
1030  current = next;
1031  }
1032 }
1033 
1034 
1035 void CRegEx::CRegXSelect::Render(CRegExFSA& fsa, size_t from, size_t to) const
1036 {
1037  if (m_Vec.empty()) {
1038  fsa.Short(from, to);
1039  return;
1040  }
1041  for (size_t n = 0; n < m_Vec.size(); n++) {
1042  size_t current = fsa.AddState();
1043  fsa.Short(from, current);
1044  m_Vec[n]->Render(fsa, current, to);
1045  }
1046 }
1047 
1048 
1049 void CRegEx::CRegXTerm::Render(CRegExFSA& fsa, size_t from, size_t to) const
1050 {
1051  size_t current = from;
1052  size_t last = current;
1053  for (size_t n = 0; n < m_Min; n++) {
1054  size_t next = to;
1055  if (n + 1 < m_Min || n + 1 < m_Max) {
1056  next = fsa.AddState();
1057  }
1058  m_RegX->Render(fsa, current, next);
1059  last = current;
1060  current = next;
1061  }
1062  if (!m_Max) {
1063  if (!m_Min) {
1064  m_RegX->Render(fsa, from, to);
1065  fsa.Short(from, to);
1066  }
1067  fsa.Short(to, last);
1068  return;
1069  }
1070  for (size_t n = m_Min; n < m_Max; n++) {
1071  size_t next = to;
1072  if (n + 1 < m_Max) {
1073  next = fsa.AddState();
1074  }
1075  m_RegX->Render(fsa, current, next);
1076  fsa.Short(current, to);
1077  current = next;
1078  }
1079 }
1080 
1081 
1082 void CRegEx::CRegX::DummyTrans(CRegExFSA& fsa, size_t x, unsigned char t)
1083 {
1084  size_t y;
1085  if (t & CRegEx::eTypeStop) {
1086  y = fsa.AddState(CRegEx::eTypeStop);
1087  fsa.Trans(x, 0, y);
1088  }
1089  if (t & CRegEx::eTypeWord) {
1090  y = fsa.AddState(CRegEx::eTypeWord);
1091  for (unsigned n = 1; n < 256; n++) {
1092  if (CRegEx::IsWordCharacter((unsigned char)n)) {
1093  fsa.Trans(x, (unsigned char)n, y);
1094  }
1095  }
1096  }
1097  if (t & CRegEx::eTypeNoWord) {
1098  y = fsa.AddState(CRegEx::eTypeNoWord);
1099  for (unsigned n = 1; n < 256; n++) {
1100  if (!CRegEx::IsWordCharacter((unsigned char)n)) {
1101  fsa.Trans(x, (unsigned char)n, y);
1102  }
1103  }
1104  }
1105 }
1106 
1107 
1108 void CRegEx::CRegXAssert::Render(CRegExFSA& fsa, size_t from, size_t to) const
1109 {
1110  size_t x;
1111  switch (m_Assert) {
1112  case eAssertBegin: // ^
1113  x = fsa.AddState(eTypeStart);
1114  fsa.Short(from, x);
1115  fsa.Short(x, to);
1116  return;
1117  case eAssertEnd: // $
1118  x = fsa.AddState(eTypePass | eTypeToStop);
1119  DummyTrans(fsa, x, eTypeStop);
1120  fsa.Short(from, x);
1121  fsa.Short(x, to);
1122  return;
1123  case eAssertWord: // \b
1125  DummyTrans(fsa, x, eTypeWord);
1126  fsa.Short(from, x);
1127  fsa.Short(x, to);
1129  DummyTrans(fsa, x, eTypeNoWord);
1130  DummyTrans(fsa, x, eTypeStop);
1131  fsa.Short(from, x);
1132  fsa.Short(x, to);
1133  return;
1134  case eAssertWordNeg: // \B
1136  DummyTrans(fsa, x, eTypeNoWord);
1137  DummyTrans(fsa, x, eTypeStop);
1138  fsa.Short(from, x);
1139  fsa.Short(x, to);
1140  x = fsa.AddState(eTypeWord | eTypeToWord);
1141  DummyTrans(fsa, x, eTypeWord);
1142  fsa.Short(from, x);
1143  fsa.Short(x, to);
1144  return;
1145  case eAssertLookAhead:
1146  throw string("(?=...) - lookahead is not supported");
1147  case eAssertLookAheadNeg:
1148  throw string("(?!...) - lookahead is not supported");
1149  case eAssertLookBack:
1150  throw string("(?<=...) - lookback is not supported");
1151  case eAssertLookBackNeg:
1152  throw string("(?<!...) - lookback is not supported");
1153  default:
1154  return;
1155  }
1156 }
1157 
1158 
1160 {
1161  AddState(); // 0: dummy
1162  AddState(CRegEx::eTypeStart); // 1: begin of the input
1163 }
1164 
1165 
1166 void CRegExFSA::Add(const CRegEx& rx)
1167 {
1168  Create(rx, m_Str.size());
1169  m_Str.push_back(rx.m_Str);
1170 }
1171 
1172 
1173 void CRegExFSA::Add(const vector<unique_ptr<CRegEx>>& v)
1174 {
1175  if (!v.size()) {
1176  return;
1177  }
1178  vector<unique_ptr<CRegExFSA>> w;
1179  for (auto& rx : v) {
1180  unique_ptr<CRegExFSA> p(new CRegExFSA);
1181  p->Create(*rx, m_Str.size());
1182  m_Str.push_back(rx->m_Str);
1183  w.push_back(std::move(p));
1184  }
1185  while (w.size() > 1) {
1186  size_t h = (w.size() + 1) / 2;
1187  for (size_t i = 0, j = h; j < w.size(); i++, j++) {
1188  w[i]->Merge(std::move(w[j]));
1189  }
1190  w.resize(h);
1191  }
1192  Merge(std::move(w[0]));
1193 }
1194 
1195 
1196 void CRegExFSA::Merge(unique_ptr<CRegExFSA> fsa)
1197 {
1198  size_t shift = m_States.size();
1199  for (auto& state : fsa->m_States) {
1200  for (auto& trans : state->m_Trans) {
1201  trans += shift;
1202  }
1203  m_States.push_back(std::move(state));
1204  }
1205  Short(0, shift);
1206  Short(shift, 0);
1207  Short(1, shift + 1);
1208  Short(shift + 1, 1);
1209  Refine();
1210 }
1211 
1212 
1213 void CRegExFSA::Create(const CRegEx& rx, size_t emit)
1214 {
1215  if (!rx.m_RegX) {
1216  throw "Invalid Regular Expression: " + rx.m_Str + " -- " + rx.m_Err;
1217  }
1218  Short(0, AddState(CRegEx::eTypeStop));
1219 
1220  size_t from = AddState();
1221  size_t to = AddState();
1222  Emit(to, emit);
1223  try {
1224  rx.m_RegX->Render(*this, from, to);
1225  }
1226  catch (const string& s) {
1227  Refine();
1228  throw "Unsupported Regular Expression: " + rx.m_Str + " -- " + s;
1229  }
1230  Short(0, from);
1231  Refine();
1232 }
1233 
1234 
1236 {
1237  TStates DSA;
1238  vector<size_t> Set;
1239  vector<size_t> Queue;
1240  TNodeSetMap NM;
1241  TNodeSetList NL;
1242  TNodeSet NS;
1243  TScratch VV, HH;
1244  DSA.push_back(unique_ptr<CRegExState>(new CRegExState));
1245  NS.push_back(TNode(0, CRegEx::eTypePass));
1246  NL.push_back(NS);
1247 
1248  Push(0, VV[0], HH[0]);
1249  Push(1, VV[0], HH[0]);
1250  Collect(VV, CRegEx::eTypeStart, m_States, DSA, NM, NL, NS, HH);
1251 
1252  for (size_t n = 1; n < DSA.size(); n++) {
1253  if (DSA[n]->m_Type != CRegEx::eTypeStop) {
1254  for (unsigned c = 0; c < 256; c++) {
1255  Extend(n, (char)c, m_States, DSA, NM, NL, NS, VV, HH);
1256  }
1257  }
1258  }
1259  m_States.swap(DSA);
1260 }
1261 
1262 
1264 {
1265  NS.clear();
1266  vector<size_t>& V0 = VV[0];
1267  vector<size_t>& V1 = VV[1];
1268  vector<size_t>& V2 = VV[2];
1269  vector<size_t>& V3 = VV[3];
1270  vector<size_t>& H0 = HH[0];
1271  vector<size_t>& H1 = HH[1];
1272  vector<size_t>& H2 = HH[2];
1273  vector<size_t>& H3 = HH[3];
1274 
1275  for (size_t i = 0; i < V0.size(); ++i) {
1276  size_t n = V0[i];
1277  for (size_t s : src[n]->m_Short) {
1278  if (src[s]->m_Type & ttt) {
1279  auto t = src[s]->m_Type;
1280  if (t & CRegEx::eTypeCheck) {
1281  if (t & CRegEx::eTypeToNoWord) {
1282  Push(s, V1, H1);
1283  }
1284  if (t & CRegEx::eTypeToWord) {
1285  Push(s, V2, H2);
1286  }
1287  if (t & CRegEx::eTypeToStop) {
1288  Push(s, V3, H3);
1289  }
1290  }
1291  else {
1292  Push(s, V0, H0);
1293  }
1294  }
1295  }
1296  }
1297  for (size_t i = 0; i < V1.size(); ++i) {
1298  size_t n = V1[i];
1299  for (size_t s : src[n]->m_Short) {
1300  if (src[s]->m_Type & ttt) {
1301  auto t = src[s]->m_Type;
1302  if ((t & CRegEx::eTypeToNoWord) || !(t & CRegEx::eTypeCheck)) {
1303  if (!In(s, H0)) {
1304  Push(s, V1, H1);
1305  }
1306  }
1307  }
1308  }
1309  }
1310  for (size_t i = 0; i < V2.size(); ++i) {
1311  size_t n = V2[i];
1312  for (size_t s : src[n]->m_Short) {
1313  if (src[s]->m_Type & ttt) {
1314  auto t = src[s]->m_Type;
1315  if ((t & CRegEx::eTypeToWord) || !(t & CRegEx::eTypeCheck)) {
1316  if (!In(s, H0)) {
1317  Push(s, V2, H2);
1318  }
1319  }
1320  }
1321  }
1322  }
1323  for (size_t i = 0; i < V3.size(); ++i) {
1324  size_t n = V3[i];
1325  for (size_t s : src[n]->m_Short) {
1326  if (src[s]->m_Type & ttt) {
1327  auto t = src[s]->m_Type;
1328  if ((t & CRegEx::eTypeToStop) || !(t & CRegEx::eTypeCheck)) {
1329  if (!In(s, H0)) {
1330  Push(s, V3, H3);
1331  }
1332  }
1333  }
1334  }
1335  }
1336  for (size_t n : H0) { // these are already ordered by Push()
1337  NS.push_back(TNode(n, CRegEx::eTypeNone));
1338  }
1339  for (size_t n : H1) {
1340  NS.push_back(TNode(n, CRegEx::eTypeNoWord));
1341  }
1342  for (size_t n : H2) {
1343  NS.push_back(TNode(n, CRegEx::eTypeWord));
1344  }
1345  for (size_t n : H3) {
1346  NS.push_back(TNode(n, CRegEx::eTypeStop));
1347  }
1348  auto I = NM.find(NS);
1349  if (I != NM.end()) {
1350  dest[I->second]->m_Type |= ttt;
1351  return I->second;
1352  }
1353  size_t x = NL.size();
1354  NM[NS] = x;
1355  NL.push_back(NS);
1356  dest.push_back(unique_ptr<CRegExState>(new CRegExState(ttt)));
1357  for (size_t n : H0) {
1358  dest[x]->m_Emit.insert(src[n]->m_Emit.begin(), src[n]->m_Emit.end());
1359  }
1360  for (size_t n : H1) {
1361  dest[x]->m_Forward1.insert(src[n]->m_Emit.begin(), src[n]->m_Emit.end());
1362  }
1363  for (size_t n : H2) {
1364  dest[x]->m_Forward2.insert(src[n]->m_Emit.begin(), src[n]->m_Emit.end());
1365  }
1366  for (size_t n : H3) {
1367  dest[x]->m_Forward3.insert(src[n]->m_Emit.begin(), src[n]->m_Emit.end());
1368  }
1369  return x;
1370 }
1371 
1372 
1373 void CRegExFSA::Extend(size_t x, unsigned char c, TStates& src, TStates& dest, TNodeSetMap& NM, TNodeSetList& NL, TNodeSet& NS, TScratch& VV, TScratch& HH)
1374 {
1375  for (auto& a : VV) { // that is cheaper than creating new arrays
1376  a.clear();
1377  }
1378  for (auto& a : HH) { // that is cheaper than creating new arrays
1379  a.clear();
1380  }
1381  Push(0, VV[0], HH[0]);
1383  for (auto p : NL[x]) {
1384  if (!p.second || p.second == ttt) { // if (p.first) - check performance with and without
1385  Push(src[p.first]->m_Trans[c], VV[0], HH[0]);
1386  }
1387  }
1388  size_t d = Collect(VV, ttt, src, dest, NM, NL, NS, HH);
1389  dest[x]->m_Trans[c] = d;
1390  if (ttt == CRegEx::eTypeNoWord) {
1391  dest[d]->m_Emit.insert(dest[x]->m_Forward1.begin(), dest[x]->m_Forward1.end());
1392  }
1393  if (ttt == CRegEx::eTypeWord) {
1394  dest[d]->m_Emit.insert(dest[x]->m_Forward2.begin(), dest[x]->m_Forward2.end());
1395  }
1396  if (ttt == CRegEx::eTypeStop) {
1397  dest[d]->m_Emit.insert(dest[x]->m_Forward3.begin(), dest[x]->m_Forward3.end());
1398  }
1399 }
1400 
1401 
1402 static string QuoteDot(const string& s, bool space = false)
1403 {
1404  string out;
1405  for (size_t i = 0; i < s.size(); i++) {
1406  switch (s[i]) {
1407  case ' ':
1408  out += space ? "&#9251;" : " ";
1409  break;
1410  case '\"':
1411  out += "\\\"";
1412  break;
1413  case '\\':
1414  out += "\\\\";
1415  break;
1416  case 0x7f: // DEL
1417  out += "&#9249;";
1418  break;
1419  default:
1420  if ((unsigned char)s[i] < 32) { // non-printable characters
1421  out += "&#" + to_string(9216 + s[i]) + ";";
1422  }
1423  else if ((unsigned char)s[i] > 0x7f) { // second half of ASCII table
1424  out += "&#" + to_string(912 + (unsigned char)s[i]) + ";";
1425  }
1426  else {
1427  out += s[i];
1428  }
1429  }
1430  }
1431  return out;
1432 }
1433 
1434 
1435 static string Pack(const string& s)
1436 {
1437  string out;
1438  char from = 0;
1439  char to = 0;
1440  for (size_t i = 0; i < s.size(); i++) {
1441  if ((s[i] >= '0' && s[i] <= '9') || (s[i] >= 'A' && s[i] <= 'Z') || (s[i] >= 'a' && s[i] <= 'z')) {
1442  if (!from) {
1443  from = s[i];
1444  to = from;
1445  continue;
1446  }
1447  else if (s[i] == to + 1) {
1448  to++;
1449  continue;
1450  }
1451  else {
1452  out += from;
1453  if (to > from + 1) {
1454  out += '-';
1455  }
1456  if (to > from) {
1457  out += to;
1458  }
1459  from = 0;
1460  i--;
1461  continue;
1462  }
1463  }
1464  if (from) {
1465  out += from;
1466  if (to > from + 1) {
1467  out += '-';
1468  }
1469  if (to > from) {
1470  out += to;
1471  }
1472  from = 0;
1473  }
1474  out += s[i];
1475  }
1476  if (from) {
1477  out += from;
1478  if (to > from + 1) {
1479  out += '-';
1480  }
1481  if (to > from) {
1482  out += to;
1483  }
1484  from = 0;
1485  }
1486  return out;
1487 }
1488 
1489 
1490 void CRegExFSA::GenerateDotGraph(ostream& out) const // Dump in DOT format (http://www.graphviz.org/)
1491 {
1492  out << "digraph {\n";
1493  for (size_t n = 1; n < m_States.size(); ++n) {
1494  string label = to_string(n) + ": ";
1495  if (CRegEx::eTypePass == (m_States[n]->m_Type & CRegEx::eTypePass)) {
1496  label += "*";
1497  }
1498  else {
1499  if (m_States[n]->m_Type & CRegEx::eTypeStart) {
1500  label += "^";
1501  }
1502  if (m_States[n]->m_Type & CRegEx::eTypeWord) {
1503  label += "w";
1504  }
1505  if (m_States[n]->m_Type & CRegEx::eTypeNoWord) {
1506  label += "#";
1507  }
1508  if (m_States[n]->m_Type & CRegEx::eTypeStop) {
1509  label += "$";
1510  }
1511  }
1512  if (m_States[n]->m_Type & CRegEx::eTypeCheck) {
1513  label += " : ";
1514  if (m_States[n]->m_Type & CRegEx::eTypeToWord) {
1515  label += "w";
1516  }
1517  if (m_States[n]->m_Type & CRegEx::eTypeToNoWord) {
1518  label += "#";
1519  }
1520  if (m_States[n]->m_Type & CRegEx::eTypeToStop) {
1521  label += "$";
1522  }
1523  }
1524  if (!m_States[n]->m_Emit.empty()) {
1525  label += "\\nfound:";
1526  bool first = true;
1527  for (size_t e : m_States[n]->m_Emit) {
1528  if (!first) {
1529  label += ",";
1530  }
1531  first = false;
1532  label += " &#8220;" + QuoteDot(m_Str[e]) + "&#8221;";
1533  }
1534  }
1535  out << n << " [label=\"" << label << "\"]\n";
1536  for (size_t t : m_States[n]->m_Short) {
1537  out << n << " -> " << t << " [style=dotted]\n";
1538  }
1539  if (m_States[n]->m_Type != CRegEx::eTypeStop) {
1540  map<size_t, string> arrows;
1541  for (unsigned m = 1; m < 256; m++) {
1542  arrows[m_States[n]->m_Trans[m]] += char(m);
1543  }
1544  arrows[m_States[n]->m_Trans[0]] += "&#9216;";
1545  size_t max_len = 0;
1546  size_t other;
1547  for (auto &A : arrows) {
1548  A.second = Pack(A.second);
1549  if (A.second.length() > max_len) {
1550  other = A.first;
1551  max_len = A.second.length();
1552  }
1553  }
1554  arrows[other] = "...";
1555  for (auto &A : arrows) {
1556  out << n << " -> " << A.first << " [label=\"" << QuoteDot(A.second, true) << "\"]\n";
1557  }
1558  }
1559  }
1560  out << "}\n";
1561 }
1562 
1563 
1565 {
1566  out << "// Input from the outer code: const unsigned char* p;\n//\n\n const unsigned char* _p = p;\n";
1567  for (size_t n = 1; n < m_States.size(); ++n) {
1568  if (n > 1) {
1569  out << "_" << n << ":\n";
1570  }
1571  for (auto e : m_States[n]->m_Emit) {
1572  out << " if (_FSM_REPORT(" << e << ", p - _p)) return; // " << m_Str[e] << "\n";
1573  }
1574  if (m_States[n]->m_Type & CRegEx::eTypeStop) {
1575  out << " return;\n";
1576  continue;
1577  }
1578  if (n > 1) {
1579  out << " ++p;\n";
1580  }
1581  out << " switch (*p) {\n";
1582  map<size_t, string> arrows;
1583  for (unsigned m = 0; m < 256; m++) {
1584  arrows[m_States[n]->m_Trans[m]] += char(m); // it's ok for an std::string to have a null character inside!
1585  }
1586  size_t max_len = 0;
1587  size_t other = 0;
1588  for (auto A = arrows.begin(); A != arrows.end(); ++A) {
1589  if (A->second.length() > max_len) {
1590  other = A->first;
1591  max_len = A->second.length();
1592  }
1593  }
1594  for (auto A = arrows.begin(); A != arrows.end(); ++A) {
1595  if (A->first != other) {
1596  for (auto p = A->second.begin(); p != A->second.end(); ++p) {
1597  out << " case ";
1598  if (*p == '\'' || *p == '\"' || *p == '\\') {
1599  out << "\'\\" << *p << "\':\n";
1600  }
1601  else if (*p >= 32 && *p < 127) {
1602  out << "\'" << *p << "\':\n";
1603  }
1604  else {
1605  out << (unsigned)*p << ":\n";
1606  }
1607  }
1608  out << " goto _" << A->first << ";\n";
1609  }
1610  }
1611  out << " default:\n";
1612  out << " goto _" << other << ";\n";
1613  out << " }\n";
1614  }
1615 }
1616 
1618 {
1619  uint64_t m_current = 0;
1620  size_t m_index = 0;
1621 
1622  void Out1(std::ostream& out, bool bit)
1623  {
1624  out << (bit ? "1" : "0") << ",";
1625  m_index++;
1626  if (m_index == 32) {
1627  m_index = 0;
1628  out << "\n";
1629  } else {
1630  out << " ";
1631  }
1632  }
1633  void Out64(std::ostream& out, bool bit)
1634  {
1635  if (bit) {
1636  m_current |= (1ULL << m_index);
1637  }
1638  m_index++;
1639  if (m_index == 64) {
1640  out << " 0x" << NStr::NumericToString(m_current, 0, 16) << "ULL,\n";
1641  m_index = 0;
1642  m_current = 0;
1643  }
1644  }
1645  void Final64(std::ostream& out) {
1646  if (m_index)
1647  out << " 0x" << NStr::NumericToString(m_current, 0, 16) << "ULL";
1648  }
1649 };
1650 
1652 {
1653  size_t max_vec_size = 0;
1654  size_t num_hits = 0;
1655  for (size_t n = 1; n < m_States.size(); n++) {
1656  if (m_States[n]->m_Emit.size())
1657  num_hits++;
1658  max_vec_size = std::max(max_vec_size, m_States[n]->m_Emit.size());
1659  }
1660 
1661  out << "NCBI_FSM_PREPARE(\n"
1662  << " " << m_States.size() - 1 << ", // states size \n"
1663  << " " << max_vec_size << ", // max vector size\n"
1664  << " " << num_hits << ", // num hits\n"
1665  << " " << (m_States.size() - 1 + 63)/64 << " // emit compacted size\n"
1666  << ")\n";
1667 
1668  out << "/*\n";
1669  #if 0
1670  out << "NCBI_FSM_EMIT = {\n"; // #define _FSM_EMIT static bool emit[]
1671  for (size_t n = 0; n < m_States.size() - 1; n++) {
1672  cout << (n ? n % 32 ? ", " : ",\n" : "") << (m_States[n + 1]->m_Emit.size() ? "1" : "0");
1673  }
1674  out << "\n};\n";
1675  #endif
1676 
1677  out << "NCBI_FSM_EMIT = {\n"; // #define _FSM_EMIT static bool emit[]
1678  {
1679  C64MaskOut mask64;
1680  for (size_t n = 1; n < m_States.size(); n++) {
1681  mask64.Out1(out, (m_States[n]->m_Emit.size()));
1682  }
1683  }
1684  out << "\n};\n";
1685  out << "*/\n";
1686 
1687  out << "NCBI_FSM_EMIT_COMPACT = {\n";
1688  {
1689  C64MaskOut mask64;
1690  for (size_t n = 1; n < m_States.size(); n++) {
1691  mask64.Out64(out, (m_States[n]->m_Emit.size()));
1692  }
1693  mask64.Final64(out);
1694  }
1695  out << "\n};\n";
1696 
1697  out << "/*\n";
1698  out << "NCBI_FSM_HITS = {\n"; // #define _FSM_HITS static map<size_t, vector<size_t>> hits
1699  size_t count = 0;
1700  for (size_t n = 0; n < m_States.size(); n++) {
1701  if (m_States[n]->m_Emit.size()) {
1702  count++;
1703  }
1704  }
1705  for (size_t n = 0; n < m_States.size(); n++) {
1706  if (m_States[n]->m_Emit.size()) {
1707  count--;
1708  out << "{ " << (n - 1) << ", { ";
1709  size_t i = 0;
1710  for (auto e : m_States[n]->m_Emit) {
1711  out << (i ? ", " : "") << e;
1712  i++;
1713  }
1714  out << " }}" << (count ? ", " : " ");
1715  for (auto e : m_States[n]->m_Emit) {
1716  out << " // " << e << ": " << m_Str[e];
1717  i++;
1718  }
1719  out << "\n";
1720  }
1721  }
1722  out << "};\n";
1723  out << "*/\n";
1724 
1725  out << "NCBI_FSM_HITS_1(" << num_hits << ") = {\n"; // #define _FSM_HITS static map<size_t, vector<size_t>> hits
1726  for (size_t n = 0; n < m_States.size(); n++) {
1727  if (m_States[n]->m_Emit.size()) {
1728  out << (n - 1) << ", // ";
1729  for (auto e : m_States[n]->m_Emit) {
1730  out << " " << e << ": " << m_Str[e];
1731  }
1732  out << "\n";
1733  }
1734  }
1735  out << "};\n";
1736 
1737  out << "NCBI_FSM_HITS_2(" << num_hits << ") = { {\n"; // #define _FSM_HITS static map<size_t, vector<size_t>> hits
1738  for (size_t n = 0; n < m_States.size(); n++) {
1739  if (m_States[n]->m_Emit.size()) {
1740  out << "{ ";
1741  for (auto e : m_States[n]->m_Emit) {
1742  out << e << ", ";
1743  }
1744  out << "}, //";
1745  for (auto e : m_States[n]->m_Emit) {
1746  out << " " << e << ": " << m_Str[e];
1747  }
1748  out << "\n";
1749  }
1750  }
1751  out << "} };\n";
1752 
1753  // #define _FSM_STATE(size) static size_t states[size]
1754  out << "NCBI_FSM_STATES = {\n";
1755  for (size_t n = 1; n < m_States.size(); n++) {
1756  out << "// " << (n - 1) << "\n";
1757  for (size_t i = 0; i < 256; i++) {
1758  out
1759  << (m_States[n]->m_Trans[i] ? m_States[n]->m_Trans[i] - 1 : 0)
1760  << (i % 32 == 31 ? ",\n" : ", ");
1761  }
1762  }
1763  out << "};\n";
1764 }
1765 
void GenerateSourceCode(ostream &out) const
Generate C code for the FSM search.
void AddPatterns(const vector< string > &patterns)
unique_ptr< CRegExFSA > m_FSM
std::function< void(size_t, size_t)> VoidCall2
std::function< bool(size_t, size_t)> BoolCall2
void AddPattern(const char *pattern, TFlags flags=0)
static string QuoteString(const string &str)
Quote special characters to insert string into regular expression.
void GenerateArrayMapData(ostream &out) const
Generate C++ array/map data.
std::function< void(size_t)> VoidCall1
void GenerateDotGraph(ostream &out) const
Generate graphical representation of the FSM in DOT format.
void Search(const char *input, VoidCall1 found_callback) const
std::function< bool(size_t)> BoolCall1
pair< size_t, CRegEx::EType > TNode
vector< TNode > TNodeSet
void GenerateArrayMapData(ostream &out) const
void Trans(size_t x, unsigned char c, size_t y)
void Add(const CRegEx &rx)
void GenerateSourceCode(ostream &out) const
void Create(const CRegEx &rx, size_t emit)
static size_t Collect(TScratch &VV, CRegEx::EType t, TStates &src, TStates &dest, TNodeSetMap &NM, TNodeSetList &NL, TNodeSet &NS, TScratch &HH)
void GenerateDotGraph(ostream &out) const
void Short(size_t x, size_t y)
static void Extend(size_t x, unsigned char c, TStates &src, TStates &dest, TNodeSetMap &NM, TNodeSetList &NL, TNodeSet &NS, TScratch &VV, TScratch &HH)
size_t AddState(unsigned char t=CRegEx::eTypePass)
void Merge(unique_ptr< CRegExFSA > fsa)
vector< TNodeSet > TNodeSetList
vector< unique_ptr< CRegExState > > TStates
unique_ptr< CRegX > x_ParseAtom()
unique_ptr< CRegX > x_ParsePlain()
unique_ptr< CRegX > x_ParseConcat()
void x_Consume(char c)
void x_ParseSquare(set< unsigned char > &t)
bool x_ParseRepeat(int &from, int &to, bool &lazy)
void x_Print(ostream &out) const
void x_ThrowError(const string msg, size_t pos, size_t len)
unsigned char x_ParseEscape()
unique_ptr< CRegX > x_ParseTerm()
int x_ParseHex(size_t len=0)
CMultipatternSearch::TFlags m_Flag
void x_ThrowUnexpectedCharacter()
int x_ParseDec(size_t len=0)
void x_ThrowUnexpectedEndOfLine()
unique_ptr< CRegX > m_RegX
static bool IsWordCharacter(unsigned char c)
void x_ParseOptions()
unique_ptr< CRegX > x_ParseSelect()
const_iterator begin() const
Definition: map.hpp:151
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
const_iterator begin() const
Definition: set.hpp:135
bool empty() const
Definition: set.hpp:133
const_iterator end() const
Definition: set.hpp:136
std::ofstream out("events_result.xml")
main entry point for tests
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static const char * str(char *buf, int n)
Definition: stats.c:84
#define HH(a, b, c, d, x, s)
Definition: md4.c:190
Uint8 uint64_t
string
Definition: cgiapp.hpp:687
unsigned int uintptr_t
Definition: ncbitype.h:197
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
static const char label[]
static int input()
int i
yy_size_t n
int len
static void InsertNoSpace(set< unsigned char > &t)
static void InsertNoAlpha(set< unsigned char > &t)
static void InsertAlpha(set< unsigned char > &t)
static void InsertSpace(set< unsigned char > &t)
static string QuoteDot(const string &s, bool space=false)
static void xMultiPatternSearch(const char *input, const CRegExFSA &fsa, CMultipatternSearch::BoolCall2 report)
static string Pack(const string &s)
static void InsertNoDigit(set< unsigned char > &t)
static void InsertDigit(set< unsigned char > &t)
USING_SCOPE(FSM)
ostream & operator<<(ostream &ostr, const CRegEx &regex)
range(_Ty, _Ty) -> range< _Ty >
unsigned int a
Definition: ncbi_localip.c:102
EIPRangeType t
Definition: ncbi_localip.c:101
T max(T x_, T y_)
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
static patstr * patterns
Definition: pcregrep.c:259
#define assert(x)
Definition: srv_diag.hpp:58
void Out1(std::ostream &out, bool bit)
void Final64(std::ostream &out)
void Out64(std::ostream &out, bool bit)
void Print(ostream &out, size_t off) const
void Render(CRegExFSA &fsa, size_t from, size_t to) const
void Render(CRegExFSA &fsa, size_t from, size_t to) const
void Print(ostream &out, size_t off) const
set< unsigned char > m_Set
bool IsCaseInsensitive() const
void Render(CRegExFSA &fsa, size_t from, size_t to) const
void Render(CRegExFSA &fsa, size_t from, size_t to) const
void Render(CRegExFSA &fsa, size_t from, size_t to) const
void Print(ostream &out, size_t off) const
void Render(CRegExFSA &fsa, size_t from, size_t to) const
static void DummyTrans(CRegExFSA &fsa, size_t x, unsigned char t)
static void PrintOffset(ostream &out, size_t off)
void Merge(wxMenu &menu_1, const wxMenu &menu_2)
merges all items form menu_2 into menu_1, preserving the structure if possible
Definition: wx_utils.cpp:579
Modified on Wed Jun 19 17:06:02 2024 by modify_doxy.py rev. 669887