NCBI C++ ToolKit
pcre_study.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8  Written by Philip Hazel
9  Copyright (c) 1997-2012 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15  * Redistributions of source code must retain the above copyright notice,
16  this list of conditions and the following disclaimer.
17 
18  * Redistributions in binary form must reproduce the above copyright
19  notice, this list of conditions and the following disclaimer in the
20  documentation and/or other materials provided with the distribution.
21 
22  * Neither the name of the University of Cambridge nor the names of its
23  contributors may be used to endorse or promote products derived from
24  this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
43 
44 
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48 
49 #include "pcre_internal.h"
50 
51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52 
53 /* Returns from set_start_bits() */
54 
56 
57 
58 
59 /*************************************************
60 * Find the minimum subject length for a group *
61 *************************************************/
62 
63 /* Scan a parenthesized group and compute the minimum length of subject that
64 is needed to match it. This is a lower bound; it does not mean there is a
65 string of that length that matches. In UTF8 mode, the result is in characters
66 rather than bytes.
67 
68 Arguments:
69  re compiled pattern block
70  code pointer to start of group (the bracket)
71  startcode pointer to start of the whole pattern's code
72  options the compiling options
73  recurses chain of recurse_check to catch mutual recursion
74  countptr pointer to call count (to catch over complexity)
75 
76 Returns: the minimum length
77  -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
78  -2 internal error (missing capturing bracket)
79  -3 internal error (opcode not listed)
80 */
81 
82 static int
84  const pcre_uchar *startcode, int options, recurse_check *recurses,
85  int *countptr)
86 {
87 int length = -1;
88 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
89 BOOL utf = (options & PCRE_UTF8) != 0;
90 BOOL had_recurse = FALSE;
91 recurse_check this_recurse;
92 register int branchlength = 0;
93 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
94 
95 if ((*countptr)++ > 1000) return -1; /* too complex */
96 
97 if (*code == OP_CBRA || *code == OP_SCBRA ||
98  *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
99 
100 /* Scan along the opcodes for this branch. If we get to the end of the
101 branch, check the length against that of the other branches. */
102 
103 for (;;)
104  {
105  int d, min;
106  pcre_uchar *cs, *ce;
107  register pcre_uchar op = *cc;
108 
109  switch (op)
110  {
111  case OP_COND:
112  case OP_SCOND:
113 
114  /* If there is only one branch in a condition, the implied branch has zero
115  length, so we don't add anything. This covers the DEFINE "condition"
116  automatically. */
117 
118  cs = cc + GET(cc, 1);
119  if (*cs != OP_ALT)
120  {
121  cc = cs + 1 + LINK_SIZE;
122  break;
123  }
124 
125  /* Otherwise we can fall through and treat it the same as any other
126  subpattern. */
127 
128  case OP_CBRA:
129  case OP_SCBRA:
130  case OP_BRA:
131  case OP_SBRA:
132  case OP_CBRAPOS:
133  case OP_SCBRAPOS:
134  case OP_BRAPOS:
135  case OP_SBRAPOS:
136  case OP_ONCE:
137  case OP_ONCE_NC:
138  d = find_minlength(re, cc, startcode, options, recurses, countptr);
139  if (d < 0) return d;
140  branchlength += d;
141  do cc += GET(cc, 1); while (*cc == OP_ALT);
142  cc += 1 + LINK_SIZE;
143  break;
144 
145  /* ACCEPT makes things far too complicated; we have to give up. */
146 
147  case OP_ACCEPT:
148  case OP_ASSERT_ACCEPT:
149  return -1;
150 
151  /* Reached end of a branch; if it's a ket it is the end of a nested
152  call. If it's ALT it is an alternation in a nested call. If it is END it's
153  the end of the outer call. All can be handled by the same code. If an
154  ACCEPT was previously encountered, use the length that was in force at that
155  time, and pass back the shortest ACCEPT length. */
156 
157  case OP_ALT:
158  case OP_KET:
159  case OP_KETRMAX:
160  case OP_KETRMIN:
161  case OP_KETRPOS:
162  case OP_END:
163  if (length < 0 || (!had_recurse && branchlength < length))
164  length = branchlength;
165  if (op != OP_ALT) return length;
166  cc += 1 + LINK_SIZE;
167  branchlength = 0;
168  had_recurse = FALSE;
169  break;
170 
171  /* Skip over assertive subpatterns */
172 
173  case OP_ASSERT:
174  case OP_ASSERT_NOT:
175  case OP_ASSERTBACK:
176  case OP_ASSERTBACK_NOT:
177  do cc += GET(cc, 1); while (*cc == OP_ALT);
178  /* Fall through */
179 
180  /* Skip over things that don't match chars */
181 
182  case OP_REVERSE:
183  case OP_CREF:
184  case OP_DNCREF:
185  case OP_RREF:
186  case OP_DNRREF:
187  case OP_DEF:
188  case OP_CALLOUT:
189  case OP_SOD:
190  case OP_SOM:
191  case OP_EOD:
192  case OP_EODN:
193  case OP_CIRC:
194  case OP_CIRCM:
195  case OP_DOLL:
196  case OP_DOLLM:
198  case OP_WORD_BOUNDARY:
199  cc += PRIV(OP_lengths)[*cc];
200  break;
201 
202  /* Skip over a subpattern that has a {0} or {0,x} quantifier */
203 
204  case OP_BRAZERO:
205  case OP_BRAMINZERO:
206  case OP_BRAPOSZERO:
207  case OP_SKIPZERO:
208  cc += PRIV(OP_lengths)[*cc];
209  do cc += GET(cc, 1); while (*cc == OP_ALT);
210  cc += 1 + LINK_SIZE;
211  break;
212 
213  /* Handle literal characters and + repetitions */
214 
215  case OP_CHAR:
216  case OP_CHARI:
217  case OP_NOT:
218  case OP_NOTI:
219  case OP_PLUS:
220  case OP_PLUSI:
221  case OP_MINPLUS:
222  case OP_MINPLUSI:
223  case OP_POSPLUS:
224  case OP_POSPLUSI:
225  case OP_NOTPLUS:
226  case OP_NOTPLUSI:
227  case OP_NOTMINPLUS:
228  case OP_NOTMINPLUSI:
229  case OP_NOTPOSPLUS:
230  case OP_NOTPOSPLUSI:
231  branchlength++;
232  cc += 2;
233 #ifdef SUPPORT_UTF
234  if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
235 #endif
236  break;
237 
238  case OP_TYPEPLUS:
239  case OP_TYPEMINPLUS:
240  case OP_TYPEPOSPLUS:
241  branchlength++;
242  cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
243  break;
244 
245  /* Handle exact repetitions. The count is already in characters, but we
246  need to skip over a multibyte character in UTF8 mode. */
247 
248  case OP_EXACT:
249  case OP_EXACTI:
250  case OP_NOTEXACT:
251  case OP_NOTEXACTI:
252  branchlength += GET2(cc,1);
253  cc += 2 + IMM2_SIZE;
254 #ifdef SUPPORT_UTF
255  if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
256 #endif
257  break;
258 
259  case OP_TYPEEXACT:
260  branchlength += GET2(cc,1);
261  cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
262  || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
263  break;
264 
265  /* Handle single-char non-literal matchers */
266 
267  case OP_PROP:
268  case OP_NOTPROP:
269  cc += 2;
270  /* Fall through */
271 
272  case OP_NOT_DIGIT:
273  case OP_DIGIT:
274  case OP_NOT_WHITESPACE:
275  case OP_WHITESPACE:
276  case OP_NOT_WORDCHAR:
277  case OP_WORDCHAR:
278  case OP_ANY:
279  case OP_ALLANY:
280  case OP_EXTUNI:
281  case OP_HSPACE:
282  case OP_NOT_HSPACE:
283  case OP_VSPACE:
284  case OP_NOT_VSPACE:
285  branchlength++;
286  cc++;
287  break;
288 
289  /* "Any newline" might match two characters, but it also might match just
290  one. */
291 
292  case OP_ANYNL:
293  branchlength += 1;
294  cc++;
295  break;
296 
297  /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
298  non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
299  appear, but leave the code, just in case.) */
300 
301  case OP_ANYBYTE:
302 #ifdef SUPPORT_UTF
303  if (utf) return -1;
304 #endif
305  branchlength++;
306  cc++;
307  break;
308 
309  /* For repeated character types, we have to test for \p and \P, which have
310  an extra two bytes of parameters. */
311 
312  case OP_TYPESTAR:
313  case OP_TYPEMINSTAR:
314  case OP_TYPEQUERY:
315  case OP_TYPEMINQUERY:
316  case OP_TYPEPOSSTAR:
317  case OP_TYPEPOSQUERY:
318  if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
319  cc += PRIV(OP_lengths)[op];
320  break;
321 
322  case OP_TYPEUPTO:
323  case OP_TYPEMINUPTO:
324  case OP_TYPEPOSUPTO:
325  if (cc[1 + IMM2_SIZE] == OP_PROP
326  || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
327  cc += PRIV(OP_lengths)[op];
328  break;
329 
330  /* Check a class for variable quantification */
331 
332  case OP_CLASS:
333  case OP_NCLASS:
334 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
335  case OP_XCLASS:
336  /* The original code caused an unsigned overflow in 64 bit systems,
337  so now we use a conditional statement. */
338  if (op == OP_XCLASS)
339  cc += GET(cc, 1);
340  else
341  cc += PRIV(OP_lengths)[OP_CLASS];
342 #else
343  cc += PRIV(OP_lengths)[OP_CLASS];
344 #endif
345 
346  switch (*cc)
347  {
348  case OP_CRPLUS:
349  case OP_CRMINPLUS:
350  case OP_CRPOSPLUS:
351  branchlength++;
352  /* Fall through */
353 
354  case OP_CRSTAR:
355  case OP_CRMINSTAR:
356  case OP_CRQUERY:
357  case OP_CRMINQUERY:
358  case OP_CRPOSSTAR:
359  case OP_CRPOSQUERY:
360  cc++;
361  break;
362 
363  case OP_CRRANGE:
364  case OP_CRMINRANGE:
365  case OP_CRPOSRANGE:
366  branchlength += GET2(cc,1);
367  cc += 1 + 2 * IMM2_SIZE;
368  break;
369 
370  default:
371  branchlength++;
372  break;
373  }
374  break;
375 
376  /* Backreferences and subroutine calls are treated in the same way: we find
377  the minimum length for the subpattern. A recursion, however, causes an
378  a flag to be set that causes the length of this branch to be ignored. The
379  logic is that a recursion can only make sense if there is another
380  alternation that stops the recursing. That will provide the minimum length
381  (when no recursion happens). A backreference within the group that it is
382  referencing behaves in the same way.
383 
384  If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
385  matches an empty string (by default it causes a matching failure), so in
386  that case we must set the minimum length to zero. */
387 
388  case OP_DNREF: /* Duplicate named pattern back reference */
389  case OP_DNREFI:
390  if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
391  {
392  int count = GET2(cc, 1+IMM2_SIZE);
393  pcre_uchar *slot = (pcre_uchar *)re +
394  re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
395  d = INT_MAX;
396  while (count-- > 0)
397  {
398  ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
399  if (cs == NULL) return -2;
400  do ce += GET(ce, 1); while (*ce == OP_ALT);
401  if (cc > cs && cc < ce) /* Simple recursion */
402  {
403  d = 0;
404  had_recurse = TRUE;
405  break;
406  }
407  else
408  {
409  recurse_check *r = recurses;
410  for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
411  if (r != NULL) /* Mutual recursion */
412  {
413  d = 0;
414  had_recurse = TRUE;
415  break;
416  }
417  else
418  {
419  int dd;
420  this_recurse.prev = recurses;
421  this_recurse.group = cs;
422  dd = find_minlength(re, cs, startcode, options, &this_recurse,
423  countptr);
424  if (dd < d) d = dd;
425  }
426  }
427  slot += re->name_entry_size;
428  }
429  }
430  else d = 0;
431  cc += 1 + 2*IMM2_SIZE;
432  goto REPEAT_BACK_REFERENCE;
433 
434  case OP_REF: /* Single back reference */
435  case OP_REFI:
436  if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
437  {
438  ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
439  if (cs == NULL) return -2;
440  do ce += GET(ce, 1); while (*ce == OP_ALT);
441  if (cc > cs && cc < ce) /* Simple recursion */
442  {
443  d = 0;
444  had_recurse = TRUE;
445  }
446  else
447  {
448  recurse_check *r = recurses;
449  for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
450  if (r != NULL) /* Mutual recursion */
451  {
452  d = 0;
453  had_recurse = TRUE;
454  }
455  else
456  {
457  this_recurse.prev = recurses;
458  this_recurse.group = cs;
459  d = find_minlength(re, cs, startcode, options, &this_recurse,
460  countptr);
461  }
462  }
463  }
464  else d = 0;
465  cc += 1 + IMM2_SIZE;
466 
467  /* Handle repeated back references */
468 
469  REPEAT_BACK_REFERENCE:
470  switch (*cc)
471  {
472  case OP_CRSTAR:
473  case OP_CRMINSTAR:
474  case OP_CRQUERY:
475  case OP_CRMINQUERY:
476  case OP_CRPOSSTAR:
477  case OP_CRPOSQUERY:
478  min = 0;
479  cc++;
480  break;
481 
482  case OP_CRPLUS:
483  case OP_CRMINPLUS:
484  case OP_CRPOSPLUS:
485  min = 1;
486  cc++;
487  break;
488 
489  case OP_CRRANGE:
490  case OP_CRMINRANGE:
491  case OP_CRPOSRANGE:
492  min = GET2(cc, 1);
493  cc += 1 + 2 * IMM2_SIZE;
494  break;
495 
496  default:
497  min = 1;
498  break;
499  }
500 
501  branchlength += min * d;
502  break;
503 
504  /* We can easily detect direct recursion, but not mutual recursion. This is
505  caught by a recursion depth count. */
506 
507  case OP_RECURSE:
508  cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
509  do ce += GET(ce, 1); while (*ce == OP_ALT);
510  if (cc > cs && cc < ce) /* Simple recursion */
511  had_recurse = TRUE;
512  else
513  {
514  recurse_check *r = recurses;
515  for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
516  if (r != NULL) /* Mutual recursion */
517  had_recurse = TRUE;
518  else
519  {
520  this_recurse.prev = recurses;
521  this_recurse.group = cs;
522  branchlength += find_minlength(re, cs, startcode, options,
523  &this_recurse, countptr);
524  }
525  }
526  cc += 1 + LINK_SIZE;
527  break;
528 
529  /* Anything else does not or need not match a character. We can get the
530  item's length from the table, but for those that can match zero occurrences
531  of a character, we must take special action for UTF-8 characters. As it
532  happens, the "NOT" versions of these opcodes are used at present only for
533  ASCII characters, so they could be omitted from this list. However, in
534  future that may change, so we include them here so as not to leave a
535  gotcha for a future maintainer. */
536 
537  case OP_UPTO:
538  case OP_UPTOI:
539  case OP_NOTUPTO:
540  case OP_NOTUPTOI:
541  case OP_MINUPTO:
542  case OP_MINUPTOI:
543  case OP_NOTMINUPTO:
544  case OP_NOTMINUPTOI:
545  case OP_POSUPTO:
546  case OP_POSUPTOI:
547  case OP_NOTPOSUPTO:
548  case OP_NOTPOSUPTOI:
549 
550  case OP_STAR:
551  case OP_STARI:
552  case OP_NOTSTAR:
553  case OP_NOTSTARI:
554  case OP_MINSTAR:
555  case OP_MINSTARI:
556  case OP_NOTMINSTAR:
557  case OP_NOTMINSTARI:
558  case OP_POSSTAR:
559  case OP_POSSTARI:
560  case OP_NOTPOSSTAR:
561  case OP_NOTPOSSTARI:
562 
563  case OP_QUERY:
564  case OP_QUERYI:
565  case OP_NOTQUERY:
566  case OP_NOTQUERYI:
567  case OP_MINQUERY:
568  case OP_MINQUERYI:
569  case OP_NOTMINQUERY:
570  case OP_NOTMINQUERYI:
571  case OP_POSQUERY:
572  case OP_POSQUERYI:
573  case OP_NOTPOSQUERY:
574  case OP_NOTPOSQUERYI:
575 
576  cc += PRIV(OP_lengths)[op];
577 #ifdef SUPPORT_UTF
578  if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
579 #endif
580  break;
581 
582  /* Skip these, but we need to add in the name length. */
583 
584  case OP_MARK:
585  case OP_PRUNE_ARG:
586  case OP_SKIP_ARG:
587  case OP_THEN_ARG:
588  cc += PRIV(OP_lengths)[op] + cc[1];
589  break;
590 
591  /* The remaining opcodes are just skipped over. */
592 
593  case OP_CLOSE:
594  case OP_COMMIT:
595  case OP_FAIL:
596  case OP_PRUNE:
597  case OP_SET_SOM:
598  case OP_SKIP:
599  case OP_THEN:
600  cc += PRIV(OP_lengths)[op];
601  break;
602 
603  /* This should not occur: we list all opcodes explicitly so that when
604  new ones get added they are properly considered. */
605 
606  default:
607  return -3;
608  }
609  }
610 /* Control never gets here */
611 }
612 
613 
614 
615 /*************************************************
616 * Set a bit and maybe its alternate case *
617 *************************************************/
618 
619 /* Given a character, set its first byte's bit in the table, and also the
620 corresponding bit for the other version of a letter if we are caseless. In
621 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
622 when Unicode property support is available.
623 
624 Arguments:
625  start_bits points to the bit map
626  p points to the character
627  caseless the caseless flag
628  cd the block with char table pointers
629  utf TRUE for UTF-8 / UTF-16 / UTF-32 mode
630 
631 Returns: pointer after the character
632 */
633 
634 static const pcre_uchar *
635 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
636  compile_data *cd, BOOL utf)
637 {
638 pcre_uint32 c = *p;
639 
640 #ifdef COMPILE_PCRE8
641 SET_BIT(c);
642 
643 #ifdef SUPPORT_UTF
644 if (utf && c > 127)
645  {
646  GETCHARINC(c, p);
647 #ifdef SUPPORT_UCP
648  if (caseless)
649  {
650  pcre_uchar buff[6];
651  c = UCD_OTHERCASE(c);
652  (void)PRIV(ord2utf)(c, buff);
653  SET_BIT(buff[0]);
654  }
655 #endif /* Not SUPPORT_UCP */
656  return p;
657  }
658 #else /* Not SUPPORT_UTF */
659 (void)(utf); /* Stops warning for unused parameter */
660 #endif /* SUPPORT_UTF */
661 
662 /* Not UTF-8 mode, or character is less than 127. */
663 
664 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
665 return p + 1;
666 #endif /* COMPILE_PCRE8 */
667 
668 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
669 if (c > 0xff)
670  {
671  c = 0xff;
672  caseless = FALSE;
673  }
674 SET_BIT(c);
675 
676 #ifdef SUPPORT_UTF
677 if (utf && c > 127)
678  {
679  GETCHARINC(c, p);
680 #ifdef SUPPORT_UCP
681  if (caseless)
682  {
683  c = UCD_OTHERCASE(c);
684  if (c > 0xff)
685  c = 0xff;
686  SET_BIT(c);
687  }
688 #endif /* SUPPORT_UCP */
689  return p;
690  }
691 #else /* Not SUPPORT_UTF */
692 (void)(utf); /* Stops warning for unused parameter */
693 #endif /* SUPPORT_UTF */
694 
695 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
696 return p + 1;
697 #endif
698 }
699 
700 
701 
702 /*************************************************
703 * Set bits for a positive character type *
704 *************************************************/
705 
706 /* This function sets starting bits for a character type. In UTF-8 mode, we can
707 only do a direct setting for bytes less than 128, as otherwise there can be
708 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
709 environment, the tables will only recognize ASCII characters anyway, but in at
710 least one Windows environment, some higher bytes bits were set in the tables.
711 So we deal with that case by considering the UTF-8 encoding.
712 
713 Arguments:
714  start_bits the starting bitmap
715  cbit type the type of character wanted
716  table_limit 32 for non-UTF-8; 16 for UTF-8
717  cd the block with char table pointers
718 
719 Returns: nothing
720 */
721 
722 static void
723 set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
724  compile_data *cd)
725 {
726 register pcre_uint32 c;
727 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
728 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
729 if (table_limit == 32) return;
730 for (c = 128; c < 256; c++)
731  {
732  if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
733  {
734  pcre_uchar buff[6];
735  (void)PRIV(ord2utf)(c, buff);
736  SET_BIT(buff[0]);
737  }
738  }
739 #endif
740 }
741 
742 
743 /*************************************************
744 * Set bits for a negative character type *
745 *************************************************/
746 
747 /* This function sets starting bits for a negative character type such as \D.
748 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
749 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
750 Unlike in the positive case, where we can set appropriate starting bits for
751 specific high-valued UTF-8 characters, in this case we have to set the bits for
752 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
753 0xc0 (192) for simplicity.
754 
755 Arguments:
756  start_bits the starting bitmap
757  cbit type the type of character wanted
758  table_limit 32 for non-UTF-8; 16 for UTF-8
759  cd the block with char table pointers
760 
761 Returns: nothing
762 */
763 
764 static void
765 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
766  compile_data *cd)
767 {
768 register pcre_uint32 c;
769 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
770 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
771 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
772 #endif
773 }
774 
775 
776 
777 /*************************************************
778 * Create bitmap of starting bytes *
779 *************************************************/
780 
781 /* This function scans a compiled unanchored expression recursively and
782 attempts to build a bitmap of the set of possible starting bytes. As time goes
783 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
784 useful for parenthesized groups in patterns such as (a*)b where the group
785 provides some optional starting bytes but scanning must continue at the outer
786 level to find at least one mandatory byte. At the outermost level, this
787 function fails unless the result is SSB_DONE.
788 
789 Arguments:
790  code points to an expression
791  start_bits points to a 32-byte table, initialized to 0
792  utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
793  cd the block with char table pointers
794 
795 Returns: SSB_FAIL => Failed to find any starting bytes
796  SSB_DONE => Found mandatory starting bytes
797  SSB_CONTINUE => Found optional starting bytes
798  SSB_UNKNOWN => Hit an unrecognized opcode
799 */
800 
801 static int
802 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
803  compile_data *cd)
804 {
805 register pcre_uint32 c;
806 int yield = SSB_DONE;
807 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
808 int table_limit = utf? 16:32;
809 #else
810 int table_limit = 32;
811 #endif
812 
813 #if 0
814 /* ========================================================================= */
815 /* The following comment and code was inserted in January 1999. In May 2006,
816 when it was observed to cause compiler warnings about unused values, I took it
817 out again. If anybody is still using OS/2, they will have to put it back
818 manually. */
819 
820 /* This next statement and the later reference to dummy are here in order to
821 trick the optimizer of the IBM C compiler for OS/2 into generating correct
822 code. Apparently IBM isn't going to fix the problem, and we would rather not
823 disable optimization (in this module it actually makes a big difference, and
824 the pcre module can use all the optimization it can get). */
825 
826 volatile int dummy;
827 /* ========================================================================= */
828 #endif
829 
830 do
831  {
832  BOOL try_next = TRUE;
833  const pcre_uchar *tcode = code + 1 + LINK_SIZE;
834 
835  if (*code == OP_CBRA || *code == OP_SCBRA ||
836  *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
837 
838  while (try_next) /* Loop for items in this branch */
839  {
840  int rc;
841 
842  switch(*tcode)
843  {
844  /* If we reach something we don't understand, it means a new opcode has
845  been created that hasn't been added to this code. Hopefully this problem
846  will be discovered during testing. */
847 
848  default:
849  return SSB_UNKNOWN;
850 
851  /* Fail for a valid opcode that implies no starting bits. */
852 
853  case OP_ACCEPT:
854  case OP_ASSERT_ACCEPT:
855  case OP_ALLANY:
856  case OP_ANY:
857  case OP_ANYBYTE:
858  case OP_CIRC:
859  case OP_CIRCM:
860  case OP_CLOSE:
861  case OP_COMMIT:
862  case OP_COND:
863  case OP_CREF:
864  case OP_DEF:
865  case OP_DNCREF:
866  case OP_DNREF:
867  case OP_DNREFI:
868  case OP_DNRREF:
869  case OP_DOLL:
870  case OP_DOLLM:
871  case OP_END:
872  case OP_EOD:
873  case OP_EODN:
874  case OP_EXTUNI:
875  case OP_FAIL:
876  case OP_MARK:
877  case OP_NOT:
878  case OP_NOTEXACT:
879  case OP_NOTEXACTI:
880  case OP_NOTI:
881  case OP_NOTMINPLUS:
882  case OP_NOTMINPLUSI:
883  case OP_NOTMINQUERY:
884  case OP_NOTMINQUERYI:
885  case OP_NOTMINSTAR:
886  case OP_NOTMINSTARI:
887  case OP_NOTMINUPTO:
888  case OP_NOTMINUPTOI:
889  case OP_NOTPLUS:
890  case OP_NOTPLUSI:
891  case OP_NOTPOSPLUS:
892  case OP_NOTPOSPLUSI:
893  case OP_NOTPOSQUERY:
894  case OP_NOTPOSQUERYI:
895  case OP_NOTPOSSTAR:
896  case OP_NOTPOSSTARI:
897  case OP_NOTPOSUPTO:
898  case OP_NOTPOSUPTOI:
899  case OP_NOTPROP:
900  case OP_NOTQUERY:
901  case OP_NOTQUERYI:
902  case OP_NOTSTAR:
903  case OP_NOTSTARI:
904  case OP_NOTUPTO:
905  case OP_NOTUPTOI:
906  case OP_NOT_HSPACE:
907  case OP_NOT_VSPACE:
908  case OP_PRUNE:
909  case OP_PRUNE_ARG:
910  case OP_RECURSE:
911  case OP_REF:
912  case OP_REFI:
913  case OP_REVERSE:
914  case OP_RREF:
915  case OP_SCOND:
916  case OP_SET_SOM:
917  case OP_SKIP:
918  case OP_SKIP_ARG:
919  case OP_SOD:
920  case OP_SOM:
921  case OP_THEN:
922  case OP_THEN_ARG:
923  return SSB_FAIL;
924 
925  /* A "real" property test implies no starting bits, but the fake property
926  PT_CLIST identifies a list of characters. These lists are short, as they
927  are used for characters with more than one "other case", so there is no
928  point in recognizing them for OP_NOTPROP. */
929 
930  case OP_PROP:
931  if (tcode[1] != PT_CLIST) return SSB_FAIL;
932  {
933  const pcre_uint32 *p = PRIV(ucd_caseless_sets) + tcode[2];
934  while ((c = *p++) < NOTACHAR)
935  {
936 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
937  if (utf)
938  {
939  pcre_uchar buff[6];
940  (void)PRIV(ord2utf)(c, buff);
941  c = buff[0];
942  }
943 #endif
944  if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
945  }
946  }
947  try_next = FALSE;
948  break;
949 
950  /* We can ignore word boundary tests. */
951 
952  case OP_WORD_BOUNDARY:
954  tcode++;
955  break;
956 
957  /* If we hit a bracket or a positive lookahead assertion, recurse to set
958  bits from within the subpattern. If it can't find anything, we have to
959  give up. If it finds some mandatory character(s), we are done for this
960  branch. Otherwise, carry on scanning after the subpattern. */
961 
962  case OP_BRA:
963  case OP_SBRA:
964  case OP_CBRA:
965  case OP_SCBRA:
966  case OP_BRAPOS:
967  case OP_SBRAPOS:
968  case OP_CBRAPOS:
969  case OP_SCBRAPOS:
970  case OP_ONCE:
971  case OP_ONCE_NC:
972  case OP_ASSERT:
973  rc = set_start_bits(tcode, start_bits, utf, cd);
974  if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
975  if (rc == SSB_DONE) try_next = FALSE; else
976  {
977  do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
978  tcode += 1 + LINK_SIZE;
979  }
980  break;
981 
982  /* If we hit ALT or KET, it means we haven't found anything mandatory in
983  this branch, though we might have found something optional. For ALT, we
984  continue with the next alternative, but we have to arrange that the final
985  result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
986  return SSB_CONTINUE: if this is the top level, that indicates failure,
987  but after a nested subpattern, it causes scanning to continue. */
988 
989  case OP_ALT:
990  yield = SSB_CONTINUE;
991  try_next = FALSE;
992  break;
993 
994  case OP_KET:
995  case OP_KETRMAX:
996  case OP_KETRMIN:
997  case OP_KETRPOS:
998  return SSB_CONTINUE;
999 
1000  /* Skip over callout */
1001 
1002  case OP_CALLOUT:
1003  tcode += 2 + 2*LINK_SIZE;
1004  break;
1005 
1006  /* Skip over lookbehind and negative lookahead assertions */
1007 
1008  case OP_ASSERT_NOT:
1009  case OP_ASSERTBACK:
1010  case OP_ASSERTBACK_NOT:
1011  do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1012  tcode += 1 + LINK_SIZE;
1013  break;
1014 
1015  /* BRAZERO does the bracket, but carries on. */
1016 
1017  case OP_BRAZERO:
1018  case OP_BRAMINZERO:
1019  case OP_BRAPOSZERO:
1020  rc = set_start_bits(++tcode, start_bits, utf, cd);
1021  if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1022 /* =========================================================================
1023  See the comment at the head of this function concerning the next line,
1024  which was an old fudge for the benefit of OS/2.
1025  dummy = 1;
1026  ========================================================================= */
1027  do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1028  tcode += 1 + LINK_SIZE;
1029  break;
1030 
1031  /* SKIPZERO skips the bracket. */
1032 
1033  case OP_SKIPZERO:
1034  tcode++;
1035  do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1036  tcode += 1 + LINK_SIZE;
1037  break;
1038 
1039  /* Single-char * or ? sets the bit and tries the next item */
1040 
1041  case OP_STAR:
1042  case OP_MINSTAR:
1043  case OP_POSSTAR:
1044  case OP_QUERY:
1045  case OP_MINQUERY:
1046  case OP_POSQUERY:
1047  tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1048  break;
1049 
1050  case OP_STARI:
1051  case OP_MINSTARI:
1052  case OP_POSSTARI:
1053  case OP_QUERYI:
1054  case OP_MINQUERYI:
1055  case OP_POSQUERYI:
1056  tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1057  break;
1058 
1059  /* Single-char upto sets the bit and tries the next */
1060 
1061  case OP_UPTO:
1062  case OP_MINUPTO:
1063  case OP_POSUPTO:
1064  tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
1065  break;
1066 
1067  case OP_UPTOI:
1068  case OP_MINUPTOI:
1069  case OP_POSUPTOI:
1070  tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
1071  break;
1072 
1073  /* At least one single char sets the bit and stops */
1074 
1075  case OP_EXACT:
1076  tcode += IMM2_SIZE;
1077  /* Fall through */
1078  case OP_CHAR:
1079  case OP_PLUS:
1080  case OP_MINPLUS:
1081  case OP_POSPLUS:
1082  (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1083  try_next = FALSE;
1084  break;
1085 
1086  case OP_EXACTI:
1087  tcode += IMM2_SIZE;
1088  /* Fall through */
1089  case OP_CHARI:
1090  case OP_PLUSI:
1091  case OP_MINPLUSI:
1092  case OP_POSPLUSI:
1093  (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1094  try_next = FALSE;
1095  break;
1096 
1097  /* Special spacing and line-terminating items. These recognize specific
1098  lists of characters. The difference between VSPACE and ANYNL is that the
1099  latter can match the two-character CRLF sequence, but that is not
1100  relevant for finding the first character, so their code here is
1101  identical. */
1102 
1103  case OP_HSPACE:
1104  SET_BIT(CHAR_HT);
1106 #ifdef SUPPORT_UTF
1107  if (utf)
1108  {
1109 #ifdef COMPILE_PCRE8
1110  SET_BIT(0xC2); /* For U+00A0 */
1111  SET_BIT(0xE1); /* For U+1680, U+180E */
1112  SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1113  SET_BIT(0xE3); /* For U+3000 */
1114 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1115  SET_BIT(0xA0);
1116  SET_BIT(0xFF); /* For characters > 255 */
1117 #endif /* COMPILE_PCRE[8|16|32] */
1118  }
1119  else
1120 #endif /* SUPPORT_UTF */
1121  {
1122 #ifndef EBCDIC
1123  SET_BIT(0xA0);
1124 #endif /* Not EBCDIC */
1125 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1126  SET_BIT(0xFF); /* For characters > 255 */
1127 #endif /* COMPILE_PCRE[16|32] */
1128  }
1129  try_next = FALSE;
1130  break;
1131 
1132  case OP_ANYNL:
1133  case OP_VSPACE:
1134  SET_BIT(CHAR_LF);
1135  SET_BIT(CHAR_VT);
1136  SET_BIT(CHAR_FF);
1137  SET_BIT(CHAR_CR);
1138 #ifdef SUPPORT_UTF
1139  if (utf)
1140  {
1141 #ifdef COMPILE_PCRE8
1142  SET_BIT(0xC2); /* For U+0085 */
1143  SET_BIT(0xE2); /* For U+2028, U+2029 */
1144 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1145  SET_BIT(CHAR_NEL);
1146  SET_BIT(0xFF); /* For characters > 255 */
1147 #endif /* COMPILE_PCRE[8|16|32] */
1148  }
1149  else
1150 #endif /* SUPPORT_UTF */
1151  {
1152  SET_BIT(CHAR_NEL);
1153 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1154  SET_BIT(0xFF); /* For characters > 255 */
1155 #endif
1156  }
1157  try_next = FALSE;
1158  break;
1159 
1160  /* Single character types set the bits and stop. Note that if PCRE_UCP
1161  is set, we do not see these op codes because \d etc are converted to
1162  properties. Therefore, these apply in the case when only characters less
1163  than 256 are recognized to match the types. */
1164 
1165  case OP_NOT_DIGIT:
1166  set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1167  try_next = FALSE;
1168  break;
1169 
1170  case OP_DIGIT:
1171  set_type_bits(start_bits, cbit_digit, table_limit, cd);
1172  try_next = FALSE;
1173  break;
1174 
1175  /* The cbit_space table has vertical tab as whitespace; we no longer
1176  have to play fancy tricks because Perl added VT to its whitespace at
1177  release 5.18. PCRE added it at release 8.34. */
1178 
1179  case OP_NOT_WHITESPACE:
1180  set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1181  try_next = FALSE;
1182  break;
1183 
1184  case OP_WHITESPACE:
1185  set_type_bits(start_bits, cbit_space, table_limit, cd);
1186  try_next = FALSE;
1187  break;
1188 
1189  case OP_NOT_WORDCHAR:
1190  set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1191  try_next = FALSE;
1192  break;
1193 
1194  case OP_WORDCHAR:
1195  set_type_bits(start_bits, cbit_word, table_limit, cd);
1196  try_next = FALSE;
1197  break;
1198 
1199  /* One or more character type fudges the pointer and restarts, knowing
1200  it will hit a single character type and stop there. */
1201 
1202  case OP_TYPEPLUS:
1203  case OP_TYPEMINPLUS:
1204  case OP_TYPEPOSPLUS:
1205  tcode++;
1206  break;
1207 
1208  case OP_TYPEEXACT:
1209  tcode += 1 + IMM2_SIZE;
1210  break;
1211 
1212  /* Zero or more repeats of character types set the bits and then
1213  try again. */
1214 
1215  case OP_TYPEUPTO:
1216  case OP_TYPEMINUPTO:
1217  case OP_TYPEPOSUPTO:
1218  tcode += IMM2_SIZE; /* Fall through */
1219 
1220  case OP_TYPESTAR:
1221  case OP_TYPEMINSTAR:
1222  case OP_TYPEPOSSTAR:
1223  case OP_TYPEQUERY:
1224  case OP_TYPEMINQUERY:
1225  case OP_TYPEPOSQUERY:
1226  switch(tcode[1])
1227  {
1228  default:
1229  case OP_ANY:
1230  case OP_ALLANY:
1231  return SSB_FAIL;
1232 
1233  case OP_HSPACE:
1234  SET_BIT(CHAR_HT);
1236 #ifdef SUPPORT_UTF
1237  if (utf)
1238  {
1239 #ifdef COMPILE_PCRE8
1240  SET_BIT(0xC2); /* For U+00A0 */
1241  SET_BIT(0xE1); /* For U+1680, U+180E */
1242  SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1243  SET_BIT(0xE3); /* For U+3000 */
1244 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1245  SET_BIT(0xA0);
1246  SET_BIT(0xFF); /* For characters > 255 */
1247 #endif /* COMPILE_PCRE[8|16|32] */
1248  }
1249  else
1250 #endif /* SUPPORT_UTF */
1251 #ifndef EBCDIC
1252  SET_BIT(0xA0);
1253 #endif /* Not EBCDIC */
1254  break;
1255 
1256  case OP_ANYNL:
1257  case OP_VSPACE:
1258  SET_BIT(CHAR_LF);
1259  SET_BIT(CHAR_VT);
1260  SET_BIT(CHAR_FF);
1261  SET_BIT(CHAR_CR);
1262 #ifdef SUPPORT_UTF
1263  if (utf)
1264  {
1265 #ifdef COMPILE_PCRE8
1266  SET_BIT(0xC2); /* For U+0085 */
1267  SET_BIT(0xE2); /* For U+2028, U+2029 */
1268 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1269  SET_BIT(CHAR_NEL);
1270  SET_BIT(0xFF); /* For characters > 255 */
1271 #endif /* COMPILE_PCRE16 */
1272  }
1273  else
1274 #endif /* SUPPORT_UTF */
1275  SET_BIT(CHAR_NEL);
1276  break;
1277 
1278  case OP_NOT_DIGIT:
1279  set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1280  break;
1281 
1282  case OP_DIGIT:
1283  set_type_bits(start_bits, cbit_digit, table_limit, cd);
1284  break;
1285 
1286  /* The cbit_space table has vertical tab as whitespace; we no longer
1287  have to play fancy tricks because Perl added VT to its whitespace at
1288  release 5.18. PCRE added it at release 8.34. */
1289 
1290  case OP_NOT_WHITESPACE:
1291  set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1292  break;
1293 
1294  case OP_WHITESPACE:
1295  set_type_bits(start_bits, cbit_space, table_limit, cd);
1296  break;
1297 
1298  case OP_NOT_WORDCHAR:
1299  set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1300  break;
1301 
1302  case OP_WORDCHAR:
1303  set_type_bits(start_bits, cbit_word, table_limit, cd);
1304  break;
1305  }
1306 
1307  tcode += 2;
1308  break;
1309 
1310  /* Character class where all the information is in a bit map: set the
1311  bits and either carry on or not, according to the repeat count. If it was
1312  a negative class, and we are operating with UTF-8 characters, any byte
1313  with a value >= 0xc4 is a potentially valid starter because it starts a
1314  character with a value > 255. */
1315 
1316 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1317  case OP_XCLASS:
1318  if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0)
1319  return SSB_FAIL;
1320  /* All bits are set. */
1321  if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0)
1322  return SSB_FAIL;
1323 #endif
1324  /* Fall through */
1325 
1326  case OP_NCLASS:
1327 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1328  if (utf)
1329  {
1330  start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1331  memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1332  }
1333 #endif
1334 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1335  SET_BIT(0xFF); /* For characters > 255 */
1336 #endif
1337  /* Fall through */
1338 
1339  case OP_CLASS:
1340  {
1341  pcre_uint8 *map;
1342 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1343  map = NULL;
1344  if (*tcode == OP_XCLASS)
1345  {
1346  if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0)
1347  map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1);
1348  tcode += GET(tcode, 1);
1349  }
1350  else
1351 #endif
1352  {
1353  tcode++;
1354  map = (pcre_uint8 *)tcode;
1355  tcode += 32 / sizeof(pcre_uchar);
1356  }
1357 
1358  /* In UTF-8 mode, the bits in a bit map correspond to character
1359  values, not to byte values. However, the bit map we are constructing is
1360  for byte values. So we have to do a conversion for characters whose
1361  value is > 127. In fact, there are only two possible starting bytes for
1362  characters in the range 128 - 255. */
1363 
1364 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1365  if (map != NULL)
1366 #endif
1367  {
1368 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1369  if (utf)
1370  {
1371  for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1372  for (c = 128; c < 256; c++)
1373  {
1374  if ((map[c/8] & (1 << (c&7))) != 0)
1375  {
1376  int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1377  start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
1378  c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1379  }
1380  }
1381  }
1382  else
1383 #endif
1384  {
1385  /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1386  for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1387  }
1388  }
1389 
1390  /* Advance past the bit map, and act on what follows. For a zero
1391  minimum repeat, continue; otherwise stop processing. */
1392 
1393  switch (*tcode)
1394  {
1395  case OP_CRSTAR:
1396  case OP_CRMINSTAR:
1397  case OP_CRQUERY:
1398  case OP_CRMINQUERY:
1399  case OP_CRPOSSTAR:
1400  case OP_CRPOSQUERY:
1401  tcode++;
1402  break;
1403 
1404  case OP_CRRANGE:
1405  case OP_CRMINRANGE:
1406  case OP_CRPOSRANGE:
1407  if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1408  else try_next = FALSE;
1409  break;
1410 
1411  default:
1412  try_next = FALSE;
1413  break;
1414  }
1415  }
1416  break; /* End of bitmap class handling */
1417 
1418  } /* End of switch */
1419  } /* End of try_next loop */
1420 
1421  code += GET(code, 1); /* Advance to next branch */
1422  }
1423 while (*code == OP_ALT);
1424 return yield;
1425 }
1426 
1427 
1428 
1429 
1430 
1431 /*************************************************
1432 * Study a compiled expression *
1433 *************************************************/
1434 
1435 /* This function is handed a compiled expression that it must study to produce
1436 information that will speed up the matching. It returns a pcre[16]_extra block
1437 which then gets handed back to pcre_exec().
1438 
1439 Arguments:
1440  re points to the compiled expression
1441  options contains option bits
1442  errorptr points to where to place error messages;
1443  set NULL unless error
1444 
1445 Returns: pointer to a pcre[16]_extra block, with study_data filled in and
1446  the appropriate flags set;
1447  NULL on error or if no optimization possible
1448 */
1449 
1450 #if defined COMPILE_PCRE8
1452 pcre_study(const pcre *external_re, int options, const char **errorptr)
1453 #elif defined COMPILE_PCRE16
1455 pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1456 #elif defined COMPILE_PCRE32
1458 pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1459 #endif
1460 {
1461 int min;
1462 int count = 0;
1463 BOOL bits_set = FALSE;
1464 pcre_uint8 start_bits[32];
1465 PUBL(extra) *extra = NULL;
1466 pcre_study_data *study;
1467 const pcre_uint8 *tables;
1468 pcre_uchar *code;
1469 compile_data compile_block;
1470 const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1471 
1472 
1473 *errorptr = NULL;
1474 
1475 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1476  {
1477  *errorptr = "argument is not a compiled regular expression";
1478  return NULL;
1479  }
1480 
1481 if ((re->flags & PCRE_MODE) == 0)
1482  {
1483 #if defined COMPILE_PCRE8
1484  *errorptr = "argument not compiled in 8 bit mode";
1485 #elif defined COMPILE_PCRE16
1486  *errorptr = "argument not compiled in 16 bit mode";
1487 #elif defined COMPILE_PCRE32
1488  *errorptr = "argument not compiled in 32 bit mode";
1489 #endif
1490  return NULL;
1491  }
1492 
1493 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1494  {
1495  *errorptr = "unknown or incorrect option bit(s) set";
1496  return NULL;
1497  }
1498 
1499 code = (pcre_uchar *)re + re->name_table_offset +
1500  (re->name_count * re->name_entry_size);
1501 
1502 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1503 a multiline pattern that matches only at "line starts", there is no point in
1504 seeking a list of starting bytes. */
1505 
1506 if ((re->options & PCRE_ANCHORED) == 0 &&
1507  (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1508  {
1509  int rc;
1510 
1511  /* Set the character tables in the block that is passed around */
1512 
1513  tables = re->tables;
1514 
1515 #if defined COMPILE_PCRE8
1516  if (tables == NULL)
1517  (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1518  (void *)(&tables));
1519 #elif defined COMPILE_PCRE16
1520  if (tables == NULL)
1521  (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1522  (void *)(&tables));
1523 #elif defined COMPILE_PCRE32
1524  if (tables == NULL)
1525  (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1526  (void *)(&tables));
1527 #endif
1528 
1529  compile_block.lcc = tables + lcc_offset;
1530  compile_block.fcc = tables + fcc_offset;
1531  compile_block.cbits = tables + cbits_offset;
1532  compile_block.ctypes = tables + ctypes_offset;
1533 
1534  /* See if we can find a fixed set of initial characters for the pattern. */
1535 
1536  memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1537  rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1538  &compile_block);
1539  bits_set = rc == SSB_DONE;
1540  if (rc == SSB_UNKNOWN)
1541  {
1542  *errorptr = "internal error: opcode not recognized";
1543  return NULL;
1544  }
1545  }
1546 
1547 /* Find the minimum length of subject string. */
1548 
1549 switch(min = find_minlength(re, code, code, re->options, NULL, &count))
1550  {
1551  case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1552  case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1553  default: break;
1554  }
1555 
1556 /* If a set of starting bytes has been identified, or if the minimum length is
1557 greater than zero, or if JIT optimization has been requested, or if
1558 PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1559 pcre_study_data block. The study data is put in the latter, which is pointed to
1560 by the former, which may also get additional data set later by the calling
1561 program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1562 save it in a field for returning via the pcre_fullinfo() function so that if it
1563 becomes variable in the future, we don't have to change that code. */
1564 
1565 if (bits_set || min > 0 || (options & (
1566 #ifdef SUPPORT_JIT
1569 #endif
1570  PCRE_STUDY_EXTRA_NEEDED)) != 0)
1571  {
1572  extra = (PUBL(extra) *)(PUBL(malloc))
1573  (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1574  if (extra == NULL)
1575  {
1576  *errorptr = "failed to get memory";
1577  return NULL;
1578  }
1579 
1580  study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1581  extra->flags = PCRE_EXTRA_STUDY_DATA;
1582  extra->study_data = study;
1583 
1584  study->size = sizeof(pcre_study_data);
1585  study->flags = 0;
1586 
1587  /* Set the start bits always, to avoid unset memory errors if the
1588  study data is written to a file, but set the flag only if any of the bits
1589  are set, to save time looking when none are. */
1590 
1591  if (bits_set)
1592  {
1593  study->flags |= PCRE_STUDY_MAPPED;
1594  memcpy(study->start_bits, start_bits, sizeof(start_bits));
1595  }
1596  else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1597 
1598 #ifdef PCRE_DEBUG
1599  if (bits_set)
1600  {
1601  pcre_uint8 *ptr = start_bits;
1602  int i;
1603 
1604  printf("Start bits:\n");
1605  for (i = 0; i < 32; i++)
1606  printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1607  }
1608 #endif
1609 
1610  /* Always set the minlength value in the block, because the JIT compiler
1611  makes use of it. However, don't set the bit unless the length is greater than
1612  zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1613  checking the zero case. */
1614 
1615  if (min > 0)
1616  {
1617  study->flags |= PCRE_STUDY_MINLEN;
1618  study->minlength = min;
1619  }
1620  else study->minlength = 0;
1621 
1622  /* If JIT support was compiled and requested, attempt the JIT compilation.
1623  If no starting bytes were found, and the minimum length is zero, and JIT
1624  compilation fails, abandon the extra block and return NULL, unless
1625  PCRE_STUDY_EXTRA_NEEDED is set. */
1626 
1627 #ifdef SUPPORT_JIT
1628  extra->executable_jit = NULL;
1629  if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1630  PRIV(jit_compile)(re, extra, JIT_COMPILE);
1631  if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1632  PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1633  if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1634  PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1635 
1636  if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1637  (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1638  {
1639 #if defined COMPILE_PCRE8
1640  pcre_free_study(extra);
1641 #elif defined COMPILE_PCRE16
1642  pcre16_free_study(extra);
1643 #elif defined COMPILE_PCRE32
1644  pcre32_free_study(extra);
1645 #endif
1646  extra = NULL;
1647  }
1648 #endif
1649  }
1650 
1651 return extra;
1652 }
1653 
1654 
1655 /*************************************************
1656 * Free the study data *
1657 *************************************************/
1658 
1659 /* This function frees the memory that was obtained by pcre_study().
1660 
1661 Argument: a pointer to the pcre[16]_extra block
1662 Returns: nothing
1663 */
1664 
1665 #if defined COMPILE_PCRE8
1666 PCRE_EXP_DEFN void
1668 #elif defined COMPILE_PCRE16
1669 PCRE_EXP_DEFN void
1671 #elif defined COMPILE_PCRE32
1672 PCRE_EXP_DEFN void
1674 #endif
1675 {
1676 if (extra == NULL)
1677  return;
1678 #ifdef SUPPORT_JIT
1679 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1680  extra->executable_jit != NULL)
1681  PRIV(jit_free)(extra->executable_jit);
1682 #endif
1683 PUBL(free)(extra);
1684 }
1685 
1686 /* End of pcre_study.c */
static CBioSource dummy
Definition: map.hpp:338
#define NULL
Definition: ncbistd.hpp:225
int i
if(yy_accept[yy_current_state])
T min(T x_, T y_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
#define PCRE_STUDY_EXTRA_NEEDED
Definition: pcre.h:315
#define PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE
Definition: pcre.h:314
int pcre16_fullinfo(const pcre16 *, const pcre16_extra *, int, void *)
#define PCRE_UTF8
Definition: pcre.h:144
#define PCRE_EXTRA_STUDY_DATA
Definition: pcre.h:320
void pcre32_free_study(pcre32_extra *)
pcre32_extra * pcre32_study(const pcre32 *, int, const char **)
pcre16_extra * pcre16_study(const pcre16 *, int, const char **)
#define PCRE_ANCHORED
Definition: pcre.h:137
#define PCRE_EXTRA_EXECUTABLE_JIT
Definition: pcre.h:326
void pcre16_free_study(pcre16_extra *)
int pcre32_fullinfo(const pcre32 *, const pcre32_extra *, int, void *)
#define PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE
Definition: pcre.h:313
#define PCRE_JAVASCRIPT_COMPAT
Definition: pcre.h:172
#define PCRE_INFO_DEFAULT_TABLES
Definition: pcre.h:275
#define PCRE_STUDY_JIT_COMPILE
Definition: pcre.h:312
int pcre_fullinfo(const pcre *, const pcre_extra *, int, void *)
Definition: pcre_fullinfo.c:70
#define PCRE_MODE
#define XCL_HASPROP
@ JIT_PARTIAL_SOFT_COMPILE
@ JIT_COMPILE
@ JIT_PARTIAL_HARD_COMPILE
#define cbits_offset
#define REAL_PCRE
#define ctype_letter
#define PUBL(name)
#define cbit_space
#define cbit_digit
#define PCRE_EXP_DEFN
#define PT_CLIST
#define PCRE_STUDY_MAPPED
#define MAGIC_NUMBER
#define PUBLIC_STUDY_OPTIONS
#define XCL_NOT
#define NOTACHAR
unsigned char pcre_uint8
#define PCRE_CALL_CONVENTION
@ OP_STARI
@ OP_END
@ OP_NOTMINSTARI
@ OP_ANYNL
@ OP_SKIPZERO
@ OP_CHAR
@ OP_CRMINQUERY
@ OP_NOTPOSUPTOI
@ OP_SBRA
@ OP_ONCE
@ OP_NOTPROP
@ OP_NOTPLUS
@ OP_TYPEMINPLUS
@ OP_TYPEQUERY
@ OP_SBRAPOS
@ OP_SCOND
@ OP_ASSERTBACK
@ OP_CIRCM
@ OP_CLASS
@ OP_TYPEPLUS
@ OP_HSPACE
@ OP_NOT_WORDCHAR
@ OP_SKIP_ARG
@ OP_MINSTARI
@ OP_CRMINPLUS
@ OP_CRRANGE
@ OP_DOLL
@ OP_ONCE_NC
@ OP_BRAPOS
@ OP_ASSERT_NOT
@ OP_NOTSTARI
@ OP_NOT
@ OP_DNCREF
@ OP_ASSERT
@ OP_SET_SOM
@ OP_NOTMINPLUSI
@ OP_TYPEPOSSTAR
@ OP_TYPEMINUPTO
@ OP_TYPEPOSPLUS
@ OP_POSSTAR
@ OP_NOTUPTO
@ OP_TYPESTAR
@ OP_BRAMINZERO
@ OP_EXACTI
@ OP_PRUNE_ARG
@ OP_NOTPLUSI
@ OP_NOTQUERYI
@ OP_THEN_ARG
@ OP_CRQUERY
@ OP_ASSERTBACK_NOT
@ OP_RREF
@ OP_DNRREF
@ OP_DIGIT
@ OP_KETRPOS
@ OP_EXACT
@ OP_TYPEEXACT
@ OP_PLUS
@ OP_WHITESPACE
@ OP_CRMINSTAR
@ OP_NOTPOSPLUSI
@ OP_NOT_WORD_BOUNDARY
@ OP_KET
@ OP_NOT_DIGIT
@ OP_CALLOUT
@ OP_CRMINRANGE
@ OP_RECURSE
@ OP_BRA
@ OP_CREF
@ OP_POSUPTO
@ OP_MINUPTOI
@ OP_NOTPOSUPTO
@ OP_REVERSE
@ OP_NCLASS
@ OP_KETRMIN
@ OP_COND
@ OP_MINPLUS
@ OP_TYPEPOSUPTO
@ OP_WORDCHAR
@ OP_MINQUERY
@ OP_EODN
@ OP_UPTOI
@ OP_CRPOSRANGE
@ OP_THEN
@ OP_ALT
@ OP_UPTO
@ OP_QUERY
@ OP_POSQUERYI
@ OP_NOTPOSSTARI
@ OP_PROP
@ OP_NOTPOSSTAR
@ OP_PLUSI
@ OP_KETRMAX
@ OP_NOTMINPLUS
@ OP_CBRAPOS
@ OP_BRAZERO
@ OP_QUERYI
@ OP_POSPLUSI
@ OP_ANYBYTE
@ OP_CLOSE
@ OP_SCBRAPOS
@ OP_CHARI
@ OP_NOTMINQUERYI
@ OP_MARK
@ OP_TYPEMINQUERY
@ OP_SKIP
@ OP_NOT_WHITESPACE
@ OP_NOTMINSTAR
@ OP_NOTSTAR
@ OP_SCBRA
@ OP_CRPOSSTAR
@ OP_MINUPTO
@ OP_NOTPOSQUERYI
@ OP_NOT_VSPACE
@ OP_CRSTAR
@ OP_VSPACE
@ OP_POSQUERY
@ OP_MINSTAR
@ OP_PRUNE
@ OP_STAR
@ OP_ALLANY
@ OP_DEF
@ OP_DOLLM
@ OP_CRPOSQUERY
@ OP_DNREFI
@ OP_TYPEMINSTAR
@ OP_NOTMINUPTO
@ OP_NOTMINQUERY
@ OP_CRPLUS
@ OP_TYPEPOSQUERY
@ OP_POSPLUS
@ OP_REF
@ OP_SOD
@ OP_NOTPOSQUERY
@ OP_TYPEUPTO
@ OP_SOM
@ OP_ANY
@ OP_XCLASS
@ OP_POSSTARI
@ OP_NOT_HSPACE
@ OP_COMMIT
@ OP_FAIL
@ OP_MINQUERYI
@ OP_MINPLUSI
@ OP_ASSERT_ACCEPT
@ OP_NOTMINUPTOI
@ OP_NOTI
@ OP_NOTQUERY
@ OP_POSUPTOI
@ OP_CBRA
@ OP_BRAPOSZERO
@ OP_REFI
@ OP_NOTUPTOI
@ OP_CRPOSPLUS
@ OP_EXTUNI
@ OP_DNREF
@ OP_EOD
@ OP_NOTEXACTI
@ OP_NOTEXACT
@ OP_ACCEPT
@ OP_WORD_BOUNDARY
@ OP_NOTPOSPLUS
@ OP_CIRC
#define CHAR_CR
#define PCRE_STUDY_MINLEN
unsigned char pcre_uchar
#define CHAR_NEL
#define PCRE_FIRSTSET
#define cbit_word
#define CHAR_LF
#define CHAR_HT
#define fcc_offset
#define CHAR_FF
#define IMM2_SIZE
#define lcc_offset
#define GETCHARINC(c, eptr)
#define COMPILE_PCRE8
Definition: pcre_internal.h:58
#define XCL_MAP
#define PRIV(name)
#define CHAR_SPACE
#define ctypes_offset
#define CHAR_VT
#define GET2(a, n)
#define PCRE_STARTLINE
struct pcre_study_data pcre_study_data
static const unsigned char * tables(int mode)
pcre_extra * pcre_study(const pcre *external_re, int options, const char **errorptr)
Definition: pcre_study.c:1452
static int set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf, compile_data *cd)
Definition: pcre_study.c:802
void pcre_free_study(pcre_extra *extra)
Definition: pcre_study.c:1667
static void set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit, compile_data *cd)
Definition: pcre_study.c:723
#define SET_BIT(c)
Definition: pcre_study.c:51
static int find_minlength(const real_pcre *re, const pcre_uchar *code, const pcre_uchar *startcode, int options, recurse_check *recurses, int *countptr)
Definition: pcre_study.c:83
static const pcre_uchar * set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless, compile_data *cd, BOOL utf)
Definition: pcre_study.c:635
@ SSB_FAIL
Definition: pcre_study.c:55
@ SSB_DONE
Definition: pcre_study.c:55
@ SSB_CONTINUE
Definition: pcre_study.c:55
@ SSB_UNKNOWN
Definition: pcre_study.c:55
static void set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit, compile_data *cd)
Definition: pcre_study.c:765
#define LINK_SIZE
Definition: config.h:188
#define SUPPORT_UTF
Definition: config.h:366
Definition: inftrees.h:24
const pcre_uint8 * ctypes
const pcre_uint8 * lcc
const pcre_uint8 * cbits
const pcre_uint8 * fcc
unsigned long int flags
Definition: pcre.h:409
void * executable_jit
Definition: pcre.h:416
void * study_data
Definition: pcre.h:410
pcre_uint32 minlength
pcre_uint32 flags
pcre_uint8 start_bits[32]
pcre_uint32 size
const pcre_uchar * group
struct recurse_check * prev
int BOOL
Definition: sybdb.h:150
@ FALSE
Definition: testodbc.c:27
@ TRUE
Definition: testodbc.c:27
void free(voidpf ptr)
voidp malloc(uInt size)
Modified on Fri Mar 01 10:08:34 2024 by modify_doxy.py rev. 669887