|
1 /************************************************* |
|
2 * Perl-Compatible Regular Expressions * |
|
3 *************************************************/ |
|
4 |
|
5 /* PCRE is a library of functions to support regular expressions whose syntax |
|
6 and semantics are as close as possible to those of the Perl 5 language. |
|
7 |
|
8 Written by Philip Hazel |
|
9 Copyright (c) 1997-2005 University of Cambridge |
|
10 |
|
11 Copyright (C) 2002, 2004, 2006, 2007 Apple Inc. All rights reserved. |
|
12 |
|
13 ----------------------------------------------------------------------------- |
|
14 Redistribution and use in source and binary forms, with or without |
|
15 modification, are permitted provided that the following conditions are met: |
|
16 |
|
17 * Redistributions of source code must retain the above copyright notice, |
|
18 this list of conditions and the following disclaimer. |
|
19 |
|
20 * Redistributions in binary form must reproduce the above copyright |
|
21 notice, this list of conditions and the following disclaimer in the |
|
22 documentation and/or other materials provided with the distribution. |
|
23 |
|
24 * Neither the name of the University of Cambridge nor the names of its |
|
25 contributors may be used to endorse or promote products derived from |
|
26 this software without specific prior written permission. |
|
27 |
|
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
|
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|
38 POSSIBILITY OF SUCH DAMAGE. |
|
39 ----------------------------------------------------------------------------- |
|
40 */ |
|
41 |
|
42 |
|
43 /* This module contains pcre_exec(), the externally visible function that does |
|
44 pattern matching using an NFA algorithm, trying to mimic Perl as closely as |
|
45 possible. There are also some static supporting functions. */ |
|
46 |
|
47 #include "pcre_internal.h" |
|
48 |
|
49 /* Avoid warnings on Windows. */ |
|
50 #undef min |
|
51 #undef max |
|
52 |
|
53 /* Structure for building a chain of data that actually lives on the |
|
54 stack, for holding the values of the subject pointer at the start of each |
|
55 subpattern, so as to detect when an empty string has been matched by a |
|
56 subpattern - to break infinite loops. When NO_RECURSE is set, these blocks |
|
57 are on the heap, not on the stack. */ |
|
58 |
|
59 typedef struct eptrblock { |
|
60 struct eptrblock *epb_prev; |
|
61 const pcre_uchar *epb_saved_eptr; |
|
62 } eptrblock; |
|
63 |
|
/* Flag bits for the match() function */

#define match_condassert   (1 << 0)  /* Called to check a condition assertion */
#define match_isgroup      (1 << 1)  /* Set if start of bracketed group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_NOMATCH      0
#define MATCH_MATCH        1

/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
because the offset vector is always a multiple of 3 long. */

#define REC_STACK_SAVE_MAX (3 * 10)

/* Min and max values for the common repeats; for the maxima, 0 => infinity */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
|
85 |
|
86 |
|
87 |
|
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of chars in printable format, stopping at the end of the
subject if requested.

Printable characters are written as themselves; everything else is written as
a hex escape (\xNN, or \x{N...} for values of 256 and above when PCRE_UTF16 is
in effect).

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:     nothing
*/

static void
pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
{
  int c;

  /* Never run past the end of the subject when printing from within it. The
  difference is smaller than length here, so it fits in an int. */

  if (is_subject && length > md->end_subject - p)
    length = (int)(md->end_subject - p);

  while (length-- > 0)
    {
    c = *p++;

    /* isprint() is only defined for values representable as unsigned char (or
    EOF), so check the range before calling it. */

    if (c >= 0 && c < 256 && isprint(c)) printf("%c", c);
#if PCRE_UTF16
    else if (c < 256) printf("\\x%02x", c);
    else printf("\\x{%x}", c);
#else
    else printf("\\x%02x", c);
#endif
    }
}
#endif
|
120 |
|
121 |
|
122 |
|
123 /************************************************* |
|
124 * Match a back-reference * |
|
125 *************************************************/ |
|
126 |
|
127 /* If a back reference hasn't been set, the length that is passed is greater |
|
128 than the number of characters left in the string, so the match fails. |
|
129 |
|
130 Arguments: |
|
131 offset index into the offset vector |
|
132 eptr points into the subject |
|
133 length length to be matched |
|
134 md points to match data block |
|
135 ims the ims flags |
|
136 |
|
137 Returns: TRUE if matched |
|
138 */ |
|
139 |
|
140 static BOOL |
|
141 match_ref(int offset, register const pcre_uchar *eptr, int length, match_data *md, |
|
142 unsigned long int ims) |
|
143 { |
|
144 const pcre_uchar *p = md->start_subject + md->offset_vector[offset]; |
|
145 |
|
146 #ifdef DEBUG |
|
147 if (eptr >= md->end_subject) |
|
148 printf("matching subject <null>"); |
|
149 else |
|
150 { |
|
151 printf("matching subject "); |
|
152 pchars(eptr, length, TRUE, md); |
|
153 } |
|
154 printf(" against backref "); |
|
155 pchars(p, length, FALSE, md); |
|
156 printf("\n"); |
|
157 #endif |
|
158 |
|
159 /* Always fail if not enough characters left */ |
|
160 |
|
161 if (length > md->end_subject - eptr) return FALSE; |
|
162 |
|
163 /* Separate the caselesss case for speed */ |
|
164 |
|
165 if ((ims & PCRE_CASELESS) != 0) |
|
166 { |
|
167 while (length-- > 0) |
|
168 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; |
|
169 } |
|
170 else |
|
171 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; } |
|
172 |
|
173 return TRUE; |
|
174 } |
|
175 |
|
176 |
|
177 |
|
178 /*************************************************************************** |
|
179 **************************************************************************** |
|
180 RECURSION IN THE match() FUNCTION |
|
181 |
|
182 The match() function is highly recursive. Some regular expressions can cause |
|
183 it to recurse thousands of times. I was writing for Unix, so I just let it |
|
184 call itself recursively. This uses the stack for saving everything that has |
|
185 to be saved for a recursive call. On Unix, the stack can be large, and this |
|
186 works fine. |
|
187 |
|
188 It turns out that on non-Unix systems there are problems with programs that |
|
189 use a lot of stack. (This despite the fact that every last chip has oodles |
|
190 of memory these days, and techniques for extending the stack have been known |
|
191 for decades.) So.... |
|
192 |
|
193 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive |
|
194 calls by keeping local variables that need to be preserved in blocks of memory |
|
195 obtained from malloc instead of on the stack. Macros are used to |
|
196 achieve this so that the actual code doesn't look very different to what it |
|
197 always used to. |
|
198 **************************************************************************** |
|
199 ***************************************************************************/ |
|
200 |
|
201 |
|
202 /* These versions of the macros use the stack, as normal */ |
|
203 |
|
204 #ifndef NO_RECURSE |
|
/* True recursion: variables may live in registers, RMATCH is a plain call to
match() whose result is stored in rx (the num argument is not used here), and
RRETURN is a plain return. */

#define REGISTER register
#define RMATCH(num,rx,ra,rb,rc,rd,re,rf,rg) \
  (rx) = match((ra),(rb),(rc),(rd),(re),(rf),(rg))
#define RRETURN(ra) return (ra)
|
208 #else |
|
209 |
|
210 |
|
/* These versions of the macros manage a private stack on the heap. Note
that the rd argument of RMATCH isn't actually used. It's the md argument of
match(), which never changes. */

#define REGISTER

/* RMATCH_WHERE(num) is the value stored in a frame to remember where to
resume; RRETURN_LABEL is the goto target used to get back there. */

#ifdef __GNUC__

/* Use GCC's computed goto extension. */

/* For one test case this is more than 40% faster than the switch statement.
We could avoid the use of the num argument entirely by using local labels,
but using it for the GCC case as well as the non-GCC case allows us to share
a bit more code and notice if we use conflicting numbers. */

#define RMATCH_WHERE(num) &&RRETURN_##num
#define RRETURN_LABEL *frame->Xwhere

#else

/* Use numbered labels and switch statement at the bottom of the match function. */

#define RMATCH_WHERE(num) num
#define RRETURN_LABEL RRETURN_SWITCH

#endif
|
237 |
|
238 |
|
/* Heap-frame version of RMATCH. It takes the next slot of the stackframes
array while one is available and otherwise obtains a frame from
pcre_stack_malloc. It records in the current frame where to resume, fills the
new frame with the "arguments", links it to the current frame, and jumps to
HEAP_RECURSE. When control comes back to the RRETURN_<num> label, the current
frame is reloaded from md->thisframe and the result is copied into rx. The rd
argument is not used.

NOTE(review): the value returned by pcre_stack_malloc is used without a NULL
check - confirm how allocation failure is meant to be handled. */

#define RMATCH(num,rx,ra,rb,rc,rd,re,rf,rg)\
  {\
  heapframe *newframe;\
  if (frame < stackframes || frame + 1 >= stackframesend)\
    newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  else\
    newframe = frame + 1;\
  frame->Xwhere = RMATCH_WHERE(num);\
  newframe->Xeptr = (ra);\
  newframe->Xecode = (rb);\
  newframe->Xoffset_top = (rc);\
  newframe->Xims = (re);\
  newframe->Xeptrb = (rf);\
  newframe->Xflags = (rg);\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  RRETURN_##num:\
  DPRINTF(("did a goto back to line %d\n", __LINE__));\
  frame = md->thisframe;\
  (rx) = frame->Xresult;\
  }
|
262 |
|
/* Heap-frame version of RRETURN. It steps back to the previous frame and
releases the finished frame with pcre_stack_free unless it lies inside the
stackframes array. If there is no previous frame this was the top level, so
the value is returned from match(). Otherwise the value is stored in the
previous frame, that frame is published in md->thisframe, and control jumps
back to the place the frame recorded. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  if (newframe < stackframes || newframe >= stackframesend)\
    (pcre_stack_free)(newframe);\
  if (frame == NULL)\
    return (ra);\
  frame->Xresult = (ra);\
  md->thisframe = frame;\
  goto RRETURN_LABEL;\
  }
|
277 |
|
278 /* Structure for remembering the local variables in a private frame */ |
|
279 |
|
280 typedef struct heapframe { |
|
281 struct heapframe *Xprevframe; |
|
282 |
|
283 /* Function arguments that may change */ |
|
284 |
|
285 const pcre_uchar *Xeptr; |
|
286 const uschar *Xecode; |
|
287 int Xoffset_top; |
|
288 long int Xims; |
|
289 eptrblock *Xeptrb; |
|
290 int Xflags; |
|
291 |
|
292 /* Function local variables */ |
|
293 |
|
294 const uschar *Xcallpat; |
|
295 const uschar *Xcharptr; |
|
296 const uschar *Xdata; |
|
297 const uschar *Xnext; |
|
298 const pcre_uchar *Xpp; |
|
299 const uschar *Xprev; |
|
300 const pcre_uchar *Xsaved_eptr; |
|
301 |
|
302 recursion_info Xnew_recursive; |
|
303 |
|
304 BOOL Xcur_is_word; |
|
305 BOOL Xcondition; |
|
306 BOOL Xminimize; |
|
307 BOOL Xprev_is_word; |
|
308 |
|
309 unsigned long int Xoriginal_ims; |
|
310 |
|
311 #ifdef SUPPORT_UCP |
|
312 int Xprop_type; |
|
313 int Xprop_fail_result; |
|
314 int Xprop_category; |
|
315 int Xprop_chartype; |
|
316 int Xprop_othercase; |
|
317 int Xprop_test_against; |
|
318 int *Xprop_test_variable; |
|
319 |
|
320 int Xrepeat_othercase; |
|
321 #endif |
|
322 |
|
323 int Xctype; |
|
324 int Xfc; |
|
325 int Xfi; |
|
326 int Xlength; |
|
327 int Xmax; |
|
328 int Xmin; |
|
329 int Xnumber; |
|
330 int Xoffset; |
|
331 int Xop; |
|
332 int Xsave_capture_last; |
|
333 int Xsave_offset1, Xsave_offset2, Xsave_offset3; |
|
334 int Xstacksave[REC_STACK_SAVE_MAX]; |
|
335 |
|
336 eptrblock Xnewptrb; |
|
337 |
|
338 /* Place to pass back result, and where to jump back to */ |
|
339 |
|
340 int Xresult; |
|
341 #ifndef __GNUC__ |
|
342 int Xwhere; |
|
343 #else |
|
344 void *Xwhere; |
|
345 #endif |
|
346 |
|
347 } heapframe; |
|
348 |
|
349 #endif |
|
350 |
|
351 |
|
352 /*************************************************************************** |
|
353 ***************************************************************************/ |
|
354 |
|
355 |
|
356 |
|
357 /************************************************* |
|
358 * Match from current position * |
|
359 *************************************************/ |
|
360 |
|
361 /* On entry ecode points to the first opcode, and eptr to the first character |
|
362 in the subject string, while eptrb holds the value of eptr at the start of the |
|
363 last bracketed group - used for breaking infinite loops matching zero-length |
|
364 strings. This function is called recursively in many circumstances. Whenever it |
|
365 returns a negative (error) response, the outer incarnation must also return the |
|
366 same response. |
|
367 |
|
368 Performance note: It might be tempting to extract commonly used fields from the |
|
369 md structure (e.g. utf8, end_subject) into individual variables to improve |
|
370 performance. Tests using gcc on a SPARC disproved this; in the first case, it |
|
371 made performance worse. |
|
372 |
|
373 Arguments: |
|
374 eptr pointer in subject |
|
375 ecode position in code |
|
376 offset_top current top pointer |
|
377 md pointer to "static" info for the match |
|
378 ims current /i, /m, and /s options |
|
379 eptrb pointer to chain of blocks containing eptr at start of |
|
380 brackets - for testing for empty matches |
|
381 flags can contain |
|
382 match_condassert - this is an assertion condition |
|
383 match_isgroup - this is the start of a bracketed group |
|
384 |
|
385 Returns: MATCH_MATCH if matched ) these values are >= 0 |
|
386 MATCH_NOMATCH if failed to match ) |
|
387 a negative PCRE_ERROR_xxx value if aborted by an error condition |
|
388 (e.g. stopped by recursion limit) |
|
389 */ |
|
390 |
|
391 static int |
|
392 match(REGISTER const pcre_uchar *eptr, REGISTER const uschar *ecode, |
|
393 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb, |
|
394 int flags) |
|
395 { |
|
396 /* These variables do not need to be preserved over recursion in this function, |
|
397 so they can be ordinary variables in all cases. Mark them with "register" |
|
398 because they are used a lot in loops. */ |
|
399 |
|
400 register int rrc; /* Returns from recursive calls */ |
|
401 register int i; /* Used for loops not involving calls to RMATCH() */ |
|
402 register int c; /* Character values not kept over RMATCH() calls */ |
|
403 register BOOL utf8; /* Local copy of UTF-8 flag for speed */ |
|
404 |
|
405 /* When recursion is not being used, all "local" variables that have to be |
|
406 preserved over calls to RMATCH() are part of a "frame" which is obtained from |
|
407 heap storage. Set up the top-level frame here; others are obtained from the |
|
408 heap whenever RMATCH() does a "recursion". See the macro definitions above. */ |
|
409 |
|
410 #ifdef NO_RECURSE |
|
411 |
|
412 /* The value 16 here is large enough that most regular expressions don't require |
|
413 any calls to pcre_stack_malloc, yet the amount of stack used for the array is |
|
414 modest enough that we don't run out of stack. */ |
|
415 heapframe stackframes[16]; |
|
416 heapframe *stackframesend = stackframes + sizeof(stackframes) / sizeof(stackframes[0]); |
|
417 |
|
418 heapframe *frame = stackframes; |
|
419 frame->Xprevframe = NULL; /* Marks the top level */ |
|
420 |
|
421 /* Copy in the original argument variables */ |
|
422 |
|
423 frame->Xeptr = eptr; |
|
424 frame->Xecode = ecode; |
|
425 frame->Xoffset_top = offset_top; |
|
426 frame->Xims = ims; |
|
427 frame->Xeptrb = eptrb; |
|
428 frame->Xflags = flags; |
|
429 |
|
430 /* This is where control jumps back to to effect "recursion" */ |
|
431 |
|
432 HEAP_RECURSE: |
|
433 |
|
434 /* Macros make the argument variables come from the current frame */ |
|
435 |
|
436 #define eptr frame->Xeptr |
|
437 #define ecode frame->Xecode |
|
438 #define offset_top frame->Xoffset_top |
|
439 #define ims frame->Xims |
|
440 #define eptrb frame->Xeptrb |
|
441 #define flags frame->Xflags |
|
442 |
|
443 /* Ditto for the local variables */ |
|
444 |
|
445 #ifdef SUPPORT_UTF8 |
|
446 #define charptr frame->Xcharptr |
|
447 #endif |
|
448 #define callpat frame->Xcallpat |
|
449 #define data frame->Xdata |
|
450 #define next frame->Xnext |
|
451 #define pp frame->Xpp |
|
452 #define prev frame->Xprev |
|
453 #define saved_eptr frame->Xsaved_eptr |
|
454 |
|
455 #define new_recursive frame->Xnew_recursive |
|
456 |
|
457 #define cur_is_word frame->Xcur_is_word |
|
458 #define condition frame->Xcondition |
|
459 #define minimize frame->Xminimize |
|
460 #define prev_is_word frame->Xprev_is_word |
|
461 |
|
462 #define original_ims frame->Xoriginal_ims |
|
463 |
|
464 #ifdef SUPPORT_UCP |
|
465 |
|
466 #define prop_type frame->Xprop_type |
|
467 #define prop_fail_result frame->Xprop_fail_result |
|
468 #define prop_category frame->Xprop_category |
|
469 #define prop_chartype frame->Xprop_chartype |
|
470 #define prop_othercase frame->Xprop_othercase |
|
471 #define prop_test_against frame->Xprop_test_against |
|
472 #define prop_test_variable frame->Xprop_test_variable |
|
473 |
|
474 #define repeat_othercase frame->Xrepeat_othercase |
|
475 |
|
476 #endif |
|
477 |
|
478 #define ctype frame->Xctype |
|
479 #define fc frame->Xfc |
|
480 #define fi frame->Xfi |
|
481 #define length frame->Xlength |
|
482 #define max frame->Xmax |
|
483 #define min frame->Xmin |
|
484 #define number frame->Xnumber |
|
485 #define offset frame->Xoffset |
|
486 #define op frame->Xop |
|
487 #define save_capture_last frame->Xsave_capture_last |
|
488 #define save_offset1 frame->Xsave_offset1 |
|
489 #define save_offset2 frame->Xsave_offset2 |
|
490 #define save_offset3 frame->Xsave_offset3 |
|
491 #define stacksave frame->Xstacksave |
|
492 |
|
493 #define newptrb frame->Xnewptrb |
|
494 |
|
495 /* When recursion is being used, local variables are allocated on the stack and |
|
496 get preserved during recursion in the normal way. In this environment, fi and |
|
497 i, and fc and c, can be the same variables. */ |
|
498 |
|
499 #else |
|
500 #define fi i |
|
501 #define fc c |
|
502 |
|
503 |
|
504 #if !PCRE_UTF16 |
|
505 #ifdef SUPPORT_UTF8 /* Many of these variables are used ony */ |
|
506 const uschar *charptr; /* small blocks of the code. My normal */ |
|
507 #endif /* style of coding would have declared */ |
|
508 #endif |
|
509 const uschar *callpat; /* them within each of those blocks. */ |
|
510 const uschar *data; /* However, in order to accommodate the */ |
|
511 const uschar *next; /* version of this code that uses an */ |
|
512 const pcre_uchar *pp; /* external "stack" implemented on the */ |
|
513 const uschar *prev; /* heap, it is easier to declare them */ |
|
514 const pcre_uchar *saved_eptr; /* all here, so the declarations can */ |
|
515 /* be cut out in a block. The only */ |
|
516 recursion_info new_recursive; /* declarations within blocks below are */ |
|
517 /* for variables that do not have to */ |
|
518 BOOL cur_is_word; /* be preserved over a recursive call */ |
|
519 BOOL condition; /* to RMATCH(). */ |
|
520 BOOL minimize; |
|
521 BOOL prev_is_word; |
|
522 |
|
523 unsigned long int original_ims; |
|
524 |
|
525 #ifdef SUPPORT_UCP |
|
526 |
|
527 int prop_type; |
|
528 int prop_fail_result; |
|
529 int prop_category; |
|
530 int prop_chartype; |
|
531 int prop_othercase; |
|
532 int prop_test_against; |
|
533 int *prop_test_variable; |
|
534 |
|
535 int repeat_othercase; |
|
536 |
|
537 #endif |
|
538 |
|
539 int ctype; |
|
540 int length; |
|
541 int max; |
|
542 int min; |
|
543 int number; |
|
544 int offset; |
|
545 int op; |
|
546 int save_capture_last; |
|
547 int save_offset1, save_offset2, save_offset3; |
|
548 int stacksave[REC_STACK_SAVE_MAX]; |
|
549 |
|
550 eptrblock newptrb; |
|
551 #endif |
|
552 |
|
553 /* These statements are here to stop the compiler complaining about uninitialized |
|
554 variables. */ |
|
555 |
|
556 #ifdef SUPPORT_UCP |
|
557 prop_fail_result = 0; |
|
558 prop_test_against = 0; |
|
559 prop_test_variable = NULL; |
|
560 #endif |
|
561 |
|
562 /* OK, now we can get on with the real code of the function. Recursion is |
|
563 specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined, |
|
564 these just turn into a recursive call to match() and a "return", respectively. |
|
565 However, RMATCH isn't like a function call because it's quite a complicated |
|
566 macro. It has to be used in one particular way. This shouldn't, however, impact |
|
567 performance when true recursion is being used. */ |
|
568 |
|
569 utf8 = md->utf8; /* Local copy of the flag */ |
|
570 |
|
571 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT); |
|
572 |
|
573 original_ims = ims; /* Save for resetting on ')' */ |
|
574 |
|
575 /* At the start of a bracketed group, add the current subject pointer to the |
|
576 stack of such pointers, to be re-instated at the end of the group when we hit |
|
577 the closing ket. When match() is called in other circumstances, we don't add to |
|
578 this stack. */ |
|
579 |
|
580 if ((flags & match_isgroup) != 0) |
|
581 { |
|
582 newptrb.epb_prev = eptrb; |
|
583 newptrb.epb_saved_eptr = eptr; |
|
584 eptrb = &newptrb; |
|
585 } |
|
586 |
|
587 /* Now start processing the operations. */ |
|
588 |
|
589 for (;;) |
|
590 { |
|
591 op = *ecode; |
|
592 minimize = FALSE; |
|
593 |
|
594 /* For partial matching, remember if we ever hit the end of the subject after |
|
595 matching at least one subject character. */ |
|
596 |
|
597 if (md->partial && |
|
598 eptr >= md->end_subject && |
|
599 eptr > md->start_match) |
|
600 md->hitend = TRUE; |
|
601 |
|
602 /* Opening capturing bracket. If there is space in the offset vector, save |
|
603 the current subject position in the working slot at the top of the vector. We |
|
604 mustn't change the current values of the data slot, because they may be set |
|
605 from a previous iteration of this group, and be referred to by a reference |
|
606 inside the group. |
|
607 |
|
608 If the bracket fails to match, we need to restore this value and also the |
|
609 values of the final offsets, in case they were set by a previous iteration of |
|
610 the same bracket. |
|
611 |
|
612 If there isn't enough space in the offset vector, treat this as if it were a |
|
613 non-capturing bracket. Don't worry about setting the flag for the error case |
|
614 here; that is handled in the code for KET. */ |
|
615 |
|
616 if (op > OP_BRA) |
|
617 { |
|
618 number = op - OP_BRA; |
|
619 |
|
620 /* For extended extraction brackets (large number), we have to fish out the |
|
621 number from a dummy opcode at the start. */ |
|
622 |
|
623 if (number > EXTRACT_BASIC_MAX) |
|
624 number = GET2(ecode, 2+LINK_SIZE); |
|
625 offset = number << 1; |
|
626 |
|
627 #ifdef DEBUG |
|
628 printf("start bracket %d subject=", number); |
|
629 pchars(eptr, 16, TRUE, md); |
|
630 printf("\n"); |
|
631 #endif |
|
632 |
|
633 if (offset < md->offset_max) |
|
634 { |
|
635 save_offset1 = md->offset_vector[offset]; |
|
636 save_offset2 = md->offset_vector[offset+1]; |
|
637 save_offset3 = md->offset_vector[md->offset_end - number]; |
|
638 save_capture_last = md->capture_last; |
|
639 |
|
640 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); |
|
641 md->offset_vector[md->offset_end - number] = INT_CAST(eptr - md->start_subject); |
|
642 |
|
643 do |
|
644 { |
|
645 RMATCH(1, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, |
|
646 match_isgroup); |
|
647 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
648 md->capture_last = save_capture_last; |
|
649 ecode += GET(ecode, 1); |
|
650 } |
|
651 while (*ecode == OP_ALT); |
|
652 |
|
653 DPRINTF(("bracket %d failed\n", number)); |
|
654 |
|
655 md->offset_vector[offset] = save_offset1; |
|
656 md->offset_vector[offset+1] = save_offset2; |
|
657 md->offset_vector[md->offset_end - number] = save_offset3; |
|
658 |
|
659 RRETURN(MATCH_NOMATCH); |
|
660 } |
|
661 |
|
662 /* Insufficient room for saving captured contents */ |
|
663 |
|
664 else op = OP_BRA; |
|
665 } |
|
666 |
|
667 /* Other types of node can be handled by a switch */ |
|
668 |
|
669 switch(op) |
|
670 { |
|
671 case OP_BRA: /* Non-capturing bracket: optimized */ |
|
672 DPRINTF(("start bracket 0\n")); |
|
673 do |
|
674 { |
|
675 RMATCH(2, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, |
|
676 match_isgroup); |
|
677 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
678 ecode += GET(ecode, 1); |
|
679 } |
|
680 while (*ecode == OP_ALT); |
|
681 DPRINTF(("bracket 0 failed\n")); |
|
682 RRETURN(MATCH_NOMATCH); |
|
683 |
|
684 /* Conditional group: compilation checked that there are no more than |
|
685 two branches. If the condition is false, skipping the first branch takes us |
|
686 past the end if there is only one branch, but that's OK because that is |
|
687 exactly what going to the ket would do. */ |
|
688 |
|
689 case OP_COND: |
|
690 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */ |
|
691 { |
|
692 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ |
|
693 condition = (offset == CREF_RECURSE * 2)? |
|
694 (md->recursive != NULL) : |
|
695 (offset < offset_top && md->offset_vector[offset] >= 0); |
|
696 RMATCH(3, rrc, eptr, ecode + (condition? |
|
697 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))), |
|
698 offset_top, md, ims, eptrb, match_isgroup); |
|
699 RRETURN(rrc); |
|
700 } |
|
701 |
|
702 /* The condition is an assertion. Call match() to evaluate it - setting |
|
703 the final argument TRUE causes it to stop at the end of an assertion. */ |
|
704 |
|
705 else |
|
706 { |
|
707 RMATCH(4, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, |
|
708 match_condassert | match_isgroup); |
|
709 if (rrc == MATCH_MATCH) |
|
710 { |
|
711 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2); |
|
712 while (*ecode == OP_ALT) ecode += GET(ecode, 1); |
|
713 } |
|
714 else if (rrc != MATCH_NOMATCH) |
|
715 { |
|
716 RRETURN(rrc); /* Need braces because of following else */ |
|
717 } |
|
718 else ecode += GET(ecode, 1); |
|
719 RMATCH(5, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, |
|
720 match_isgroup); |
|
721 RRETURN(rrc); |
|
722 } |
|
723 /* Control never reaches here */ |
|
724 |
|
725 /* Skip over conditional reference or large extraction number data if |
|
726 encountered. */ |
|
727 |
|
728 case OP_CREF: |
|
729 case OP_BRANUMBER: |
|
730 ecode += 3; |
|
731 break; |
|
732 |
|
733 /* End of the pattern. If we are in a recursion, we should restore the |
|
734 offsets appropriately and continue from after the call. */ |
|
735 |
|
736 case OP_END: |
|
737 if (md->recursive != NULL && md->recursive->group_num == 0) |
|
738 { |
|
739 recursion_info *rec = md->recursive; |
|
740 DPRINTF(("Hit the end in a (?0) recursion\n")); |
|
741 md->recursive = rec->prevrec; |
|
742 memmove(md->offset_vector, rec->offset_save, |
|
743 rec->saved_max * sizeof(int)); |
|
744 md->start_match = rec->save_start; |
|
745 ims = original_ims; |
|
746 ecode = rec->after_call; |
|
747 break; |
|
748 } |
|
749 |
|
750 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty |
|
751 string - backtracking will then try other alternatives, if any. */ |
|
752 |
|
753 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH); |
|
754 md->end_match_ptr = eptr; /* Record where we ended */ |
|
755 md->end_offset_top = offset_top; /* and how many extracts were taken */ |
|
756 RRETURN(MATCH_MATCH); |
|
757 |
|
758 /* Change option settings */ |
|
759 |
|
760 case OP_OPT: |
|
761 ims = ecode[1]; |
|
762 ecode += 2; |
|
763 DPRINTF(("ims set to %02lx\n", ims)); |
|
764 break; |
|
765 |
|
766 /* Assertion brackets. Check the alternative branches in turn - the |
|
767 matching won't pass the KET for an assertion. If any one branch matches, |
|
768 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the |
|
769 start of each branch to move the current point backwards, so the code at |
|
770 this level is identical to the lookahead case. */ |
|
771 |
|
772 case OP_ASSERT: |
|
773 case OP_ASSERTBACK: |
|
774 do |
|
775 { |
|
776 RMATCH(6, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, |
|
777 match_isgroup); |
|
778 if (rrc == MATCH_MATCH) break; |
|
779 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
780 ecode += GET(ecode, 1); |
|
781 } |
|
782 while (*ecode == OP_ALT); |
|
783 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); |
|
784 |
|
785 /* If checking an assertion for a condition, return MATCH_MATCH. */ |
|
786 |
|
787 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); |
|
788 |
|
789 /* Continue from after the assertion, updating the offsets high water |
|
790 mark, since extracts may have been taken during the assertion. */ |
|
791 |
|
792 do ecode += GET(ecode,1); while (*ecode == OP_ALT); |
|
793 ecode += 1 + LINK_SIZE; |
|
794 offset_top = md->end_offset_top; |
|
795 continue; |
|
796 |
|
797 /* Negative assertion: all branches must fail to match */ |
|
798 |
|
799 case OP_ASSERT_NOT: |
|
800 case OP_ASSERTBACK_NOT: |
|
801 do |
|
802 { |
|
803 RMATCH(7, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, |
|
804 match_isgroup); |
|
805 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH); |
|
806 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
807 ecode += GET(ecode,1); |
|
808 } |
|
809 while (*ecode == OP_ALT); |
|
810 |
|
811 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); |
|
812 |
|
813 ecode += 1 + LINK_SIZE; |
|
814 continue; |
|
815 |
|
816 /* Move the subject pointer back. This occurs only at the start of |
|
817 each branch of a lookbehind assertion. If we are too close to the start to |
|
818 move back, this match function fails. When working with UTF-8 we move |
|
819 back a number of characters, not bytes. */ |
|
820 |
|
821 case OP_REVERSE: |
|
822 #ifdef SUPPORT_UTF8 |
|
823 if (utf8) |
|
824 { |
|
825 c = GET(ecode,1); |
|
826 for (i = 0; i < c; i++) |
|
827 { |
|
828 eptr--; |
|
829 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); |
|
830 BACKCHAR(eptr) |
|
831 } |
|
832 } |
|
833 else |
|
834 #endif |
|
835 |
|
836 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ |
|
837 |
|
838 { |
|
839 eptr -= GET(ecode,1); |
|
840 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); |
|
841 } |
|
842 |
|
843 /* Skip to next op code */ |
|
844 |
|
845 ecode += 1 + LINK_SIZE; |
|
846 break; |
|
847 |
|
848 /* The callout item calls an external function, if one is provided, passing |
|
849 details of the match so far. This is mainly for debugging, though the |
|
850 function is able to force a failure. */ |
|
851 |
|
852 case OP_CALLOUT: |
|
853 if (pcre_callout != NULL) |
|
854 { |
|
855 pcre_callout_block cb; |
|
856 cb.version = 1; /* Version 1 of the callout block */ |
|
857 cb.callout_number = ecode[1]; |
|
858 cb.offset_vector = md->offset_vector; |
|
859 cb.subject = (const pcre_char *)md->start_subject; |
|
860 cb.subject_length = INT_CAST(md->end_subject - md->start_subject); |
|
861 cb.start_match = INT_CAST(md->start_match - md->start_subject); |
|
862 cb.current_position = INT_CAST(eptr - md->start_subject); |
|
863 cb.pattern_position = GET(ecode, 2); |
|
864 cb.next_item_length = GET(ecode, 2 + LINK_SIZE); |
|
865 cb.capture_top = offset_top/2; |
|
866 cb.capture_last = md->capture_last; |
|
867 cb.callout_data = md->callout_data; |
|
868 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH); |
|
869 if (rrc < 0) RRETURN(rrc); |
|
870 } |
|
871 ecode += 2 + 2*LINK_SIZE; |
|
872 break; |
|
873 |
|
874 /* Recursion either matches the current regex, or some subexpression. The |
|
875 offset data is the offset to the starting bracket from the start of the |
|
876 whole pattern. (This is so that it works from duplicated subpatterns.) |
|
877 |
|
878 If there are any capturing brackets started but not finished, we have to |
|
879 save their starting points and reinstate them after the recursion. However, |
|
880 we don't know how many such there are (offset_top records the completed |
|
881 total) so we just have to save all the potential data. There may be up to |
|
882 65535 such values, which is too large to put on the stack, but using malloc |
|
883 for small numbers seems expensive. As a compromise, the stack is used when |
|
884 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc |
|
885 is used. If the malloc fails, no partial save is attempted: the recursion is |
|
886 abandoned at once and PCRE_ERROR_NOMEMORY is handed back through RRETURN, |
|
887 as the allocation check in the code below shows. |
|
888 |
|
889 There are also other values that have to be saved. We use a chained |
|
890 sequence of blocks that actually live on the stack. Thanks to Robin Houston |
|
891 for the original version of this logic. */ |
|
892 |
|
893 case OP_RECURSE: |
|
894 { |
|
895 callpat = md->start_code + GET(ecode, 1); |
|
896 new_recursive.group_num = *callpat - OP_BRA; |
|
897 |
|
898 /* For extended extraction brackets (large number), we have to fish out |
|
899 the number from a dummy opcode at the start. */ |
|
900 |
|
901 if (new_recursive.group_num > EXTRACT_BASIC_MAX) |
|
902 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE); |
|
903 |
|
904 /* Add to "recursing stack" */ |
|
905 |
|
906 new_recursive.prevrec = md->recursive; |
|
907 md->recursive = &new_recursive; |
|
908 |
|
909 /* Find where to continue from afterwards */ |
|
910 |
|
911 ecode += 1 + LINK_SIZE; |
|
912 new_recursive.after_call = ecode; |
|
913 |
|
914 /* Now save the offset data. */ |
|
915 |
|
916 new_recursive.saved_max = md->offset_end; |
|
917 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX) |
|
918 new_recursive.offset_save = stacksave; |
|
919 else |
|
920 { |
|
921 new_recursive.offset_save = |
|
922 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int)); |
|
923 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY); |
|
924 } |
|
925 |
|
926 memcpy(new_recursive.offset_save, md->offset_vector, |
|
927 new_recursive.saved_max * sizeof(int)); |
|
928 new_recursive.save_start = md->start_match; |
|
929 md->start_match = eptr; |
|
930 |
|
931 /* OK, now we can do the recursion. For each top-level alternative we |
|
932 restore the offset and recursion data. */ |
|
933 |
|
934 DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); |
|
935 do |
|
936 { |
|
937 RMATCH(8, rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims, |
|
938 eptrb, match_isgroup); |
|
939 if (rrc == MATCH_MATCH) |
|
940 { |
|
941 md->recursive = new_recursive.prevrec; |
|
942 if (new_recursive.offset_save != stacksave) |
|
943 (pcre_free)(new_recursive.offset_save); |
|
944 RRETURN(MATCH_MATCH); |
|
945 } |
|
946 else if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
947 |
|
948 md->recursive = &new_recursive; |
|
949 memcpy(md->offset_vector, new_recursive.offset_save, |
|
950 new_recursive.saved_max * sizeof(int)); |
|
951 callpat += GET(callpat, 1); |
|
952 } |
|
953 while (*callpat == OP_ALT); |
|
954 |
|
955 DPRINTF(("Recursion didn't match\n")); |
|
956 md->recursive = new_recursive.prevrec; |
|
957 if (new_recursive.offset_save != stacksave) |
|
958 (pcre_free)(new_recursive.offset_save); |
|
959 RRETURN(MATCH_NOMATCH); |
|
960 } |
|
961 /* Control never reaches here */ |
|
962 |
|
963 /* "Once" brackets are like assertion brackets except that after a match, |
|
964 the point in the subject string is not moved back. Thus there can never be |
|
965 a move back into the brackets. Friedl calls these "atomic" subpatterns. |
|
966 Check the alternative branches in turn - the matching won't pass the KET |
|
967 for this kind of subpattern. If any one branch matches, we carry on as at |
|
968 the end of a normal bracket, leaving the subject pointer. */ |
|
969 |
|
970 case OP_ONCE: |
|
971 { |
|
972 prev = ecode; |
|
973 saved_eptr = eptr; |
|
974 |
|
975 do |
|
976 { |
|
977 RMATCH(9, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, |
|
978 eptrb, match_isgroup); |
|
979 if (rrc == MATCH_MATCH) break; |
|
980 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
981 ecode += GET(ecode,1); |
|
982 } |
|
983 while (*ecode == OP_ALT); |
|
984 |
|
985 /* If we hit the end of the group (which could be repeated), fail */ |
|
986 |
|
987 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH); |
|
988 |
|
989 /* Continue as from after the assertion, updating the offsets high water |
|
990 mark, since extracts may have been taken. */ |
|
991 |
|
992 do ecode += GET(ecode,1); while (*ecode == OP_ALT); |
|
993 |
|
994 offset_top = md->end_offset_top; |
|
995 eptr = md->end_match_ptr; |
|
996 |
|
997 /* For a non-repeating ket, just continue at this level. This also |
|
998 happens for a repeating ket if no characters were matched in the group. |
|
999 This is the forcible breaking of infinite loops as implemented in Perl |
|
1000 5.005. If there is an options reset, it will get obeyed in the normal |
|
1001 course of events. */ |
|
1002 |
|
1003 if (*ecode == OP_KET || eptr == saved_eptr) |
|
1004 { |
|
1005 ecode += 1+LINK_SIZE; |
|
1006 break; |
|
1007 } |
|
1008 |
|
1009 /* The repeating kets try the rest of the pattern or restart from the |
|
1010 preceding bracket, in the appropriate order. We need to reset any options |
|
1011 that changed within the bracket before re-running it, so check the next |
|
1012 opcode. */ |
|
1013 |
|
1014 if (ecode[1+LINK_SIZE] == OP_OPT) |
|
1015 { |
|
1016 ims = (ims & ~PCRE_IMS) | ecode[4]; |
|
1017 DPRINTF(("ims set to %02lx at group repeat\n", ims)); |
|
1018 } |
|
1019 |
|
1020 if (*ecode == OP_KETRMIN) |
|
1021 { |
|
1022 RMATCH(10, rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0); |
|
1023 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1024 RMATCH(11, rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); |
|
1025 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1026 } |
|
1027 else /* OP_KETRMAX */ |
|
1028 { |
|
1029 RMATCH(12, rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); |
|
1030 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1031 RMATCH(13, rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0); |
|
1032 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1033 } |
|
1034 } |
|
1035 RRETURN(MATCH_NOMATCH); |
|
1036 |
|
1037 /* An alternation is the end of a branch; scan along to find the end of the |
|
1038 bracketed group and go to there. */ |
|
1039 |
|
1040 case OP_ALT: |
|
1041 do ecode += GET(ecode,1); while (*ecode == OP_ALT); |
|
1042 break; |
|
1043 |
|
1044 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating |
|
1045 that it may occur zero times. It may repeat infinitely, or not at all - |
|
1046 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper |
|
1047 repeat limits are compiled as a number of copies, with the optional ones |
|
1048 preceded by BRAZERO or BRAMINZERO. */ |
|
1049 |
|
1050 case OP_BRAZERO: |
|
1051 { |
|
1052 next = ecode+1; |
|
1053 RMATCH(14, rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup); |
|
1054 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1055 do next += GET(next,1); while (*next == OP_ALT); |
|
1056 ecode = next + 1+LINK_SIZE; |
|
1057 } |
|
1058 break; |
|
1059 |
|
1060 case OP_BRAMINZERO: |
|
1061 { |
|
1062 next = ecode+1; |
|
1063 do next += GET(next,1); while (*next == OP_ALT); |
|
1064 RMATCH(15, rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, |
|
1065 match_isgroup); |
|
1066 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1067 ecode++; |
|
1068 } |
|
1069 break; |
|
1070 |
|
1071 /* End of a group, repeated or non-repeating. If we are at the end of |
|
1072 an assertion "group", stop matching and return MATCH_MATCH, but record the |
|
1073 current high water mark for use by positive assertions. Do this also |
|
1074 for the "once" (not-backing-up) groups. */ |
|
1075 |
|
1076 case OP_KET: |
|
1077 case OP_KETRMIN: |
|
1078 case OP_KETRMAX: |
|
1079 { |
|
1080 prev = ecode - GET(ecode, 1); |
|
1081 saved_eptr = eptrb->epb_saved_eptr; |
|
1082 |
|
1083 /* Back up the stack of bracket start pointers. */ |
|
1084 |
|
1085 eptrb = eptrb->epb_prev; |
|
1086 |
|
1087 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || |
|
1088 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT || |
|
1089 *prev == OP_ONCE) |
|
1090 { |
|
1091 md->end_match_ptr = eptr; /* For ONCE */ |
|
1092 md->end_offset_top = offset_top; |
|
1093 RRETURN(MATCH_MATCH); |
|
1094 } |
|
1095 |
|
1096 /* In all other cases except a conditional group we have to check the |
|
1097 group number back at the start and if necessary complete handling an |
|
1098 extraction by setting the offsets and bumping the high water mark. */ |
|
1099 |
|
1100 if (*prev != OP_COND) |
|
1101 { |
|
1102 number = *prev - OP_BRA; |
|
1103 |
|
1104 /* For extended extraction brackets (large number), we have to fish out |
|
1105 the number from a dummy opcode at the start. */ |
|
1106 |
|
1107 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE); |
|
1108 offset = number << 1; |
|
1109 |
|
1110 #ifdef DEBUG |
|
1111 printf("end bracket %d", number); |
|
1112 printf("\n"); |
|
1113 #endif |
|
1114 |
|
1115 /* Test for a numbered group. This includes groups called as a result |
|
1116 of recursion. Note that whole-pattern recursion is coded as a recurse |
|
1117 into group 0, so it won't be picked up here. Instead, we catch it when |
|
1118 the OP_END is reached. */ |
|
1119 |
|
1120 if (number > 0) |
|
1121 { |
|
1122 md->capture_last = number; |
|
1123 if (offset >= md->offset_max) md->offset_overflow = TRUE; else |
|
1124 { |
|
1125 md->offset_vector[offset] = |
|
1126 md->offset_vector[md->offset_end - number]; |
|
1127 md->offset_vector[offset+1] = INT_CAST(eptr - md->start_subject); |
|
1128 if (offset_top <= offset) offset_top = offset + 2; |
|
1129 } |
|
1130 |
|
1131 /* Handle a recursively called group. Restore the offsets |
|
1132 appropriately and continue from after the call. */ |
|
1133 |
|
1134 if (md->recursive != NULL && md->recursive->group_num == number) |
|
1135 { |
|
1136 recursion_info *rec = md->recursive; |
|
1137 DPRINTF(("Recursion (%d) succeeded - continuing\n", number)); |
|
1138 md->recursive = rec->prevrec; |
|
1139 md->start_match = rec->save_start; |
|
1140 memcpy(md->offset_vector, rec->offset_save, |
|
1141 rec->saved_max * sizeof(int)); |
|
1142 ecode = rec->after_call; |
|
1143 ims = original_ims; |
|
1144 break; |
|
1145 } |
|
1146 } |
|
1147 } |
|
1148 |
|
1149 /* Reset the value of the ims flags, in case they got changed during |
|
1150 the group. */ |
|
1151 |
|
1152 ims = original_ims; |
|
1153 DPRINTF(("ims reset to %02lx\n", ims)); |
|
1154 |
|
1155 /* For a non-repeating ket, just continue at this level. This also |
|
1156 happens for a repeating ket if no characters were matched in the group. |
|
1157 This is the forcible breaking of infinite loops as implemented in Perl |
|
1158 5.005. If there is an options reset, it will get obeyed in the normal |
|
1159 course of events. */ |
|
1160 |
|
1161 if (*ecode == OP_KET || eptr == saved_eptr) |
|
1162 { |
|
1163 ecode += 1 + LINK_SIZE; |
|
1164 break; |
|
1165 } |
|
1166 |
|
1167 /* The repeating kets try the rest of the pattern or restart from the |
|
1168 preceding bracket, in the appropriate order. */ |
|
1169 |
|
1170 if (*ecode == OP_KETRMIN) |
|
1171 { |
|
1172 RMATCH(16, rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0); |
|
1173 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1174 RMATCH(17, rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); |
|
1175 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1176 } |
|
1177 else /* OP_KETRMAX */ |
|
1178 { |
|
1179 RMATCH(18, rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); |
|
1180 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1181 RMATCH(19, rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0); |
|
1182 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1183 } |
|
1184 } |
|
1185 |
|
1186 RRETURN(MATCH_NOMATCH); |
|
1187 |
|
1188 /* Start of subject unless notbol, or after internal newline if multiline */ |
|
1189 |
|
1190 case OP_CIRC: |
|
1191 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); |
|
1192 if ((ims & PCRE_MULTILINE) != 0) |
|
1193 { |
|
1194 if (eptr != md->start_subject && eptr[-1] != NEWLINE) |
|
1195 RRETURN(MATCH_NOMATCH); |
|
1196 ecode++; |
|
1197 break; |
|
1198 } |
|
1199 /* ... else fall through */ |
|
1200 |
|
1201 /* Start of subject assertion */ |
|
1202 |
|
1203 case OP_SOD: |
|
1204 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH); |
|
1205 ecode++; |
|
1206 break; |
|
1207 |
|
1208 /* Start of match assertion */ |
|
1209 |
|
1210 case OP_SOM: |
|
1211 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH); |
|
1212 ecode++; |
|
1213 break; |
|
1214 |
|
1215 /* Assert before internal newline if multiline, or before a terminating |
|
1216 newline unless endonly is set, else end of subject unless noteol is set. */ |
|
1217 |
|
1218 case OP_DOLL: |
|
1219 if ((ims & PCRE_MULTILINE) != 0) |
|
1220 { |
|
1221 if (eptr < md->end_subject) |
|
1222 { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); } |
|
1223 else |
|
1224 { if (md->noteol) RRETURN(MATCH_NOMATCH); } |
|
1225 ecode++; |
|
1226 break; |
|
1227 } |
|
1228 else |
|
1229 { |
|
1230 if (md->noteol) RRETURN(MATCH_NOMATCH); |
|
1231 if (!md->endonly) |
|
1232 { |
|
1233 if (eptr < md->end_subject - 1 || |
|
1234 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) |
|
1235 RRETURN(MATCH_NOMATCH); |
|
1236 ecode++; |
|
1237 break; |
|
1238 } |
|
1239 } |
|
1240 /* ... else fall through */ |
|
1241 |
|
1242 /* End of subject assertion (\z) */ |
|
1243 |
|
1244 case OP_EOD: |
|
1245 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1246 ecode++; |
|
1247 break; |
|
1248 |
|
1249 /* End of subject or ending \n assertion (\Z) */ |
|
1250 |
|
1251 case OP_EODN: |
|
1252 if (eptr < md->end_subject - 1 || |
|
1253 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH); |
|
1254 ecode++; |
|
1255 break; |
|
1256 |
|
1257 /* Word boundary assertions */ |
|
1258 |
|
1259 case OP_NOT_WORD_BOUNDARY: |
|
1260 case OP_WORD_BOUNDARY: |
|
1261 { |
|
1262 |
|
1263 /* Find out if the previous and current characters are "word" characters. |
|
1264 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to |
|
1265 be "non-word" characters. */ |
|
1266 |
|
1267 #ifdef SUPPORT_UTF8 |
|
1268 if (utf8) |
|
1269 { |
|
1270 if (eptr == md->start_subject) prev_is_word = FALSE; else |
|
1271 { |
|
1272 const pcre_uchar *lastptr = eptr - 1; |
|
1273 while(ISMIDCHAR(*lastptr)) lastptr--; |
|
1274 GETCHAR(c, lastptr); |
|
1275 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; |
|
1276 } |
|
1277 if (eptr >= md->end_subject) cur_is_word = FALSE; else |
|
1278 { |
|
1279 GETCHAR(c, eptr); |
|
1280 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; |
|
1281 } |
|
1282 } |
|
1283 else |
|
1284 #endif |
|
1285 |
|
1286 /* More streamlined when not in UTF-8 mode */ |
|
1287 |
|
1288 { |
|
1289 prev_is_word = (eptr != md->start_subject) && |
|
1290 ((md->ctypes[eptr[-1]] & ctype_word) != 0); |
|
1291 cur_is_word = (eptr < md->end_subject) && |
|
1292 ((md->ctypes[*eptr] & ctype_word) != 0); |
|
1293 } |
|
1294 |
|
1295 /* Now see if the situation is what we want */ |
|
1296 |
|
1297 if ((*ecode++ == OP_WORD_BOUNDARY)? |
|
1298 cur_is_word == prev_is_word : cur_is_word != prev_is_word) |
|
1299 RRETURN(MATCH_NOMATCH); |
|
1300 } |
|
1301 break; |
|
1302 |
|
1303 /* Match a single character type; inline for speed */ |
|
1304 |
|
1305 case OP_ANY: |
|
1306 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE) |
|
1307 RRETURN(MATCH_NOMATCH); |
|
1308 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1309 #ifdef SUPPORT_UTF8 |
|
1310 if (utf8) |
|
1311 while (eptr < md->end_subject && ISMIDCHAR(*eptr)) eptr++; |
|
1312 #endif |
|
1313 ecode++; |
|
1314 break; |
|
1315 |
|
1316 /* Match a single byte, even in UTF-8 mode. This opcode really does match |
|
1317 any byte, even newline, independent of the setting of PCRE_DOTALL. */ |
|
1318 |
|
1319 case OP_ANYBYTE: |
|
1320 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1321 ecode++; |
|
1322 break; |
|
1323 |
|
1324 case OP_NOT_DIGIT: |
|
1325 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1326 GETCHARINCTEST(c, eptr); |
|
1327 if ( |
|
1328 #ifdef SUPPORT_UTF8 |
|
1329 c < 256 && |
|
1330 #endif |
|
1331 (md->ctypes[c] & ctype_digit) != 0 |
|
1332 ) |
|
1333 RRETURN(MATCH_NOMATCH); |
|
1334 ecode++; |
|
1335 break; |
|
1336 |
|
1337 case OP_DIGIT: |
|
1338 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1339 GETCHARINCTEST(c, eptr); |
|
1340 if ( |
|
1341 #ifdef SUPPORT_UTF8 |
|
1342 c >= 256 || |
|
1343 #endif |
|
1344 (md->ctypes[c] & ctype_digit) == 0 |
|
1345 ) |
|
1346 RRETURN(MATCH_NOMATCH); |
|
1347 ecode++; |
|
1348 break; |
|
1349 |
|
1350 case OP_NOT_WHITESPACE: |
|
1351 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1352 GETCHARINCTEST(c, eptr); |
|
1353 if ( |
|
1354 #ifdef SUPPORT_UTF8 |
|
1355 c < 256 && |
|
1356 #endif |
|
1357 (md->ctypes[c] & ctype_space) != 0 |
|
1358 ) |
|
1359 RRETURN(MATCH_NOMATCH); |
|
1360 ecode++; |
|
1361 break; |
|
1362 |
|
1363 case OP_WHITESPACE: |
|
1364 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1365 GETCHARINCTEST(c, eptr); |
|
1366 if ( |
|
1367 #ifdef SUPPORT_UTF8 |
|
1368 c >= 256 || |
|
1369 #endif |
|
1370 (md->ctypes[c] & ctype_space) == 0 |
|
1371 ) |
|
1372 RRETURN(MATCH_NOMATCH); |
|
1373 ecode++; |
|
1374 break; |
|
1375 |
|
1376 case OP_NOT_WORDCHAR: |
|
1377 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1378 GETCHARINCTEST(c, eptr); |
|
1379 if ( |
|
1380 #ifdef SUPPORT_UTF8 |
|
1381 c < 256 && |
|
1382 #endif |
|
1383 (md->ctypes[c] & ctype_word) != 0 |
|
1384 ) |
|
1385 RRETURN(MATCH_NOMATCH); |
|
1386 ecode++; |
|
1387 break; |
|
1388 |
|
1389 case OP_WORDCHAR: |
|
1390 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1391 GETCHARINCTEST(c, eptr); |
|
1392 if ( |
|
1393 #ifdef SUPPORT_UTF8 |
|
1394 c >= 256 || |
|
1395 #endif |
|
1396 (md->ctypes[c] & ctype_word) == 0 |
|
1397 ) |
|
1398 RRETURN(MATCH_NOMATCH); |
|
1399 ecode++; |
|
1400 break; |
|
1401 |
|
1402 #ifdef SUPPORT_UCP |
|
1403 /* Check the next character by Unicode property. We will get here only |
|
1404 if the support is in the binary; otherwise a compile-time error occurs. */ |
|
1405 |
|
1406 case OP_PROP: |
|
1407 case OP_NOTPROP: |
|
1408 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1409 GETCHARINCTEST(c, eptr); |
|
1410 { |
|
1411 int chartype, rqdtype; |
|
1412 int othercase; |
|
1413 int category = _pcre_ucp_findchar(c, &chartype, &othercase); |
|
1414 |
|
1415 rqdtype = *(++ecode); |
|
1416 ecode++; |
|
1417 |
|
1418 if (rqdtype >= 128) |
|
1419 { |
|
1420 if ((rqdtype - 128 != category) == (op == OP_PROP)) |
|
1421 RRETURN(MATCH_NOMATCH); |
|
1422 } |
|
1423 else |
|
1424 { |
|
1425 if ((rqdtype != chartype) == (op == OP_PROP)) |
|
1426 RRETURN(MATCH_NOMATCH); |
|
1427 } |
|
1428 } |
|
1429 break; |
|
1430 |
|
1431 /* Match an extended Unicode sequence. We will get here only if the support |
|
1432 is in the binary; otherwise a compile-time error occurs. */ |
|
1433 |
|
1434 case OP_EXTUNI: |
|
1435 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1436 GETCHARINCTEST(c, eptr); |
|
1437 { |
|
1438 int chartype; |
|
1439 int othercase; |
|
1440 int category = _pcre_ucp_findchar(c, &chartype, &othercase); |
|
1441 if (category == ucp_M) RRETURN(MATCH_NOMATCH); |
|
1442 while (eptr < md->end_subject) |
|
1443 { |
|
1444 int len = 1; |
|
1445 if (!utf8) c = *eptr; else |
|
1446 { |
|
1447 GETCHARLEN(c, eptr, len); |
|
1448 } |
|
1449 category = _pcre_ucp_findchar(c, &chartype, &othercase); |
|
1450 if (category != ucp_M) break; |
|
1451 eptr += len; |
|
1452 } |
|
1453 } |
|
1454 ecode++; |
|
1455 break; |
|
1456 #endif |
|
1457 |
|
1458 |
|
1459 /* Match a back reference, possibly repeatedly. Look past the end of the |
|
1460 item to see if there is repeat information following. The code is similar |
|
1461 to that for character classes, but repeated for efficiency. Then obey |
|
1462 similar code to character type repeats - written out again for speed. |
|
1463 However, if the referenced string is the empty string, always treat |
|
1464 it as matched, any number of times (otherwise there could be infinite |
|
1465 loops). */ |
|
1466 |
|
1467 case OP_REF: |
|
1468 { |
|
1469 offset = GET2(ecode, 1) << 1; /* Doubled ref number */ |
|
1470 ecode += 3; /* Advance past item */ |
|
1471 |
|
1472 /* If the reference is unset: in a JAVASCRIPT build the length is zero, so the |
|
1473 reference matches the empty string; otherwise the length is set longer than |
|
1474 the amount of subject left, so that every attempt at a match fails. We can't |
|
1475 just fail here, because of the possibility of quantifiers with zero minima. */ |
|
1476 |
|
1477 length = (offset >= offset_top || md->offset_vector[offset] < 0)? |
|
1478 #if JAVASCRIPT |
|
1479 0 : /* in JavaScript these match the empty string */ |
|
1480 #else |
|
1481 INT_CAST(md->end_subject - eptr + 1) : |
|
1482 #endif |
|
1483 md->offset_vector[offset+1] - md->offset_vector[offset]; |
|
1484 |
|
1485 /* Set up for repetition, or handle the non-repeated case */ |
|
1486 |
|
1487 switch (*ecode) |
|
1488 { |
|
1489 case OP_CRSTAR: |
|
1490 case OP_CRMINSTAR: |
|
1491 case OP_CRPLUS: |
|
1492 case OP_CRMINPLUS: |
|
1493 case OP_CRQUERY: |
|
1494 case OP_CRMINQUERY: |
|
1495 c = *ecode++ - OP_CRSTAR; |
|
1496 minimize = (c & 1) != 0; |
|
1497 min = rep_min[c]; /* Pick up values from tables; */ |
|
1498 max = rep_max[c]; /* zero for max => infinity */ |
|
1499 if (max == 0) max = INT_MAX; |
|
1500 break; |
|
1501 |
|
1502 case OP_CRRANGE: |
|
1503 case OP_CRMINRANGE: |
|
1504 minimize = (*ecode == OP_CRMINRANGE); |
|
1505 min = GET2(ecode, 1); |
|
1506 max = GET2(ecode, 3); |
|
1507 if (max == 0) max = INT_MAX; |
|
1508 ecode += 5; |
|
1509 break; |
|
1510 |
|
1511 default: /* No repeat follows */ |
|
1512 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH); |
|
1513 eptr += length; |
|
1514 continue; /* With the main loop */ |
|
1515 } |
|
1516 |
|
1517 /* If the length of the reference is zero, just continue with the |
|
1518 main loop. */ |
|
1519 |
|
1520 if (length == 0) continue; |
|
1521 |
|
1522 /* First, ensure the minimum number of matches are present. We get back |
|
1523 the length of the reference string explicitly rather than passing the |
|
1524 address of eptr, so that eptr can be a register variable. */ |
|
1525 |
|
1526 for (i = 1; i <= min; i++) |
|
1527 { |
|
1528 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH); |
|
1529 eptr += length; |
|
1530 } |
|
1531 |
|
1532 /* If min = max, continue at the same level without recursion. |
|
1533 They are not both allowed to be zero. */ |
|
1534 |
|
1535 if (min == max) continue; |
|
1536 |
|
1537 /* If minimizing, keep trying and advancing the pointer */ |
|
1538 |
|
1539 if (minimize) |
|
1540 { |
|
1541 for (fi = min;; fi++) |
|
1542 { |
|
1543 RMATCH(20, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
1544 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1545 if (fi >= max || !match_ref(offset, eptr, length, md, ims)) |
|
1546 RRETURN(MATCH_NOMATCH); |
|
1547 eptr += length; |
|
1548 } |
|
1549 /* Control never gets here */ |
|
1550 } |
|
1551 |
|
1552 /* If maximizing, find the longest string and work backwards */ |
|
1553 |
|
1554 else |
|
1555 { |
|
1556 pp = eptr; |
|
1557 for (i = min; i < max; i++) |
|
1558 { |
|
1559 if (!match_ref(offset, eptr, length, md, ims)) break; |
|
1560 eptr += length; |
|
1561 } |
|
1562 while (eptr >= pp) |
|
1563 { |
|
1564 RMATCH(21, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
1565 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1566 eptr -= length; |
|
1567 } |
|
1568 RRETURN(MATCH_NOMATCH); |
|
1569 } |
|
1570 } |
|
1571 /* Control never gets here */ |
|
1572 |
|
1573 |
|
1574 |
|
1575 /* Match a bit-mapped character class, possibly repeatedly. This op code is |
|
1576 used when all the characters in the class have values in the range 0-255, |
|
1577 and either the matching is caseful, or the characters are in the range |
|
1578 0-127 when UTF-8 processing is enabled. The only difference between |
|
1579 OP_CLASS and OP_NCLASS occurs when a data character outside the range is |
|
1580 encountered. |
|
1581 |
|
1582 First, look past the end of the item to see if there is repeat information |
|
1583 following. Then obey similar code to character type repeats - written out |
|
1584 again for speed. */ |
|
1585 |
|
1586 case OP_NCLASS: |
|
1587 case OP_CLASS: |
|
1588 { |
|
1589 data = ecode + 1; /* Save for matching */ |
|
1590 ecode += 33; /* Advance past the item */ |
|
1591 |
|
1592 switch (*ecode) |
|
1593 { |
|
1594 case OP_CRSTAR: |
|
1595 case OP_CRMINSTAR: |
|
1596 case OP_CRPLUS: |
|
1597 case OP_CRMINPLUS: |
|
1598 case OP_CRQUERY: |
|
1599 case OP_CRMINQUERY: |
|
1600 c = *ecode++ - OP_CRSTAR; |
|
1601 minimize = (c & 1) != 0; |
|
1602 min = rep_min[c]; /* Pick up values from tables; */ |
|
1603 max = rep_max[c]; /* zero for max => infinity */ |
|
1604 if (max == 0) max = INT_MAX; |
|
1605 break; |
|
1606 |
|
1607 case OP_CRRANGE: |
|
1608 case OP_CRMINRANGE: |
|
1609 minimize = (*ecode == OP_CRMINRANGE); |
|
1610 min = GET2(ecode, 1); |
|
1611 max = GET2(ecode, 3); |
|
1612 if (max == 0) max = INT_MAX; |
|
1613 ecode += 5; |
|
1614 break; |
|
1615 |
|
1616 default: /* No repeat follows */ |
|
1617 min = max = 1; |
|
1618 break; |
|
1619 } |
|
1620 |
|
1621 /* First, ensure the minimum number of matches are present. */ |
|
1622 |
|
1623 #ifdef SUPPORT_UTF8 |
|
1624 /* UTF-8 mode */ |
|
1625 if (utf8) |
|
1626 { |
|
1627 for (i = 1; i <= min; i++) |
|
1628 { |
|
1629 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1630 GETCHARINC(c, eptr); |
|
1631 if (c > 255) |
|
1632 { |
|
1633 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); |
|
1634 } |
|
1635 else |
|
1636 { |
|
1637 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); |
|
1638 } |
|
1639 } |
|
1640 } |
|
1641 else |
|
1642 #endif |
|
1643 /* Not UTF-8 mode */ |
|
1644 { |
|
1645 for (i = 1; i <= min; i++) |
|
1646 { |
|
1647 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1648 c = *eptr++; |
|
1649 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); |
|
1650 } |
|
1651 } |
|
1652 |
|
1653 /* If max == min we can continue with the main loop without the |
|
1654 need to recurse. */ |
|
1655 |
|
1656 if (min == max) continue; |
|
1657 |
|
1658 /* If minimizing, keep testing the rest of the expression and advancing |
|
1659 the pointer while it matches the class. */ |
|
1660 |
|
1661 if (minimize) |
|
1662 { |
|
1663 #ifdef SUPPORT_UTF8 |
|
1664 /* UTF-8 mode */ |
|
1665 if (utf8) |
|
1666 { |
|
1667 for (fi = min;; fi++) |
|
1668 { |
|
1669 RMATCH(22, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
1670 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1671 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1672 GETCHARINC(c, eptr); |
|
1673 if (c > 255) |
|
1674 { |
|
1675 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); |
|
1676 } |
|
1677 else |
|
1678 { |
|
1679 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); |
|
1680 } |
|
1681 } |
|
1682 } |
|
1683 else |
|
1684 #endif |
|
1685 /* Not UTF-8 mode */ |
|
1686 { |
|
1687 for (fi = min;; fi++) |
|
1688 { |
|
1689 RMATCH(23, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
1690 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1691 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1692 c = *eptr++; |
|
1693 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); |
|
1694 } |
|
1695 } |
|
1696 /* Control never gets here */ |
|
1697 } |
|
1698 |
|
1699 /* If maximizing, find the longest possible run, then work backwards. */ |
|
1700 |
|
1701 else |
|
1702 { |
|
1703 pp = eptr; |
|
1704 |
|
1705 #ifdef SUPPORT_UTF8 |
|
1706 /* UTF-8 mode */ |
|
1707 if (utf8) |
|
1708 { |
|
1709 for (i = min; i < max; i++) |
|
1710 { |
|
1711 int len = 1; |
|
1712 if (eptr >= md->end_subject) break; |
|
1713 GETCHARLEN(c, eptr, len); |
|
1714 if (c > 255) |
|
1715 { |
|
1716 if (op == OP_CLASS) break; |
|
1717 } |
|
1718 else |
|
1719 { |
|
1720 if ((data[c/8] & (1 << (c&7))) == 0) break; |
|
1721 } |
|
1722 eptr += len; |
|
1723 } |
|
1724 for (;;) |
|
1725 { |
|
1726 RMATCH(24, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
1727 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1728 if (eptr-- == pp) break; /* Stop if tried at original pos */ |
|
1729 BACKCHAR(eptr); |
|
1730 } |
|
1731 } |
|
1732 else |
|
1733 #endif |
|
1734 /* Not UTF-8 mode */ |
|
1735 { |
|
1736 for (i = min; i < max; i++) |
|
1737 { |
|
1738 if (eptr >= md->end_subject) break; |
|
1739 c = *eptr; |
|
1740 if ((data[c/8] & (1 << (c&7))) == 0) break; |
|
1741 eptr++; |
|
1742 } |
|
1743 while (eptr >= pp) |
|
1744 { |
|
1745 RMATCH(25, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
1746 eptr--; |
|
1747 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1748 } |
|
1749 } |
|
1750 |
|
1751 RRETURN(MATCH_NOMATCH); |
|
1752 } |
|
1753 } |
|
1754 /* Control never gets here */ |
|
1755 |
|
1756 |
|
1757 /* Match an extended character class. This opcode is encountered only |
|
1758 in UTF-8 mode, because that's the only time it is compiled. */ |
|
1759 |
|
1760 #ifdef SUPPORT_UTF8 |
|
1761 case OP_XCLASS: |
|
1762 { |
|
1763 data = ecode + 1 + LINK_SIZE; /* Save for matching */ |
|
1764 ecode += GET(ecode, 1); /* Advance past the item */ |
|
1765 |
|
1766 switch (*ecode) |
|
1767 { |
|
1768 case OP_CRSTAR: |
|
1769 case OP_CRMINSTAR: |
|
1770 case OP_CRPLUS: |
|
1771 case OP_CRMINPLUS: |
|
1772 case OP_CRQUERY: |
|
1773 case OP_CRMINQUERY: |
|
1774 c = *ecode++ - OP_CRSTAR; |
|
1775 minimize = (c & 1) != 0; |
|
1776 min = rep_min[c]; /* Pick up values from tables; */ |
|
1777 max = rep_max[c]; /* zero for max => infinity */ |
|
1778 if (max == 0) max = INT_MAX; |
|
1779 break; |
|
1780 |
|
1781 case OP_CRRANGE: |
|
1782 case OP_CRMINRANGE: |
|
1783 minimize = (*ecode == OP_CRMINRANGE); |
|
1784 min = GET2(ecode, 1); |
|
1785 max = GET2(ecode, 3); |
|
1786 if (max == 0) max = INT_MAX; |
|
1787 ecode += 5; |
|
1788 break; |
|
1789 |
|
1790 default: /* No repeat follows */ |
|
1791 min = max = 1; |
|
1792 break; |
|
1793 } |
|
1794 |
|
1795 /* First, ensure the minimum number of matches are present. */ |
|
1796 |
|
1797 for (i = 1; i <= min; i++) |
|
1798 { |
|
1799 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1800 GETCHARINC(c, eptr); |
|
1801 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); |
|
1802 } |
|
1803 |
|
1804 /* If max == min we can continue with the main loop without the |
|
1805 need to recurse. */ |
|
1806 |
|
1807 if (min == max) continue; |
|
1808 |
|
1809 /* If minimizing, keep testing the rest of the expression and advancing |
|
1810 the pointer while it matches the class. */ |
|
1811 |
|
1812 if (minimize) |
|
1813 { |
|
1814 for (fi = min;; fi++) |
|
1815 { |
|
1816 RMATCH(26, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
1817 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1818 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1819 GETCHARINC(c, eptr); |
|
1820 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); |
|
1821 } |
|
1822 /* Control never gets here */ |
|
1823 } |
|
1824 |
|
1825 /* If maximizing, find the longest possible run, then work backwards. */ |
|
1826 |
|
1827 else |
|
1828 { |
|
1829 pp = eptr; |
|
1830 for (i = min; i < max; i++) |
|
1831 { |
|
1832 int len = 1; |
|
1833 if (eptr >= md->end_subject) break; |
|
1834 GETCHARLEN(c, eptr, len); |
|
1835 if (!_pcre_xclass(c, data)) break; |
|
1836 eptr += len; |
|
1837 } |
|
1838 for(;;) |
|
1839 { |
|
1840 RMATCH(27, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
1841 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1842 if (eptr-- == pp) break; /* Stop if tried at original pos */ |
|
1843 BACKCHAR(eptr) |
|
1844 } |
|
1845 RRETURN(MATCH_NOMATCH); |
|
1846 } |
|
1847 |
|
1848 /* Control never gets here */ |
|
1849 } |
|
1850 #endif /* End of XCLASS */ |
|
1851 |
|
1852 /* Match a single character, casefully */ |
|
1853 |
|
1854 case OP_CHAR: |
|
1855 #ifdef SUPPORT_UTF8 |
|
1856 if (utf8) |
|
1857 { |
|
1858 length = 1; |
|
1859 ecode++; |
|
1860 GETUTF8CHARLEN(fc, ecode, length); |
|
1861 #if PCRE_UTF16 |
|
1862 { |
|
1863 int dc; |
|
1864 ecode += length; |
|
1865 switch (md->end_subject - eptr) |
|
1866 { |
|
1867 case 0: |
|
1868 RRETURN(MATCH_NOMATCH); |
|
1869 case 1: |
|
1870 dc = *eptr++; |
|
1871 if (IS_LEADING_SURROGATE(dc)) |
|
1872 RRETURN(MATCH_NOMATCH); |
|
1873 break; |
|
1874 default: |
|
1875 GETCHARINC(dc, eptr); |
|
1876 } |
|
1877 if (fc != dc) RRETURN(MATCH_NOMATCH); |
|
1878 } |
|
1879 #else |
|
1880 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); |
|
1881 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH); |
|
1882 #endif |
|
1883 } |
|
1884 else |
|
1885 #endif |
|
1886 |
|
1887 /* Non-UTF-8 mode */ |
|
1888 { |
|
1889 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH); |
|
1890 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH); |
|
1891 ecode += 2; |
|
1892 } |
|
1893 break; |
|
1894 |
|
1895 /* Match a single character, caselessly */ |
|
1896 |
|
1897 case OP_CHARNC: |
|
1898 #ifdef SUPPORT_UTF8 |
|
1899 if (utf8) |
|
1900 { |
|
1901 length = 1; |
|
1902 ecode++; |
|
1903 GETUTF8CHARLEN(fc, ecode, length); |
|
1904 |
|
1905 #if PCRE_UTF16 |
|
1906 if (md->end_subject - eptr == 0) RRETURN(MATCH_NOMATCH); |
|
1907 #else |
|
1908 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); |
|
1909 #endif |
|
1910 |
|
1911 /* If the pattern character's value is < 128, we have only one byte, and |
|
1912 can use the fast lookup table. */ |
|
1913 |
|
1914 if (fc < 128) |
|
1915 { |
|
1916 #if PCRE_UTF16 |
|
1917 int dc; |
|
1918 ecode++; |
|
1919 dc = *eptr++; |
|
1920 if (dc >= 128 || md->lcc[fc] != md->lcc[dc]) RRETURN(MATCH_NOMATCH); |
|
1921 #else |
|
1922 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); |
|
1923 #endif |
|
1924 } |
|
1925 |
|
1926 /* Otherwise we must pick up the subject character */ |
|
1927 |
|
1928 else |
|
1929 { |
|
1930 int dc; |
|
1931 #if PCRE_UTF16 |
|
1932 if (md->end_subject - eptr == 1) { |
|
1933 dc = *eptr++; |
|
1934 if (IS_LEADING_SURROGATE(dc)) |
|
1935 RRETURN(MATCH_NOMATCH); |
|
1936 } else |
|
1937 #endif |
|
1938 GETCHARINC(dc, eptr); |
|
1939 ecode += length; |
|
1940 |
|
1941 /* If we have Unicode property support, we can use it to test the other |
|
1942 case of the character, if there is one. The result of _pcre_ucp_findchar() is |
|
1943 < 0 if the char isn't found, and othercase is returned as zero if there |
|
1944 isn't one. */ |
|
1945 |
|
1946 if (fc != dc) |
|
1947 { |
|
1948 #ifdef SUPPORT_UCP |
|
1949 int chartype; |
|
1950 int othercase; |
|
1951 if (_pcre_ucp_findchar(fc, &chartype, &othercase) != ucp_L || dc != othercase) |
|
1952 #endif |
|
1953 RRETURN(MATCH_NOMATCH); |
|
1954 } |
|
1955 } |
|
1956 } |
|
1957 else |
|
1958 #endif /* SUPPORT_UTF8 */ |
|
1959 |
|
1960 /* Non-UTF-8 mode */ |
|
1961 { |
|
1962 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH); |
|
1963 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); |
|
1964 ecode += 2; |
|
1965 } |
|
1966 break; |
|
1967 |
|
1968 /* Match a single character repeatedly; different opcodes share code. */ |
|
1969 |
|
1970 case OP_EXACT: |
|
1971 min = max = GET2(ecode, 1); |
|
1972 ecode += 3; |
|
1973 goto REPEATCHAR; |
|
1974 |
|
1975 case OP_UPTO: |
|
1976 case OP_MINUPTO: |
|
1977 min = 0; |
|
1978 max = GET2(ecode, 1); |
|
1979 minimize = *ecode == OP_MINUPTO; |
|
1980 ecode += 3; |
|
1981 goto REPEATCHAR; |
|
1982 |
|
1983 case OP_STAR: |
|
1984 case OP_MINSTAR: |
|
1985 case OP_PLUS: |
|
1986 case OP_MINPLUS: |
|
1987 case OP_QUERY: |
|
1988 case OP_MINQUERY: |
|
1989 c = *ecode++ - OP_STAR; |
|
1990 minimize = (c & 1) != 0; |
|
1991 min = rep_min[c]; /* Pick up values from tables; */ |
|
1992 max = rep_max[c]; /* zero for max => infinity */ |
|
1993 if (max == 0) max = INT_MAX; |
|
1994 |
|
1995 /* Common code for all repeated single-character matches. We can give |
|
1996 up quickly if there are fewer than the minimum number of characters left in |
|
1997 the subject. */ |
|
1998 |
|
1999 REPEATCHAR: |
|
2000 #ifdef SUPPORT_UTF8 |
|
2001 #if PCRE_UTF16 |
|
2002 |
|
2003 length = 1; |
|
2004 GETUTF8CHARLEN(fc, ecode, length); |
|
2005 { |
|
2006 if (min * (fc > 0xFFFF ? 2 : 1) > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); |
|
2007 ecode += length; |
|
2008 |
|
2009 if (fc <= 0xFFFF) |
|
2010 { |
|
2011 int othercase; |
|
2012 int chartype; |
|
2013 if ((ims & PCRE_CASELESS) == 0 || _pcre_ucp_findchar(fc, &chartype, &othercase) != ucp_L) |
|
2014 othercase = -1; /* Guaranteed to not match any character */ |
|
2015 |
|
2016 for (i = 1; i <= min; i++) |
|
2017 { |
|
2018 if (*eptr != fc && *eptr != othercase) RRETURN(MATCH_NOMATCH); |
|
2019 ++eptr; |
|
2020 } |
|
2021 |
|
2022 if (min == max) continue; |
|
2023 |
|
2024 if (minimize) |
|
2025 { |
|
2026 repeat_othercase = othercase; |
|
2027 for (fi = min;; fi++) |
|
2028 { |
|
2029 RMATCH(28, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2030 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2031 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2032 if (*eptr != fc && *eptr != repeat_othercase) RRETURN(MATCH_NOMATCH); |
|
2033 ++eptr; |
|
2034 } |
|
2035 /* Control never gets here */ |
|
2036 } |
|
2037 else |
|
2038 { |
|
2039 pp = eptr; |
|
2040 for (i = min; i < max; i++) |
|
2041 { |
|
2042 if (eptr >= md->end_subject) break; |
|
2043 if (*eptr != fc && *eptr != othercase) break; |
|
2044 ++eptr; |
|
2045 } |
|
2046 while (eptr >= pp) |
|
2047 { |
|
2048 RMATCH(29, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2049 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2050 --eptr; |
|
2051 } |
|
2052 RRETURN(MATCH_NOMATCH); |
|
2053 } |
|
2054 /* Control never gets here */ |
|
2055 } |
|
2056 else |
|
2057 { |
|
2058 /* No case on surrogate pairs, so no need to bother with "othercase". */ |
|
2059 |
|
2060 for (i = 1; i <= min; i++) |
|
2061 { |
|
2062 int nc; |
|
2063 GETCHAR(nc, eptr); |
|
2064 if (nc != fc) RRETURN(MATCH_NOMATCH); |
|
2065 eptr += 2; |
|
2066 } |
|
2067 |
|
2068 if (min == max) continue; |
|
2069 |
|
2070 if (minimize) |
|
2071 { |
|
2072 for (fi = min;; fi++) |
|
2073 { |
|
2074 int nc; |
|
2075 RMATCH(30, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2076 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2077 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2078 GETCHAR(nc, eptr); |
|
2079 if (*eptr != fc) RRETURN(MATCH_NOMATCH); |
|
2080 eptr += 2; |
|
2081 } |
|
2082 /* Control never gets here */ |
|
2083 } |
|
2084 else |
|
2085 { |
|
2086 pp = eptr; |
|
2087 for (i = min; i < max; i++) |
|
2088 { |
|
2089 int nc; |
|
2090 if (eptr > md->end_subject - 2) break; |
|
2091 GETCHAR(nc, eptr); |
|
2092 if (*eptr != fc) break; |
|
2093 eptr += 2; |
|
2094 } |
|
2095 while (eptr >= pp) |
|
2096 { |
|
2097 RMATCH(31, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2098 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2099 eptr -= 2; |
|
2100 } |
|
2101 RRETURN(MATCH_NOMATCH); |
|
2102 } |
|
2103 /* Control never gets here */ |
|
2104 } |
|
2105 /* Control never gets here */ |
|
2106 } |
|
2107 #else |
|
2108 if (utf8) |
|
2109 { |
|
2110 length = 1; |
|
2111 charptr = ecode; |
|
2112 GETCHARLEN(fc, ecode, length); |
|
2113 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); |
|
2114 ecode += length; |
|
2115 |
|
2116 /* Handle multibyte character matching specially here. There is |
|
2117 support for caseless matching if UCP support is present. */ |
|
2118 |
|
2119 if (length > 1) |
|
2120 { |
|
2121 int oclength = 0; |
|
2122 uschar occhars[8]; |
|
2123 |
|
2124 #ifdef SUPPORT_UCP |
|
2125 int othercase; |
|
2126 int chartype; |
|
2127 if ((ims & PCRE_CASELESS) != 0 && |
|
2128 _pcre_ucp_findchar(fc, &chartype, &othercase) == ucp_L && |
|
2129 othercase > 0) |
|
2130 oclength = _pcre_ord2utf8(othercase, occhars); |
|
2131 #endif /* SUPPORT_UCP */ |
|
2132 |
|
2133 for (i = 1; i <= min; i++) |
|
2134 { |
|
2135 if (memcmp(eptr, charptr, length) == 0) eptr += length; |
|
2136 /* Need braces because of following else */ |
|
2137 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); } |
|
2138 else |
|
2139 { |
|
2140 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH); |
|
2141 eptr += oclength; |
|
2142 } |
|
2143 } |
|
2144 |
|
2145 if (min == max) continue; |
|
2146 |
|
2147 if (minimize) |
|
2148 { |
|
2149 for (fi = min;; fi++) |
|
2150 { |
|
2151 // FIXME: This could blow away occhars and oclength in the NO_RECURSE case. |
|
2152 RMATCH(32, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2153 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2154 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2155 if (memcmp(eptr, charptr, length) == 0) eptr += length; |
|
2156 /* Need braces because of following else */ |
|
2157 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); } |
|
2158 else |
|
2159 { |
|
2160 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH); |
|
2161 eptr += oclength; |
|
2162 } |
|
2163 } |
|
2164 /* Control never gets here */ |
|
2165 } |
|
2166 else |
|
2167 { |
|
2168 pp = eptr; |
|
2169 for (i = min; i < max; i++) |
|
2170 { |
|
2171 if (eptr > md->end_subject - length) break; |
|
2172 if (memcmp(eptr, charptr, length) == 0) eptr += length; |
|
2173 else if (oclength == 0) break; |
|
2174 else |
|
2175 { |
|
2176 if (memcmp(eptr, occhars, oclength) != 0) break; |
|
2177 eptr += oclength; |
|
2178 } |
|
2179 } |
|
2180 while (eptr >= pp) |
|
2181 { |
|
2182 RMATCH(33, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2183 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2184 eptr -= length; |
|
2185 } |
|
2186 RRETURN(MATCH_NOMATCH); |
|
2187 } |
|
2188 /* Control never gets here */ |
|
2189 } |
|
2190 |
|
2191 /* If the length of a UTF-8 character is 1, we fall through here, and |
|
2192 obey the code as for non-UTF-8 characters below, though in this case the |
|
2193 value of fc will always be < 128. */ |
|
2194 } |
|
2195 else |
|
2196 #endif |
|
2197 #endif /* SUPPORT_UTF8 */ |
|
2198 |
|
2199 #if !PCRE_UTF16 |
|
2200 /* When not in UTF-8 mode, load a single-byte character. */ |
|
2201 { |
|
2202 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); |
|
2203 fc = *ecode++; |
|
2204 } |
|
2205 |
|
2206 /* The value of fc at this point is always less than 256, though we may or |
|
2207 may not be in UTF-8 mode. The code is duplicated for the caseless and |
|
2208 caseful cases, for speed, since matching characters is likely to be quite |
|
2209 common. First, ensure the minimum number of matches are present. If min = |
|
2210 max, continue at the same level without recursing. Otherwise, if |
|
2211 minimizing, keep trying the rest of the expression and advancing one |
|
2212 matching character if failing, up to the maximum. Alternatively, if |
|
2213 maximizing, find the maximum number of characters and work backwards. */ |
|
2214 |
|
2215 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max, |
|
2216 max, eptr)); |
|
2217 |
|
2218 if ((ims & PCRE_CASELESS) != 0) |
|
2219 { |
|
2220 fc = md->lcc[fc]; |
|
2221 for (i = 1; i <= min; i++) |
|
2222 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); |
|
2223 if (min == max) continue; |
|
2224 if (minimize) |
|
2225 { |
|
2226 for (fi = min;; fi++) |
|
2227 { |
|
2228 RMATCH(34, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2229 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2230 if (fi >= max || eptr >= md->end_subject || |
|
2231 fc != md->lcc[*eptr++]) |
|
2232 RRETURN(MATCH_NOMATCH); |
|
2233 } |
|
2234 /* Control never gets here */ |
|
2235 } |
|
2236 else |
|
2237 { |
|
2238 pp = eptr; |
|
2239 for (i = min; i < max; i++) |
|
2240 { |
|
2241 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break; |
|
2242 eptr++; |
|
2243 } |
|
2244 while (eptr >= pp) |
|
2245 { |
|
2246 RMATCH(35, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2247 eptr--; |
|
2248 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2249 } |
|
2250 RRETURN(MATCH_NOMATCH); |
|
2251 } |
|
2252 /* Control never gets here */ |
|
2253 } |
|
2254 |
|
2255 /* Caseful comparisons (includes all multi-byte characters) */ |
|
2256 |
|
2257 else |
|
2258 { |
|
2259 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH); |
|
2260 if (min == max) continue; |
|
2261 if (minimize) |
|
2262 { |
|
2263 for (fi = min;; fi++) |
|
2264 { |
|
2265 RMATCH(36, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2266 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2267 if (fi >= max || eptr >= md->end_subject || fc != *eptr++) |
|
2268 RRETURN(MATCH_NOMATCH); |
|
2269 } |
|
2270 /* Control never gets here */ |
|
2271 } |
|
2272 else |
|
2273 { |
|
2274 pp = eptr; |
|
2275 for (i = min; i < max; i++) |
|
2276 { |
|
2277 if (eptr >= md->end_subject || fc != *eptr) break; |
|
2278 eptr++; |
|
2279 } |
|
2280 while (eptr >= pp) |
|
2281 { |
|
2282 RMATCH(37, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2283 eptr--; |
|
2284 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2285 } |
|
2286 RRETURN(MATCH_NOMATCH); |
|
2287 } |
|
2288 } |
|
2289 /* Control never gets here */ |
|
2290 #endif |
|
2291 |
|
2292 /* Match a negated single one-byte character. The character we are |
|
2293 checking can be multibyte. */ |
|
2294 |
|
2295 case OP_NOT: |
|
2296 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2297 ecode++; |
|
2298 GETCHARINCTEST(c, eptr); |
|
2299 if ((ims & PCRE_CASELESS) != 0) |
|
2300 { |
|
2301 #ifdef SUPPORT_UTF8 |
|
2302 if (c < 256) |
|
2303 #endif |
|
2304 c = md->lcc[c]; |
|
2305 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH); |
|
2306 } |
|
2307 else |
|
2308 { |
|
2309 if (*ecode++ == c) RRETURN(MATCH_NOMATCH); |
|
2310 } |
|
2311 break; |
|
2312 |
|
2313 /* Match a negated single one-byte character repeatedly. This is almost a |
|
2314 repeat of the code for a repeated single character, but I haven't found a |
|
2315 nice way of commoning these up that doesn't require a test of the |
|
2316 positive/negative option for each character match. Maybe that wouldn't add |
|
2317 very much to the time taken, but character matching *is* what this is all |
|
2318 about... */ |
|
2319 |
|
2320 case OP_NOTEXACT: |
|
2321 min = max = GET2(ecode, 1); |
|
2322 ecode += 3; |
|
2323 goto REPEATNOTCHAR; |
|
2324 |
|
2325 case OP_NOTUPTO: |
|
2326 case OP_NOTMINUPTO: |
|
2327 min = 0; |
|
2328 max = GET2(ecode, 1); |
|
2329 minimize = *ecode == OP_NOTMINUPTO; |
|
2330 ecode += 3; |
|
2331 goto REPEATNOTCHAR; |
|
2332 |
|
2333 case OP_NOTSTAR: |
|
2334 case OP_NOTMINSTAR: |
|
2335 case OP_NOTPLUS: |
|
2336 case OP_NOTMINPLUS: |
|
2337 case OP_NOTQUERY: |
|
2338 case OP_NOTMINQUERY: |
|
2339 c = *ecode++ - OP_NOTSTAR; |
|
2340 minimize = (c & 1) != 0; |
|
2341 min = rep_min[c]; /* Pick up values from tables; */ |
|
2342 max = rep_max[c]; /* zero for max => infinity */ |
|
2343 if (max == 0) max = INT_MAX; |
|
2344 |
|
2345 /* Common code for all repeated single-byte matches. We can give up quickly |
|
2346 if there are fewer than the minimum number of bytes left in the |
|
2347 subject. */ |
|
2348 |
|
2349 REPEATNOTCHAR: |
|
2350 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); |
|
2351 fc = *ecode++; |
|
2352 |
|
2353 /* The code is duplicated for the caseless and caseful cases, for speed, |
|
2354 since matching characters is likely to be quite common. First, ensure the |
|
2355 minimum number of matches are present. If min = max, continue at the same |
|
2356 level without recursing. Otherwise, if minimizing, keep trying the rest of |
|
2357 the expression and advancing one matching character if failing, up to the |
|
2358 maximum. Alternatively, if maximizing, find the maximum number of |
|
2359 characters and work backwards. */ |
|
2360 |
|
2361 #if PCRE_UTF16 |
|
2362 DPRINTF(("negative matching %c{%d,%d}\n", fc, min, max)); |
|
2363 #else |
|
2364 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max, |
|
2365 max, eptr)); |
|
2366 #endif |
|
2367 |
|
2368 if ((ims & PCRE_CASELESS) != 0) |
|
2369 { |
|
2370 fc = md->lcc[fc]; |
|
2371 |
|
2372 #ifdef SUPPORT_UTF8 |
|
2373 /* UTF-8 mode */ |
|
2374 if (utf8) |
|
2375 { |
|
2376 register int d; |
|
2377 for (i = 1; i <= min; i++) |
|
2378 { |
|
2379 GETCHARINC(d, eptr); |
|
2380 if (d < 256) d = md->lcc[d]; |
|
2381 if (fc == d) RRETURN(MATCH_NOMATCH); |
|
2382 } |
|
2383 } |
|
2384 else |
|
2385 #endif |
|
2386 |
|
2387 /* Not UTF-8 mode */ |
|
2388 { |
|
2389 for (i = 1; i <= min; i++) |
|
2390 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); |
|
2391 } |
|
2392 |
|
2393 if (min == max) continue; |
|
2394 |
|
2395 if (minimize) |
|
2396 { |
|
2397 #ifdef SUPPORT_UTF8 |
|
2398 /* UTF-8 mode */ |
|
2399 if (utf8) |
|
2400 { |
|
2401 register int d; |
|
2402 for (fi = min;; fi++) |
|
2403 { |
|
2404 RMATCH(38, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2405 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2406 GETCHARINC(d, eptr); |
|
2407 if (d < 256) d = md->lcc[d]; |
|
2408 if (fi >= max || eptr >= md->end_subject || fc == d) |
|
2409 RRETURN(MATCH_NOMATCH); |
|
2410 } |
|
2411 } |
|
2412 else |
|
2413 #endif |
|
2414 /* Not UTF-8 mode */ |
|
2415 { |
|
2416 for (fi = min;; fi++) |
|
2417 { |
|
2418 RMATCH(39, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2419 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2420 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++]) |
|
2421 RRETURN(MATCH_NOMATCH); |
|
2422 } |
|
2423 } |
|
2424 /* Control never gets here */ |
|
2425 } |
|
2426 |
|
2427 /* Maximize case */ |
|
2428 |
|
2429 else |
|
2430 { |
|
2431 pp = eptr; |
|
2432 |
|
2433 #ifdef SUPPORT_UTF8 |
|
2434 /* UTF-8 mode */ |
|
2435 if (utf8) |
|
2436 { |
|
2437 register int d; |
|
2438 for (i = min; i < max; i++) |
|
2439 { |
|
2440 int len = 1; |
|
2441 if (eptr >= md->end_subject) break; |
|
2442 GETCHARLEN(d, eptr, len); |
|
2443 if (d < 256) d = md->lcc[d]; |
|
2444 if (fc == d) break; |
|
2445 eptr += len; |
|
2446 } |
|
2447 for(;;) |
|
2448 { |
|
2449 RMATCH(40, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2450 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2451 if (eptr-- == pp) break; /* Stop if tried at original pos */ |
|
2452 BACKCHAR(eptr); |
|
2453 } |
|
2454 } |
|
2455 else |
|
2456 #endif |
|
2457 /* Not UTF-8 mode */ |
|
2458 { |
|
2459 for (i = min; i < max; i++) |
|
2460 { |
|
2461 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break; |
|
2462 eptr++; |
|
2463 } |
|
2464 while (eptr >= pp) |
|
2465 { |
|
2466 RMATCH(41, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2467 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2468 eptr--; |
|
2469 } |
|
2470 } |
|
2471 |
|
2472 RRETURN(MATCH_NOMATCH); |
|
2473 } |
|
2474 /* Control never gets here */ |
|
2475 } |
|
2476 |
|
2477 /* Caseful comparisons */ |
|
2478 |
|
2479 else |
|
2480 { |
|
2481 #ifdef SUPPORT_UTF8 |
|
2482 /* UTF-8 mode */ |
|
2483 if (utf8) |
|
2484 { |
|
2485 register int d; |
|
2486 for (i = 1; i <= min; i++) |
|
2487 { |
|
2488 GETCHARINC(d, eptr); |
|
2489 if (fc == d) RRETURN(MATCH_NOMATCH); |
|
2490 } |
|
2491 } |
|
2492 else |
|
2493 #endif |
|
2494 /* Not UTF-8 mode */ |
|
2495 { |
|
2496 for (i = 1; i <= min; i++) |
|
2497 if (fc == *eptr++) RRETURN(MATCH_NOMATCH); |
|
2498 } |
|
2499 |
|
2500 if (min == max) continue; |
|
2501 |
|
2502 if (minimize) |
|
2503 { |
|
2504 #ifdef SUPPORT_UTF8 |
|
2505 /* UTF-8 mode */ |
|
2506 if (utf8) |
|
2507 { |
|
2508 register int d; |
|
2509 for (fi = min;; fi++) |
|
2510 { |
|
2511 RMATCH(42, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2512 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2513 GETCHARINC(d, eptr); |
|
2514 if (fi >= max || eptr >= md->end_subject || fc == d) |
|
2515 RRETURN(MATCH_NOMATCH); |
|
2516 } |
|
2517 } |
|
2518 else |
|
2519 #endif |
|
2520 /* Not UTF-8 mode */ |
|
2521 { |
|
2522 for (fi = min;; fi++) |
|
2523 { |
|
2524 RMATCH(43, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2525 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2526 if (fi >= max || eptr >= md->end_subject || fc == *eptr++) |
|
2527 RRETURN(MATCH_NOMATCH); |
|
2528 } |
|
2529 } |
|
2530 /* Control never gets here */ |
|
2531 } |
|
2532 |
|
2533 /* Maximize case */ |
|
2534 |
|
2535 else |
|
2536 { |
|
2537 pp = eptr; |
|
2538 |
|
2539 #ifdef SUPPORT_UTF8 |
|
2540 /* UTF-8 mode */ |
|
2541 if (utf8) |
|
2542 { |
|
2543 register int d; |
|
2544 for (i = min; i < max; i++) |
|
2545 { |
|
2546 int len = 1; |
|
2547 if (eptr >= md->end_subject) break; |
|
2548 GETCHARLEN(d, eptr, len); |
|
2549 if (fc == d) break; |
|
2550 eptr += len; |
|
2551 } |
|
2552 for(;;) |
|
2553 { |
|
2554 RMATCH(44, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2555 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2556 if (eptr-- == pp) break; /* Stop if tried at original pos */ |
|
2557 BACKCHAR(eptr); |
|
2558 } |
|
2559 } |
|
2560 else |
|
2561 #endif |
|
2562 /* Not UTF-8 mode */ |
|
2563 { |
|
2564 for (i = min; i < max; i++) |
|
2565 { |
|
2566 if (eptr >= md->end_subject || fc == *eptr) break; |
|
2567 eptr++; |
|
2568 } |
|
2569 while (eptr >= pp) |
|
2570 { |
|
2571 RMATCH(45, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2572 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2573 eptr--; |
|
2574 } |
|
2575 } |
|
2576 |
|
2577 RRETURN(MATCH_NOMATCH); |
|
2578 } |
|
2579 } |
|
2580 /* Control never gets here */ |
|
2581 |
|
2582 /* Match a single character type repeatedly; several different opcodes |
|
2583 share code. This is very similar to the code for single characters, but we |
|
2584 repeat it in the interests of efficiency. */ |
|
2585 |
|
2586 case OP_TYPEEXACT: |
|
2587 min = max = GET2(ecode, 1); |
|
2588 minimize = TRUE; |
|
2589 ecode += 3; |
|
2590 goto REPEATTYPE; |
|
2591 |
|
2592 case OP_TYPEUPTO: |
|
2593 case OP_TYPEMINUPTO: |
|
2594 min = 0; |
|
2595 max = GET2(ecode, 1); |
|
2596 minimize = *ecode == OP_TYPEMINUPTO; |
|
2597 ecode += 3; |
|
2598 goto REPEATTYPE; |
|
2599 |
|
2600 case OP_TYPESTAR: |
|
2601 case OP_TYPEMINSTAR: |
|
2602 case OP_TYPEPLUS: |
|
2603 case OP_TYPEMINPLUS: |
|
2604 case OP_TYPEQUERY: |
|
2605 case OP_TYPEMINQUERY: |
|
2606 c = *ecode++ - OP_TYPESTAR; |
|
2607 minimize = (c & 1) != 0; |
|
2608 min = rep_min[c]; /* Pick up values from tables; */ |
|
2609 max = rep_max[c]; /* zero for max => infinity */ |
|
2610 if (max == 0) max = INT_MAX; |
|
2611 |
|
2612 /* Common code for all repeated single character type matches. Note that |
|
2613 in UTF-8 mode, '.' matches a character of any length, but for the other |
|
2614 character types, the valid characters are all one-byte long. */ |
|
2615 |
|
2616 REPEATTYPE: |
|
2617 ctype = *ecode++; /* Code for the character type */ |
|
2618 |
|
2619 #ifdef SUPPORT_UCP |
|
2620 if (ctype == OP_PROP || ctype == OP_NOTPROP) |
|
2621 { |
|
2622 prop_fail_result = ctype == OP_NOTPROP; |
|
2623 prop_type = *ecode++; |
|
2624 if (prop_type >= 128) |
|
2625 { |
|
2626 prop_test_against = prop_type - 128; |
|
2627 prop_test_variable = &prop_category; |
|
2628 } |
|
2629 else |
|
2630 { |
|
2631 prop_test_against = prop_type; |
|
2632 prop_test_variable = &prop_chartype; |
|
2633 } |
|
2634 } |
|
2635 else prop_type = -1; |
|
2636 #endif |
|
2637 |
|
2638 /* First, ensure the minimum number of matches are present. Use inline |
|
2639 code for maximizing the speed, and do the type test once at the start |
|
2640 (i.e. keep it out of the loop). Also we can test that there are at least |
|
2641 the minimum number of bytes before we start. This isn't as effective in |
|
2642 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that |
|
2643 is tidier. Also separate the UCP code, which can be the same for both UTF-8 |
|
2644 and single-bytes. */ |
|
2645 |
|
2646 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); |
|
2647 if (min > 0) |
|
2648 { |
|
2649 #ifdef SUPPORT_UCP |
|
2650 if (prop_type > 0) |
|
2651 { |
|
2652 for (i = 1; i <= min; i++) |
|
2653 { |
|
2654 GETCHARINC(c, eptr); |
|
2655 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); |
|
2656 if ((*prop_test_variable == prop_test_against) == prop_fail_result) |
|
2657 RRETURN(MATCH_NOMATCH); |
|
2658 } |
|
2659 } |
|
2660 |
|
2661 /* Match extended Unicode sequences. We will get here only if the |
|
2662 support is in the binary; otherwise a compile-time error occurs. */ |
|
2663 |
|
2664 else if (ctype == OP_EXTUNI) |
|
2665 { |
|
2666 for (i = 1; i <= min; i++) |
|
2667 { |
|
2668 GETCHARINCTEST(c, eptr); |
|
2669 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); |
|
2670 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); |
|
2671 while (eptr < md->end_subject) |
|
2672 { |
|
2673 int len = 1; |
|
2674 if (!utf8) c = *eptr; else |
|
2675 { |
|
2676 GETCHARLEN(c, eptr, len); |
|
2677 } |
|
2678 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); |
|
2679 if (prop_category != ucp_M) break; |
|
2680 eptr += len; |
|
2681 } |
|
2682 } |
|
2683 } |
|
2684 |
|
2685 else |
|
2686 #endif /* SUPPORT_UCP */ |
|
2687 |
|
2688 /* Handle all other cases when the coding is UTF-8 */ |
|
2689 |
|
2690 #ifdef SUPPORT_UTF8 |
|
2691 if (utf8) switch(ctype) |
|
2692 { |
|
2693 case OP_ANY: |
|
2694 for (i = 1; i <= min; i++) |
|
2695 { |
|
2696 if (eptr >= md->end_subject || |
|
2697 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0)) |
|
2698 RRETURN(MATCH_NOMATCH); |
|
2699 while (eptr < md->end_subject && ISMIDCHAR(*eptr)) eptr++; |
|
2700 } |
|
2701 break; |
|
2702 |
|
2703 case OP_ANYBYTE: |
|
2704 eptr += min; |
|
2705 break; |
|
2706 |
|
2707 case OP_NOT_DIGIT: |
|
2708 for (i = 1; i <= min; i++) |
|
2709 { |
|
2710 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2711 GETCHARINC(c, eptr); |
|
2712 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0) |
|
2713 RRETURN(MATCH_NOMATCH); |
|
2714 } |
|
2715 break; |
|
2716 |
|
2717 case OP_DIGIT: |
|
2718 for (i = 1; i <= min; i++) |
|
2719 { |
|
2720 if (eptr >= md->end_subject || |
|
2721 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0) |
|
2722 RRETURN(MATCH_NOMATCH); |
|
2723 /* No need to skip more bytes - we know it's a 1-byte character */ |
|
2724 } |
|
2725 break; |
|
2726 |
|
2727 case OP_NOT_WHITESPACE: |
|
2728 for (i = 1; i <= min; i++) |
|
2729 { |
|
2730 if (eptr >= md->end_subject || |
|
2731 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0)) |
|
2732 RRETURN(MATCH_NOMATCH); |
|
2733 while (eptr < md->end_subject && ISMIDCHAR(*eptr)) eptr++; |
|
2734 } |
|
2735 break; |
|
2736 |
|
2737 case OP_WHITESPACE: |
|
2738 for (i = 1; i <= min; i++) |
|
2739 { |
|
2740 if (eptr >= md->end_subject || |
|
2741 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0) |
|
2742 RRETURN(MATCH_NOMATCH); |
|
2743 /* No need to skip more bytes - we know it's a 1-byte character */ |
|
2744 } |
|
2745 break; |
|
2746 |
|
2747 case OP_NOT_WORDCHAR: |
|
2748 for (i = 1; i <= min; i++) |
|
2749 { |
|
2750 if (eptr >= md->end_subject || |
|
2751 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0)) |
|
2752 RRETURN(MATCH_NOMATCH); |
|
2753 while (eptr < md->end_subject && ISMIDCHAR(*eptr)) eptr++; |
|
2754 } |
|
2755 break; |
|
2756 |
|
2757 case OP_WORDCHAR: |
|
2758 for (i = 1; i <= min; i++) |
|
2759 { |
|
2760 if (eptr >= md->end_subject || |
|
2761 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0) |
|
2762 RRETURN(MATCH_NOMATCH); |
|
2763 /* No need to skip more bytes - we know it's a 1-byte character */ |
|
2764 } |
|
2765 break; |
|
2766 |
|
2767 default: |
|
2768 RRETURN(PCRE_ERROR_INTERNAL); |
|
2769 } /* End switch(ctype) */ |
|
2770 |
|
2771 else |
|
2772 #endif /* SUPPORT_UTF8 */ |
|
2773 |
|
2774 /* Code for the non-UTF-8 case for minimum matching of operators other |
|
2775 than OP_PROP and OP_NOTPROP. */ |
|
2776 |
|
2777 switch(ctype) |
|
2778 { |
|
2779 case OP_ANY: |
|
2780 if ((ims & PCRE_DOTALL) == 0) |
|
2781 { |
|
2782 for (i = 1; i <= min; i++) |
|
2783 if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH); |
|
2784 } |
|
2785 else eptr += min; |
|
2786 break; |
|
2787 |
|
2788 case OP_ANYBYTE: |
|
2789 eptr += min; |
|
2790 break; |
|
2791 |
|
2792 case OP_NOT_DIGIT: |
|
2793 for (i = 1; i <= min; i++) |
|
2794 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); |
|
2795 break; |
|
2796 |
|
2797 case OP_DIGIT: |
|
2798 for (i = 1; i <= min; i++) |
|
2799 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); |
|
2800 break; |
|
2801 |
|
2802 case OP_NOT_WHITESPACE: |
|
2803 for (i = 1; i <= min; i++) |
|
2804 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); |
|
2805 break; |
|
2806 |
|
2807 case OP_WHITESPACE: |
|
2808 for (i = 1; i <= min; i++) |
|
2809 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); |
|
2810 break; |
|
2811 |
|
2812 case OP_NOT_WORDCHAR: |
|
2813 for (i = 1; i <= min; i++) |
|
2814 if ((md->ctypes[*eptr++] & ctype_word) != 0) |
|
2815 RRETURN(MATCH_NOMATCH); |
|
2816 break; |
|
2817 |
|
2818 case OP_WORDCHAR: |
|
2819 for (i = 1; i <= min; i++) |
|
2820 if ((md->ctypes[*eptr++] & ctype_word) == 0) |
|
2821 RRETURN(MATCH_NOMATCH); |
|
2822 break; |
|
2823 |
|
2824 default: |
|
2825 RRETURN(PCRE_ERROR_INTERNAL); |
|
2826 } |
|
2827 } |
|
2828 |
|
2829 /* If min = max, continue at the same level without recursing */ |
|
2830 |
|
2831 if (min == max) continue; |
|
2832 |
|
2833 /* If minimizing, we have to test the rest of the pattern before each |
|
2834 subsequent match. Again, separate the UTF-8 case for speed, and also |
|
2835 separate the UCP cases. */ |
|
2836 |
|
2837 if (minimize) |
|
2838 { |
|
2839 #ifdef SUPPORT_UCP |
|
2840 if (prop_type > 0) |
|
2841 { |
|
2842 for (fi = min;; fi++) |
|
2843 { |
|
2844 RMATCH(46, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2845 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2846 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2847 GETCHARINC(c, eptr); |
|
2848 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); |
|
2849 if ((*prop_test_variable == prop_test_against) == prop_fail_result) |
|
2850 RRETURN(MATCH_NOMATCH); |
|
2851 } |
|
2852 } |
|
2853 |
|
2854 /* Match extended Unicode sequences. We will get here only if the |
|
2855 support is in the binary; otherwise a compile-time error occurs. */ |
|
2856 |
|
2857 else if (ctype == OP_EXTUNI) |
|
2858 { |
|
2859 for (fi = min;; fi++) |
|
2860 { |
|
2861 RMATCH(47, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2862 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2863 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2864 GETCHARINCTEST(c, eptr); |
|
2865 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); |
|
2866 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); |
|
2867 while (eptr < md->end_subject) |
|
2868 { |
|
2869 int len = 1; |
|
2870 if (!utf8) c = *eptr; else |
|
2871 { |
|
2872 GETCHARLEN(c, eptr, len); |
|
2873 } |
|
2874 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); |
|
2875 if (prop_category != ucp_M) break; |
|
2876 eptr += len; |
|
2877 } |
|
2878 } |
|
2879 } |
|
2880 |
|
2881 else |
|
2882 #endif /* SUPPORT_UCP */ |
|
2883 |
|
2884 #ifdef SUPPORT_UTF8 |
|
2885 /* UTF-8 mode */ |
|
2886 if (utf8) |
|
2887 { |
|
2888 for (fi = min;; fi++) |
|
2889 { |
|
2890 RMATCH(48, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2891 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2892 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2893 |
|
2894 GETCHARINC(c, eptr); |
|
2895 switch(ctype) |
|
2896 { |
|
2897 case OP_ANY: |
|
2898 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH); |
|
2899 break; |
|
2900 |
|
2901 case OP_ANYBYTE: |
|
2902 break; |
|
2903 |
|
2904 case OP_NOT_DIGIT: |
|
2905 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) |
|
2906 RRETURN(MATCH_NOMATCH); |
|
2907 break; |
|
2908 |
|
2909 case OP_DIGIT: |
|
2910 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0) |
|
2911 RRETURN(MATCH_NOMATCH); |
|
2912 break; |
|
2913 |
|
2914 case OP_NOT_WHITESPACE: |
|
2915 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) |
|
2916 RRETURN(MATCH_NOMATCH); |
|
2917 break; |
|
2918 |
|
2919 case OP_WHITESPACE: |
|
2920 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0) |
|
2921 RRETURN(MATCH_NOMATCH); |
|
2922 break; |
|
2923 |
|
2924 case OP_NOT_WORDCHAR: |
|
2925 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) |
|
2926 RRETURN(MATCH_NOMATCH); |
|
2927 break; |
|
2928 |
|
2929 case OP_WORDCHAR: |
|
2930 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) |
|
2931 RRETURN(MATCH_NOMATCH); |
|
2932 break; |
|
2933 |
|
2934 default: |
|
2935 RRETURN(PCRE_ERROR_INTERNAL); |
|
2936 } |
|
2937 } |
|
2938 } |
|
2939 else |
|
2940 #endif |
|
2941 /* Not UTF-8 mode */ |
|
2942 { |
|
2943 for (fi = min;; fi++) |
|
2944 { |
|
2945 RMATCH(49, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
2946 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2947 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2948 c = *eptr++; |
|
2949 switch(ctype) |
|
2950 { |
|
2951 case OP_ANY: |
|
2952 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH); |
|
2953 break; |
|
2954 |
|
2955 case OP_ANYBYTE: |
|
2956 break; |
|
2957 |
|
2958 case OP_NOT_DIGIT: |
|
2959 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); |
|
2960 break; |
|
2961 |
|
2962 case OP_DIGIT: |
|
2963 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); |
|
2964 break; |
|
2965 |
|
2966 case OP_NOT_WHITESPACE: |
|
2967 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); |
|
2968 break; |
|
2969 |
|
2970 case OP_WHITESPACE: |
|
2971 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); |
|
2972 break; |
|
2973 |
|
2974 case OP_NOT_WORDCHAR: |
|
2975 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); |
|
2976 break; |
|
2977 |
|
2978 case OP_WORDCHAR: |
|
2979 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); |
|
2980 break; |
|
2981 |
|
2982 default: |
|
2983 RRETURN(PCRE_ERROR_INTERNAL); |
|
2984 } |
|
2985 } |
|
2986 } |
|
2987 /* Control never gets here */ |
|
2988 } |
|
2989 |
|
2990 /* If maximizing it is worth using inline code for speed, doing the type |
|
2991 test once at the start (i.e. keep it out of the loop). Again, keep the |
|
2992 UTF-8 and UCP stuff separate. */ |
|
2993 |
|
2994 else |
|
2995 { |
|
2996 pp = eptr; /* Remember where we started */ |
|
2997 |
|
2998 #ifdef SUPPORT_UCP |
|
2999 if (prop_type > 0) |
|
3000 { |
|
3001 for (i = min; i < max; i++) |
|
3002 { |
|
3003 int len = 1; |
|
3004 if (eptr >= md->end_subject) break; |
|
3005 GETCHARLEN(c, eptr, len); |
|
3006 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); |
|
3007 if ((*prop_test_variable == prop_test_against) == prop_fail_result) |
|
3008 break; |
|
3009 eptr+= len; |
|
3010 } |
|
3011 |
|
3012 /* eptr is now past the end of the maximum run */ |
|
3013 |
|
3014 for(;;) |
|
3015 { |
|
3016 RMATCH(50, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
3017 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
3018 if (eptr-- == pp) break; /* Stop if tried at original pos */ |
|
3019 BACKCHAR(eptr); |
|
3020 } |
|
3021 } |
|
3022 |
|
3023 /* Match extended Unicode sequences. We will get here only if the |
|
3024 support is in the binary; otherwise a compile-time error occurs. */ |
|
3025 |
|
3026 else if (ctype == OP_EXTUNI) |
|
3027 { |
|
3028 for (i = min; i < max; i++) |
|
3029 { |
|
3030 if (eptr >= md->end_subject) break; |
|
3031 GETCHARINCTEST(c, eptr); |
|
3032 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); |
|
3033 if (prop_category == ucp_M) break; |
|
3034 while (eptr < md->end_subject) |
|
3035 { |
|
3036 int len = 1; |
|
3037 if (!utf8) c = *eptr; else |
|
3038 { |
|
3039 GETCHARLEN(c, eptr, len); |
|
3040 } |
|
3041 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); |
|
3042 if (prop_category != ucp_M) break; |
|
3043 eptr += len; |
|
3044 } |
|
3045 } |
|
3046 |
|
3047 /* eptr is now past the end of the maximum run */ |
|
3048 |
|
3049 for(;;) |
|
3050 { |
|
3051 RMATCH(51, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
3052 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
3053 if (eptr-- == pp) break; /* Stop if tried at original pos */ |
|
3054 for (;;) /* Move back over one extended */ |
|
3055 { |
|
3056 int len = 1; |
|
3057 BACKCHAR(eptr); |
|
3058 if (!utf8) c = *eptr; else |
|
3059 { |
|
3060 GETCHARLEN(c, eptr, len); |
|
3061 } |
|
3062 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); |
|
3063 if (prop_category != ucp_M) break; |
|
3064 eptr--; |
|
3065 } |
|
3066 } |
|
3067 } |
|
3068 |
|
3069 else |
|
3070 #endif /* SUPPORT_UCP */ |
|
3071 |
|
3072 #ifdef SUPPORT_UTF8 |
|
3073 /* UTF-8 mode */ |
|
3074 |
|
3075 if (utf8) |
|
3076 { |
|
3077 switch(ctype) |
|
3078 { |
|
3079 case OP_ANY: |
|
3080 |
|
3081 /* Special code is required for UTF8, but when the maximum is unlimited |
|
3082 we don't need it, so we repeat the non-UTF8 code. This is probably |
|
3083 worth it, because .* is quite a common idiom. */ |
|
3084 |
|
3085 if (max < INT_MAX) |
|
3086 { |
|
3087 if ((ims & PCRE_DOTALL) == 0) |
|
3088 { |
|
3089 for (i = min; i < max; i++) |
|
3090 { |
|
3091 if (eptr >= md->end_subject || *eptr == NEWLINE) break; |
|
3092 eptr++; |
|
3093 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; |
|
3094 } |
|
3095 } |
|
3096 else |
|
3097 { |
|
3098 for (i = min; i < max; i++) |
|
3099 { |
|
3100 eptr++; |
|
3101 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; |
|
3102 } |
|
3103 } |
|
3104 } |
|
3105 |
|
3106 /* Handle unlimited UTF-8 repeat */ |
|
3107 |
|
3108 else |
|
3109 { |
|
3110 if ((ims & PCRE_DOTALL) == 0) |
|
3111 { |
|
3112 for (i = min; i < max; i++) |
|
3113 { |
|
3114 if (eptr >= md->end_subject || *eptr == NEWLINE) break; |
|
3115 eptr++; |
|
3116 } |
|
3117 break; |
|
3118 } |
|
3119 else |
|
3120 { |
|
3121 c = max - min; |
|
3122 if (c > md->end_subject - eptr) c = INT_CAST(md->end_subject - eptr); |
|
3123 eptr += c; |
|
3124 } |
|
3125 } |
|
3126 break; |
|
3127 |
|
3128 /* The byte case is the same as non-UTF8 */ |
|
3129 |
|
3130 case OP_ANYBYTE: |
|
3131 c = max - min; |
|
3132 if (c > md->end_subject - eptr) c = INT_CAST(md->end_subject - eptr); |
|
3133 eptr += c; |
|
3134 break; |
|
3135 |
|
3136 case OP_NOT_DIGIT: |
|
3137 for (i = min; i < max; i++) |
|
3138 { |
|
3139 int len = 1; |
|
3140 if (eptr >= md->end_subject) break; |
|
3141 GETCHARLEN(c, eptr, len); |
|
3142 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break; |
|
3143 eptr+= len; |
|
3144 } |
|
3145 break; |
|
3146 |
|
3147 case OP_DIGIT: |
|
3148 for (i = min; i < max; i++) |
|
3149 { |
|
3150 int len = 1; |
|
3151 if (eptr >= md->end_subject) break; |
|
3152 GETCHARLEN(c, eptr, len); |
|
3153 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break; |
|
3154 eptr+= len; |
|
3155 } |
|
3156 break; |
|
3157 |
|
3158 case OP_NOT_WHITESPACE: |
|
3159 for (i = min; i < max; i++) |
|
3160 { |
|
3161 int len = 1; |
|
3162 if (eptr >= md->end_subject) break; |
|
3163 GETCHARLEN(c, eptr, len); |
|
3164 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break; |
|
3165 eptr+= len; |
|
3166 } |
|
3167 break; |
|
3168 |
|
3169 case OP_WHITESPACE: |
|
3170 for (i = min; i < max; i++) |
|
3171 { |
|
3172 int len = 1; |
|
3173 if (eptr >= md->end_subject) break; |
|
3174 GETCHARLEN(c, eptr, len); |
|
3175 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break; |
|
3176 eptr+= len; |
|
3177 } |
|
3178 break; |
|
3179 |
|
3180 case OP_NOT_WORDCHAR: |
|
3181 for (i = min; i < max; i++) |
|
3182 { |
|
3183 int len = 1; |
|
3184 if (eptr >= md->end_subject) break; |
|
3185 GETCHARLEN(c, eptr, len); |
|
3186 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break; |
|
3187 eptr+= len; |
|
3188 } |
|
3189 break; |
|
3190 |
|
3191 case OP_WORDCHAR: |
|
3192 for (i = min; i < max; i++) |
|
3193 { |
|
3194 int len = 1; |
|
3195 if (eptr >= md->end_subject) break; |
|
3196 GETCHARLEN(c, eptr, len); |
|
3197 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break; |
|
3198 eptr+= len; |
|
3199 } |
|
3200 break; |
|
3201 |
|
3202 default: |
|
3203 RRETURN(PCRE_ERROR_INTERNAL); |
|
3204 } |
|
3205 |
|
3206 /* eptr is now past the end of the maximum run */ |
|
3207 |
|
3208 for(;;) |
|
3209 { |
|
3210 RMATCH(52, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
3211 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
3212 if (eptr-- == pp) break; /* Stop if tried at original pos */ |
|
3213 BACKCHAR(eptr); |
|
3214 } |
|
3215 } |
|
3216 else |
|
3217 #endif |
|
3218 |
|
3219 /* Not UTF-8 mode */ |
|
3220 { |
|
3221 switch(ctype) |
|
3222 { |
|
3223 case OP_ANY: |
|
3224 if ((ims & PCRE_DOTALL) == 0) |
|
3225 { |
|
3226 for (i = min; i < max; i++) |
|
3227 { |
|
3228 if (eptr >= md->end_subject || *eptr == NEWLINE) break; |
|
3229 eptr++; |
|
3230 } |
|
3231 break; |
|
3232 } |
|
3233 /* For DOTALL case, fall through and treat as \C */ |
|
3234 |
|
3235 case OP_ANYBYTE: |
|
3236 c = max - min; |
|
3237 if (c > md->end_subject - eptr) c = INT_CAST(md->end_subject - eptr); |
|
3238 eptr += c; |
|
3239 break; |
|
3240 |
|
3241 case OP_NOT_DIGIT: |
|
3242 for (i = min; i < max; i++) |
|
3243 { |
|
3244 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0) |
|
3245 break; |
|
3246 eptr++; |
|
3247 } |
|
3248 break; |
|
3249 |
|
3250 case OP_DIGIT: |
|
3251 for (i = min; i < max; i++) |
|
3252 { |
|
3253 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0) |
|
3254 break; |
|
3255 eptr++; |
|
3256 } |
|
3257 break; |
|
3258 |
|
3259 case OP_NOT_WHITESPACE: |
|
3260 for (i = min; i < max; i++) |
|
3261 { |
|
3262 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0) |
|
3263 break; |
|
3264 eptr++; |
|
3265 } |
|
3266 break; |
|
3267 |
|
3268 case OP_WHITESPACE: |
|
3269 for (i = min; i < max; i++) |
|
3270 { |
|
3271 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0) |
|
3272 break; |
|
3273 eptr++; |
|
3274 } |
|
3275 break; |
|
3276 |
|
3277 case OP_NOT_WORDCHAR: |
|
3278 for (i = min; i < max; i++) |
|
3279 { |
|
3280 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0) |
|
3281 break; |
|
3282 eptr++; |
|
3283 } |
|
3284 break; |
|
3285 |
|
3286 case OP_WORDCHAR: |
|
3287 for (i = min; i < max; i++) |
|
3288 { |
|
3289 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0) |
|
3290 break; |
|
3291 eptr++; |
|
3292 } |
|
3293 break; |
|
3294 |
|
3295 default: |
|
3296 RRETURN(PCRE_ERROR_INTERNAL); |
|
3297 } |
|
3298 |
|
3299 /* eptr is now past the end of the maximum run */ |
|
3300 |
|
3301 while (eptr >= pp) |
|
3302 { |
|
3303 RMATCH(53, rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); |
|
3304 eptr--; |
|
3305 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
3306 } |
|
3307 } |
|
3308 |
|
3309 /* Get here if we can't make it match with any permitted repetitions */ |
|
3310 |
|
3311 RRETURN(MATCH_NOMATCH); |
|
3312 } |
|
3313 /* Control never gets here */ |
|
3314 |
|
3315 /* There's been some horrible disaster. Since all codes > OP_BRA are |
|
3316 for capturing brackets, and there shouldn't be any gaps between 0 and |
|
3317 OP_BRA, arrival here can only mean there is something seriously wrong |
|
3318 in the code above or the OP_xxx definitions. */ |
|
3319 |
|
3320 default: |
|
3321 DPRINTF(("Unknown opcode %d\n", *ecode)); |
|
3322 RRETURN(PCRE_ERROR_UNKNOWN_NODE); |
|
3323 } |
|
3324 |
|
3325 /* Do not stick any code in here without much thought; it is assumed |
|
3326 that "continue" in the code above comes out to here to repeat the main |
|
3327 loop. */ |
|
3328 |
|
3329 } /* End of main loop */ |
|
3330 /* Control never reaches here */ |
|
3331 |
|
3332 #ifdef NO_RECURSE |
|
3333 #ifndef __GNUC__ |
|
3334 |
|
3335 RRETURN_SWITCH: |
|
3336 switch (frame->Xwhere) |
|
3337 { |
|
3338 case 1: goto RRETURN_1; |
|
3339 case 2: goto RRETURN_2; |
|
3340 case 3: goto RRETURN_3; |
|
3341 case 4: goto RRETURN_4; |
|
3342 case 5: goto RRETURN_5; |
|
3343 case 6: goto RRETURN_6; |
|
3344 case 7: goto RRETURN_7; |
|
3345 case 8: goto RRETURN_8; |
|
3346 case 9: goto RRETURN_9; |
|
3347 case 10: goto RRETURN_10; |
|
3348 case 11: goto RRETURN_11; |
|
3349 case 12: goto RRETURN_12; |
|
3350 case 13: goto RRETURN_13; |
|
3351 case 14: goto RRETURN_14; |
|
3352 case 15: goto RRETURN_15; |
|
3353 case 16: goto RRETURN_16; |
|
3354 case 17: goto RRETURN_17; |
|
3355 case 18: goto RRETURN_18; |
|
3356 case 19: goto RRETURN_19; |
|
3357 case 20: goto RRETURN_20; |
|
3358 case 21: goto RRETURN_21; |
|
3359 case 22: goto RRETURN_22; |
|
3360 case 23: goto RRETURN_23; |
|
3361 case 24: goto RRETURN_24; |
|
3362 case 25: goto RRETURN_25; |
|
3363 case 26: goto RRETURN_26; |
|
3364 case 27: goto RRETURN_27; |
|
3365 case 28: goto RRETURN_28; |
|
3366 case 29: goto RRETURN_29; |
|
3367 case 30: goto RRETURN_30; |
|
3368 case 31: goto RRETURN_31; |
|
3369 case 32: goto RRETURN_32; |
|
3370 case 33: goto RRETURN_33; |
|
3371 case 34: goto RRETURN_34; |
|
3372 case 35: goto RRETURN_35; |
|
3373 case 36: goto RRETURN_36; |
|
3374 case 37: goto RRETURN_37; |
|
3375 case 38: goto RRETURN_38; |
|
3376 case 39: goto RRETURN_39; |
|
3377 case 40: goto RRETURN_40; |
|
3378 case 41: goto RRETURN_41; |
|
3379 case 42: goto RRETURN_42; |
|
3380 case 43: goto RRETURN_43; |
|
3381 case 44: goto RRETURN_44; |
|
3382 case 45: goto RRETURN_45; |
|
3383 case 46: goto RRETURN_46; |
|
3384 case 47: goto RRETURN_47; |
|
3385 case 48: goto RRETURN_48; |
|
3386 case 49: goto RRETURN_49; |
|
3387 case 50: goto RRETURN_50; |
|
3388 case 51: goto RRETURN_51; |
|
3389 case 52: goto RRETURN_52; |
|
3390 case 53: goto RRETURN_53; |
|
3391 } |
|
3392 |
|
3393 #if PCRE_UTF16 |
|
3394 /* It's safer to have the extra symbols here than to try to ifdef the switch statement above, |
|
3395 because we'll get warnings or errors if we have multiply defined symbols but a runtime failure |
|
3396 if we leave something out of the switch statement. */ |
|
3397 RRETURN_32: |
|
3398 RRETURN_33: |
|
3399 RRETURN_34: |
|
3400 RRETURN_35: |
|
3401 RRETURN_36: |
|
3402 RRETURN_37: |
|
3403 #endif |
|
3404 |
|
3405 abort(); |
|
3406 return 0; |
|
3407 |
|
3408 #endif |
|
3409 #endif |
|
3410 |
|
3411 } |
|
3412 |
|
3413 |
|
3414 /*************************************************************************** |
|
3415 **************************************************************************** |
|
3416 RECURSION IN THE match() FUNCTION |
|
3417 |
|
3418 Undefine all the macros that were defined above to handle this. */ |
|
3419 |
|
3420 #ifdef NO_RECURSE /* Names inside this #ifdef were macros only in the NO_RECURSE build of match(); drop them so code after this point can reuse the identifiers. NOTE(review): the matching #defines are earlier in the file - confirm this list stays in step with them. */ |

3421 #undef eptr |

3422 #undef ecode |

3423 #undef offset_top |

3424 #undef ims /* pcre_exec() below declares its own local variable called ims */ |

3425 #undef eptrb |

3426 #undef flags /* pcre_exec() below uses a local called flags when reading extra_data */ |

3427 |

3428 #undef callpat |

3429 #undef charptr |

3430 #undef data |

3431 #undef next |

3432 #undef pp /* match() used pp to remember where a repeat started; pcre_exec() has an unrelated local pp */ |

3433 #undef prev |

3434 #undef saved_eptr |

3435 |

3436 #undef new_recursive |

3437 |

3438 #undef cur_is_word |

3439 #undef condition |

3440 #undef minimize |

3441 #undef prev_is_word |

3442 |

3443 #undef original_ims |

3444 |

3445 #undef ctype |

3446 #undef length /* length is also the name of a pcre_exec() parameter */ |

3447 #undef max /* max and min: the repeat-count bounds used by match() */ |

3448 #undef min |

3449 #undef number |

3450 #undef offset |

3451 #undef op |

3452 #undef save_capture_last |

3453 #undef save_offset1 |

3454 #undef save_offset2 |

3455 #undef save_offset3 |

3456 #undef stacksave |

3457 |

3458 #undef newptrb |

3459 |

3460 #endif /* NO_RECURSE */ |

3461 |

3462 /* These two are defined as macros in both cases (that is, whether or not NO_RECURSE is defined), which is why they are undefined outside the #ifdef above. */ |

3463 |

3464 #undef fc |

3465 #undef fi /* fi: the counter in match()'s minimizing repeat loops */ |
|
3466 |
|
3467 /*************************************************************************** |
|
3468 ***************************************************************************/ |
|
3469 |
|
3470 |
|
3471 |
|
3472 /************************************************* |
|
3473 * Execute a Regular Expression * |
|
3474 *************************************************/ |
|
3475 |
|
3476 /* This function applies a compiled re to a subject string and picks out |
|
3477 portions of the string if it matches. Two elements in the vector are set for |
|
3478 each substring: the offsets to the start and end of the substring. |
|
3479 |
|
3480 Arguments: |
|
3481 argument_re points to the compiled expression |
|
3482 extra_data points to extra data or is NULL |
|
3483 subject points to the subject string |
|
3484 length length of subject string (may contain binary zeros) |
|
3485 start_offset where to start in the subject string |
|
3486 options option bits |
|
3487 offsets points to a vector of ints to be filled in with offsets |
|
3488 offsetcount the number of elements in the vector |
|
3489 |
|
3490 Returns: > 0 => success; value is the number of elements filled in |
|
3491 = 0 => success, but offsets is not big enough |
|
3492 -1 => failed to match |
|
3493 < -1 => some kind of unexpected problem |
|
3494 */ |
|
3495 |
|
3496 PCRE_EXPORT int |
|
3497 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, |
|
3498 const pcre_char *subject, int length, int start_offset, int options, int *offsets, |
|
3499 int offsetcount) |
|
3500 { |
|
3501 int rc, resetcount, ocount; |
|
3502 int first_byte = -1; |
|
3503 int req_byte = -1; |
|
3504 int req_byte2 = -1; |
|
3505 unsigned long int ims = 0; |
|
3506 BOOL using_temporary_offsets = FALSE; |
|
3507 BOOL anchored; |
|
3508 BOOL startline; |
|
3509 BOOL firstline; |
|
3510 BOOL first_byte_caseless = FALSE; |
|
3511 BOOL req_byte_caseless = FALSE; |
|
3512 match_data match_block; |
|
3513 const uschar *tables; |
|
3514 const uschar *start_bits = NULL; |
|
3515 const pcre_uchar *start_match = (const pcre_uchar *)subject + start_offset; |
|
3516 const pcre_uchar *end_subject; |
|
3517 const pcre_uchar *req_byte_ptr = start_match - 1; |
|
3518 |
|
3519 pcre_study_data internal_study; |
|
3520 const pcre_study_data *study; |
|
3521 |
|
3522 real_pcre internal_re; |
|
3523 const real_pcre *external_re = (const real_pcre *)argument_re; |
|
3524 const real_pcre *re = external_re; |
|
3525 |
|
3526 /* Plausibility checks */ |
|
3527 |
|
3528 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; |
|
3529 if (re == NULL || subject == NULL || |
|
3530 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; |
|
3531 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; |
|
3532 |
|
3533 /* Fish out the optional data from the extra_data structure, first setting |
|
3534 the default values. */ |
|
3535 |
|
3536 study = NULL; |
|
3537 match_block.match_limit = MATCH_LIMIT; |
|
3538 match_block.callout_data = NULL; |
|
3539 |
|
3540 /* The table pointer is always in native byte order. */ |
|
3541 |
|
3542 tables = external_re->tables; |
|
3543 |
|
3544 if (extra_data != NULL) |
|
3545 { |
|
3546 register unsigned long flags = extra_data->flags; |
|
3547 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) |
|
3548 study = (const pcre_study_data *)extra_data->study_data; |
|
3549 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) |
|
3550 match_block.match_limit = extra_data->match_limit; |
|
3551 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) |
|
3552 match_block.callout_data = extra_data->callout_data; |
|
3553 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables; |
|
3554 } |
|
3555 |
|
3556 /* If the exec call supplied NULL for tables, use the inbuilt ones. This |
|
3557 is a feature that makes it possible to save compiled regexes and re-use them |
|
3558 in other programs later. */ |
|
3559 |
|
3560 if (tables == NULL) tables = _pcre_default_tables; |
|
3561 |
|
3562 /* Check that the first field in the block is the magic number. If it is not, |
|
3563 test for a regex that was compiled on a host of opposite endianness. If this is |
|
3564 the case, flipped values are put in internal_re and internal_study if there was |
|
3565 study data too. */ |
|
3566 |
|
3567 if (re->magic_number != MAGIC_NUMBER) |
|
3568 { |
|
3569 re = _pcre_try_flipped(re, &internal_re, study, &internal_study); |
|
3570 if (re == NULL) return PCRE_ERROR_BADMAGIC; |
|
3571 if (study != NULL) study = &internal_study; |
|
3572 } |
|
3573 |
|
3574 /* Set up other data */ |
|
3575 |
|
3576 anchored = ((re->options | options) & PCRE_ANCHORED) != 0; |
|
3577 startline = (re->options & PCRE_STARTLINE) != 0; |
|
3578 firstline = (re->options & PCRE_FIRSTLINE) != 0; |
|
3579 |
|
3580 /* The code starts after the real_pcre block and the capture name table. */ |
|
3581 |
|
3582 match_block.start_code = (const uschar *)external_re + re->name_table_offset + |
|
3583 re->name_count * re->name_entry_size; |
|
3584 |
|
3585 match_block.start_subject = (const pcre_uchar *)subject; |
|
3586 match_block.start_offset = start_offset; |
|
3587 match_block.end_subject = match_block.start_subject + length; |
|
3588 end_subject = match_block.end_subject; |
|
3589 |
|
3590 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; |
|
3591 match_block.utf8 = (re->options & PCRE_UTF8) != 0; |
|
3592 |
|
3593 match_block.notbol = (options & PCRE_NOTBOL) != 0; |
|
3594 match_block.noteol = (options & PCRE_NOTEOL) != 0; |
|
3595 match_block.notempty = (options & PCRE_NOTEMPTY) != 0; |
|
3596 match_block.partial = (options & PCRE_PARTIAL) != 0; |
|
3597 match_block.hitend = FALSE; |
|
3598 |
|
3599 match_block.recursive = NULL; /* No recursion at top level */ |
|
3600 |
|
3601 match_block.lcc = tables + lcc_offset; |
|
3602 match_block.ctypes = tables + ctypes_offset; |
|
3603 |
|
3604 /* Partial matching is supported only for a restricted set of regexes at the |
|
3605 moment. */ |
|
3606 |
|
3607 if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0) |
|
3608 return PCRE_ERROR_BADPARTIAL; |
|
3609 |
|
3610 /* Check a UTF-8 string if required. Unfortunately there's no way of passing |
|
3611 back the character offset. */ |
|
3612 |
|
3613 #if !PCRE_UTF16 |
|
3614 #ifdef SUPPORT_UTF8 |
|
3615 if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) |
|
3616 { |
|
3617 if (_pcre_valid_utf8((pcre_uchar *)subject, length) >= 0) |
|
3618 return PCRE_ERROR_BADUTF8; |
|
3619 if (start_offset > 0 && start_offset < length) |
|
3620 { |
|
3621 int tb = ((pcre_uchar *)subject)[start_offset]; |
|
3622 if (tb > 127) |
|
3623 { |
|
3624 tb &= 0xc0; |
|
3625 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET; |
|
3626 } |
|
3627 } |
|
3628 } |
|
3629 #endif |
|
3630 #endif |
|
3631 |
|
3632 /* The ims options can vary during the matching as a result of the presence |
|
3633 of (?ims) items in the pattern. They are kept in a local variable so that |
|
3634 restoring at the exit of a group is easy. */ |
|
3635 |
|
3636 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL); |
|
3637 |
|
3638 /* If the expression has got more back references than the offsets supplied can |
|
3639 hold, we get a temporary chunk of working store to use during the matching. |
|
3640 Otherwise, we can use the vector supplied, rounding down its size to a multiple |
|
3641 of 3. */ |
|
3642 |
|
3643 ocount = offsetcount - (offsetcount % 3); |
|
3644 |
|
3645 if (re->top_backref > 0 && re->top_backref >= ocount/3) |
|
3646 { |
|
3647 ocount = re->top_backref * 3 + 3; |
|
3648 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int)); |
|
3649 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY; |
|
3650 using_temporary_offsets = TRUE; |
|
3651 DPRINTF(("Got memory to hold back references\n")); |
|
3652 } |
|
3653 else match_block.offset_vector = offsets; |
|
3654 |
|
3655 match_block.offset_end = ocount; |
|
3656 match_block.offset_max = (2*ocount)/3; |
|
3657 match_block.offset_overflow = FALSE; |
|
3658 match_block.capture_last = -1; |
|
3659 |
|
3660 /* Compute the minimum number of offsets that we need to reset each time. Doing |
|
3661 this makes a huge difference to execution time when there aren't many brackets |
|
3662 in the pattern. */ |
|
3663 |
|
3664 resetcount = 2 + re->top_bracket * 2; |
|
3665 if (resetcount > offsetcount) resetcount = ocount; |
|
3666 |
|
3667 /* Reset the working variable associated with each extraction. These should |
|
3668 never be used unless previously set, but they get saved and restored, and so we |
|
3669 initialize them to avoid reading uninitialized locations. */ |
|
3670 |
|
3671 if (match_block.offset_vector != NULL) |
|
3672 { |
|
3673 register int *iptr = match_block.offset_vector + ocount; |
|
3674 register int *iend = iptr - resetcount/2 + 1; |
|
3675 while (--iptr >= iend) *iptr = -1; |
|
3676 } |
|
3677 |
|
3678 /* Set up the first character to match, if available. The first_byte value is |
|
3679 never set for an anchored regular expression, but the anchoring may be forced |
|
3680 at run time, so we have to test for anchoring. The first char may be unset for |
|
3681 an unanchored pattern, of course. If there's no first char and the pattern was |
|
3682 studied, there may be a bitmap of possible first characters. */ |
|
3683 |
|
3684 if (!anchored) |
|
3685 { |
|
3686 if ((re->options & PCRE_FIRSTSET) != 0) |
|
3687 { |
|
3688 first_byte = re->first_byte & 255; |
|
3689 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE) |
|
3690 first_byte = match_block.lcc[first_byte]; |
|
3691 } |
|
3692 else |
|
3693 if (!startline && study != NULL && |
|
3694 (study->options & PCRE_STUDY_MAPPED) != 0) |
|
3695 start_bits = study->start_bits; |
|
3696 } |
|
3697 |
|
3698 /* For anchored or unanchored matches, there may be a "last known required |
|
3699 character" set. */ |
|
3700 |
|
3701 if ((re->options & PCRE_REQCHSET) != 0) |
|
3702 { |
|
3703 req_byte = re->req_byte & 255; |
|
3704 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; |
|
3705 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */ |
|
3706 } |
|
3707 |
|
3708 /* Loop for handling unanchored repeated matching attempts; for anchored regexes |
|
3709 the loop runs just once. */ |
|
3710 |
|
3711 do |
|
3712 { |
|
3713 const pcre_uchar *save_end_subject = end_subject; |
|
3714 |
|
3715 /* Reset the maximum number of extractions we might see. */ |
|
3716 |
|
3717 if (match_block.offset_vector != NULL) |
|
3718 { |
|
3719 register int *iptr = match_block.offset_vector; |
|
3720 register int *iend = iptr + resetcount; |
|
3721 while (iptr < iend) *iptr++ = -1; |
|
3722 } |
|
3723 |
|
3724 /* Advance to a unique first char if possible. If firstline is TRUE, the |
|
3725 start of the match is constrained to the first line of a multiline string. |
|
3726 Implement this by temporarily adjusting end_subject so that we stop scanning |
|
3727 at a newline. If the match fails at the newline, later code breaks this loop. |
|
3728 */ |
|
3729 |
|
3730 if (firstline) |
|
3731 { |
|
3732 const pcre_uchar *t = start_match; |
|
3733 while (t < save_end_subject && *t != '\n') t++; |
|
3734 end_subject = t; |
|
3735 } |
|
3736 |
|
3737 /* Now test for a unique first byte */ |
|
3738 |
|
3739 if (first_byte >= 0) |
|
3740 { |
|
3741 pcre_uchar first_char = first_byte; |
|
3742 if (first_byte_caseless) |
|
3743 while (start_match < end_subject) |
|
3744 { |
|
3745 int sm = *start_match; |
|
3746 #if PCRE_UTF16 |
|
3747 if (sm > 127) |
|
3748 break; |
|
3749 #endif |
|
3750 if (match_block.lcc[sm] == first_char) |
|
3751 break; |
|
3752 start_match++; |
|
3753 } |
|
3754 else |
|
3755 while (start_match < end_subject && *start_match != first_char) |
|
3756 start_match++; |
|
3757 } |
|
3758 |
|
3759 /* Or to just after \n for a multiline match if possible */ |
|
3760 |
|
3761 else if (startline) |
|
3762 { |
|
3763 if (start_match > match_block.start_subject + start_offset) |
|
3764 { |
|
3765 while (start_match < end_subject && start_match[-1] != NEWLINE) |
|
3766 start_match++; |
|
3767 } |
|
3768 } |
|
3769 |
|
3770 /* Or to a non-unique first char after study */ |
|
3771 |
|
3772 else if (start_bits != NULL) |
|
3773 { |
|
3774 while (start_match < end_subject) |
|
3775 { |
|
3776 register unsigned int c = *start_match; |
|
3777 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break; |
|
3778 } |
|
3779 } |
|
3780 |
|
3781 /* Restore fudged end_subject */ |
|
3782 |
|
3783 end_subject = save_end_subject; |
|
3784 |
|
3785 #ifdef DEBUG /* Sigh. Some compilers never learn. */ |
|
3786 printf(">>>> Match against: "); |
|
3787 pchars(start_match, end_subject - start_match, TRUE, &match_block); |
|
3788 printf("\n"); |
|
3789 #endif |
|
3790 |
|
3791 /* If req_byte is set, we know that that character must appear in the subject |
|
3792 for the match to succeed. If the first character is set, req_byte must be |
|
3793 later in the subject; otherwise the test starts at the match point. This |
|
3794 optimization can save a huge amount of backtracking in patterns with nested |
|
3795 unlimited repeats that aren't going to match. Writing separate code for |
|
3796 cased/caseless versions makes it go faster, as does using an autoincrement |
|
3797 and backing off on a match. |
|
3798 |
|
3799 HOWEVER: when the subject string is very, very long, searching to its end can |
|
3800 take a long time, and give bad performance on quite ordinary patterns. This |
|
3801 showed up when somebody was matching /^C/ on a 32-megabyte string... so we |
|
3802 don't do this when the string is sufficiently long. |
|
3803 |
|
3804 ALSO: this processing is disabled when partial matching is requested. |
|
3805 */ |
|
3806 |
|
3807 if (req_byte >= 0 && |
|
3808 end_subject - start_match < REQ_BYTE_MAX && |
|
3809 !match_block.partial) |
|
3810 { |
|
3811 register const pcre_uchar *p = start_match + ((first_byte >= 0)? 1 : 0); |
|
3812 |
|
3813 /* We don't need to repeat the search if we haven't yet reached the |
|
3814 place we found it at last time. */ |
|
3815 |
|
3816 if (p > req_byte_ptr) |
|
3817 { |
|
3818 if (req_byte_caseless) |
|
3819 { |
|
3820 while (p < end_subject) |
|
3821 { |
|
3822 register int pp = *p++; |
|
3823 if (pp == req_byte || pp == req_byte2) { p--; break; } |
|
3824 } |
|
3825 } |
|
3826 else |
|
3827 { |
|
3828 while (p < end_subject) |
|
3829 { |
|
3830 if (*p++ == req_byte) { p--; break; } |
|
3831 } |
|
3832 } |
|
3833 |
|
3834 /* If we can't find the required character, break the matching loop */ |
|
3835 |
|
3836 if (p >= end_subject) break; |
|
3837 |
|
3838 /* If we have found the required character, save the point where we |
|
3839 found it, so that we don't search again next time round the loop if |
|
3840 the start hasn't passed this character yet. */ |
|
3841 |
|
3842 req_byte_ptr = p; |
|
3843 } |
|
3844 } |
|
3845 |
|
3846 /* When a match occurs, substrings will be set for all internal extractions; |
|
3847 we just need to set up the whole thing as substring 0 before returning. If |
|
3848 there were too many extractions, set the return code to zero. In the case |
|
3849 where we had to get some local store to hold offsets for backreferences, copy |
|
3850 those back references that we can. In this case there need not be overflow |
|
3851 if certain parts of the pattern were not used. */ |
|
3852 |
|
3853 match_block.start_match = start_match; |
|
3854 match_block.match_call_count = 0; |
|
3855 |
|
3856 rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL, |
|
3857 match_isgroup); |
|
3858 |
|
3859 /* When the result is no match, if the character at start_match is a |
|
3860 newline and the PCRE_FIRSTLINE option is set, break (which will return |
|
3861 PCRE_ERROR_NOMATCH). The option requests that a match occur before the first |
|
3862 newline in the subject. Otherwise, advance the pointer to the next character |
|
3863 and continue - but the continuation will actually happen only when the |
|
3864 pattern is not anchored. */ |
|
3865 |
|
3866 if (rc == MATCH_NOMATCH) |
|
3867 { |
|
3868 if (firstline && *start_match == NEWLINE) break; |
|
3869 start_match++; |
|
3870 #ifdef SUPPORT_UTF8 |
|
3871 if (match_block.utf8) |
|
3872 while(start_match < end_subject && ISMIDCHAR(*start_match)) |
|
3873 start_match++; |
|
3874 #endif |
|
3875 continue; |
|
3876 } |
|
3877 |
|
3878 if (rc != MATCH_MATCH) |
|
3879 { |
|
3880 DPRINTF((">>>> error: returning %d\n", rc)); |
|
3881 return rc; |
|
3882 } |
|
3883 |
|
3884 /* We have a match! Copy the offset information from temporary store if |
|
3885 necessary */ |
|
3886 |
|
3887 if (using_temporary_offsets) |
|
3888 { |
|
3889 if (offsetcount >= 4) |
|
3890 { |
|
3891 memcpy(offsets + 2, match_block.offset_vector + 2, |
|
3892 (offsetcount - 2) * sizeof(int)); |
|
3893 DPRINTF(("Copied offsets from temporary memory\n")); |
|
3894 } |
|
3895 if (match_block.end_offset_top > offsetcount) |
|
3896 match_block.offset_overflow = TRUE; |
|
3897 |
|
3898 DPRINTF(("Freeing temporary memory\n")); |
|
3899 (pcre_free)(match_block.offset_vector); |
|
3900 } |
|
3901 |
|
3902 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2; |
|
3903 |
|
3904 if (offsetcount < 2) rc = 0; else |
|
3905 { |
|
3906 offsets[0] = INT_CAST(start_match - match_block.start_subject); |
|
3907 offsets[1] = INT_CAST(match_block.end_match_ptr - match_block.start_subject); |
|
3908 } |
|
3909 |
|
3910 DPRINTF((">>>> returning %d\n", rc)); |
|
3911 return rc; |
|
3912 } |
|
3913 |
|
3914 /* This "while" is the end of the "do" above */ |
|
3915 |
|
3916 while (!anchored && start_match <= end_subject); |
|
3917 |
|
3918 if (using_temporary_offsets) |
|
3919 { |
|
3920 DPRINTF(("Freeing temporary memory\n")); |
|
3921 (pcre_free)(match_block.offset_vector); |
|
3922 } |
|
3923 |
|
3924 if (match_block.partial && match_block.hitend) |
|
3925 { |
|
3926 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); |
|
3927 return PCRE_ERROR_PARTIAL; |
|
3928 } |
|
3929 else |
|
3930 { |
|
3931 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); |
|
3932 return PCRE_ERROR_NOMATCH; |
|
3933 } |
|
3934 } |
|
3935 |
|
3936 /* End of pcre_exec.c */ |