|
1 /************************************************* |
|
2 * Perl-Compatible Regular Expressions * |
|
3 *************************************************/ |
|
4 |
|
5 /* PCRE is a library of functions to support regular expressions whose syntax |
|
6 and semantics are as close as possible to those of the Perl 5 language. |
|
7 |
|
8 Written by Philip Hazel |
|
9 Copyright (c) 1997-2008 University of Cambridge |
|
10 |
|
11 ----------------------------------------------------------------------------- |
|
12 Redistribution and use in source and binary forms, with or without |
|
13 modification, are permitted provided that the following conditions are met: |
|
14 |
|
15 * Redistributions of source code must retain the above copyright notice, |
|
16 this list of conditions and the following disclaimer. |
|
17 |
|
18 * Redistributions in binary form must reproduce the above copyright |
|
19 notice, this list of conditions and the following disclaimer in the |
|
20 documentation and/or other materials provided with the distribution. |
|
21 |
|
22 * Neither the name of the University of Cambridge nor the names of its |
|
23 contributors may be used to endorse or promote products derived from |
|
24 this software without specific prior written permission. |
|
25 |
|
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
|
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|
36 POSSIBILITY OF SUCH DAMAGE. |
|
37 ----------------------------------------------------------------------------- |
|
38 */ |
|
39 |
|
40 |
|
41 /* This module contains pcre_exec(), the externally visible function that does |
|
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as |
|
43 possible. There are also some static supporting functions. */ |
|
44 |
|
45 #ifdef HAVE_CONFIG_H |
|
46 #include "config.h" |
|
47 #endif |
|
48 |
|
49 #define NLBLOCK md /* Block containing newline information */ |
|
50 #define PSSTART start_subject /* Field containing processed string start */ |
|
51 #define PSEND end_subject /* Field containing processed string end */ |
|
52 |
|
53 #include "pcre_internal.h" |
|
54 |
|
55 /* Undefine some potentially clashing cpp symbols */ |
|
56 |
|
57 #undef min |
|
58 #undef max |
|
59 |
|
/* Bit values that may be combined in the "flags" argument of match(). */

#define match_condassert     0x01  /* Called to check a condition assertion */
#define match_cbegroup       0x02  /* Could-be-empty unlimited repeat group */
|
64 |
|
/* Ordinary (non-error) results of match(). Errors are reported with the
externally defined PCRE_ERROR_xxx codes, every one of which is negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Results of match() that are used only internally. They are pushed a long
way below zero so that they stay clear of the external error codes. */

#define MATCH_COMMIT       (-999)
#define MATCH_PRUNE        (-998)
#define MATCH_SKIP         (-997)
#define MATCH_THEN         (-996)
|
78 |
|
/* The largest number of offset ints that is saved on the stack for a
recursive call; a bigger offset vector is saved using malloc instead. Keep
this a multiple of 3, because the length of the offset vector is always a
multiple of 3. */

#define REC_STACK_SAVE_MAX 30
|
84 |
|
/* Minimum and maximum repeat counts for the common repeats. The two tables
are indexed in parallel; in rep_max a value of 0 stands for "no upper limit". */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
|
89 |
|
90 |
|
91 |
|
92 #ifdef DEBUG |
|
93 /************************************************* |
|
94 * Debugging function to print chars * |
|
95 *************************************************/ |
|
96 |
|
97 /* Print a sequence of chars in printable format, stopping at the end of the |
|
98 subject if the requested. |
|
99 |
|
100 Arguments: |
|
101 p points to characters |
|
102 length number to print |
|
103 is_subject TRUE if printing from within md->start_subject |
|
104 md pointer to matching data block, if is_subject is TRUE |
|
105 |
|
106 Returns: nothing |
|
107 */ |
|
108 |
|
109 static void |
|
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md) |
|
111 { |
|
112 unsigned int c; |
|
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p; |
|
114 while (length-- > 0) |
|
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c); |
|
116 } |
|
117 #endif |
|
118 |
|
119 |
|
120 |
|
121 /************************************************* |
|
122 * Match a back-reference * |
|
123 *************************************************/ |
|
124 |
|
125 /* If a back reference hasn't been set, the length that is passed is greater |
|
126 than the number of characters left in the string, so the match fails. |
|
127 |
|
128 Arguments: |
|
129 offset index into the offset vector |
|
130 eptr points into the subject |
|
131 length length to be matched |
|
132 md points to match data block |
|
133 ims the ims flags |
|
134 |
|
135 Returns: TRUE if matched |
|
136 */ |
|
137 |
|
138 static BOOL |
|
139 match_ref(int offset, register USPTR eptr, int length, match_data *md, |
|
140 unsigned long int ims) |
|
141 { |
|
142 USPTR p = md->start_subject + md->offset_vector[offset]; |
|
143 |
|
144 #ifdef DEBUG |
|
145 if (eptr >= md->end_subject) |
|
146 printf("matching subject <null>"); |
|
147 else |
|
148 { |
|
149 printf("matching subject "); |
|
150 pchars(eptr, length, TRUE, md); |
|
151 } |
|
152 printf(" against backref "); |
|
153 pchars(p, length, FALSE, md); |
|
154 printf("\n"); |
|
155 #endif |
|
156 |
|
157 /* Always fail if not enough characters left */ |
|
158 |
|
159 if (length > md->end_subject - eptr) return FALSE; |
|
160 |
|
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this |
|
162 properly if Unicode properties are supported. Otherwise, we can check only |
|
163 ASCII characters. */ |
|
164 |
|
165 if ((ims & PCRE_CASELESS) != 0) |
|
166 { |
|
167 #ifdef SUPPORT_UTF8 |
|
168 #ifdef SUPPORT_UCP |
|
169 if (md->utf8) |
|
170 { |
|
171 USPTR endptr = eptr + length; |
|
172 while (eptr < endptr) |
|
173 { |
|
174 int c, d; |
|
175 GETCHARINC(c, eptr); |
|
176 GETCHARINC(d, p); |
|
177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE; |
|
178 } |
|
179 } |
|
180 else |
|
181 #endif |
|
182 #endif |
|
183 |
|
184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there |
|
185 is no UCP support. */ |
|
186 |
|
187 while (length-- > 0) |
|
188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; } |
|
189 } |
|
190 |
|
191 /* In the caseful case, we can just compare the bytes, whether or not we |
|
192 are in UTF-8 mode. */ |
|
193 |
|
194 else |
|
195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; } |
|
196 |
|
197 return TRUE; |
|
198 } |
|
199 |
|
200 |
|
201 |
|
202 /*************************************************************************** |
|
203 **************************************************************************** |
|
204 RECURSION IN THE match() FUNCTION |
|
205 |
|
206 The match() function is highly recursive, though not every recursive call |
|
207 increases the recursive depth. Nevertheless, some regular expressions can cause |
|
208 it to recurse to a great depth. I was writing for Unix, so I just let it call |
|
209 itself recursively. This uses the stack for saving everything that has to be |
|
210 saved for a recursive call. On Unix, the stack can be large, and this works |
|
211 fine. |
|
212 |
|
213 It turns out that on some non-Unix-like systems there are problems with |
|
214 programs that use a lot of stack. (This despite the fact that every last chip |
|
215 has oodles of memory these days, and techniques for extending the stack have |
|
216 been known for decades.) So.... |
|
217 |
|
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive |
|
219 calls by keeping local variables that need to be preserved in blocks of memory |
|
220 obtained from malloc() instead of on the stack. Macros are used to |
|
221 achieve this so that the actual code doesn't look very different to what it |
|
222 always used to. |
|
223 |
|
224 The original heap-recursive code used longjmp(). However, it seems that this |
|
225 can be very slow on some operating systems. Following a suggestion from Stan |
|
226 Switzer, the use of longjmp() has been abolished, at the cost of having to |
|
227 provide a unique number for each call to RMATCH. There is no way of generating |
|
228 a sequence of numbers at compile time in C. I have given them names, to make |
|
229 them stand out more clearly. |
|
230 |
|
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on |
|
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard |
|
233 tests. Furthermore, not using longjmp() means that local dynamic variables |
|
234 don't have indeterminate values; this has meant that the frame size can be |
|
235 reduced because the result can be "passed back" by straight setting of the |
|
236 variable instead of being passed in the frame. |
|
237 **************************************************************************** |
|
238 ***************************************************************************/ |
|
239 |
|
/* Identifiers for the individual RMATCH call sites, numbered consecutively
from 1. If this list is altered, the code at HEAP_RETURN below must be altered
to match. */

enum {
  RM1 = 1,
  RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
  RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19,
  RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28,
  RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36, RM37,
  RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45, RM46,
  RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54
};
|
249 |
|
/* These versions of the macros use the stack, as normal: RMATCH is a real
recursive call to match() and RRETURN is a real return. There are debugging
versions and production versions. Note that the "rw" argument of RMATCH isn't
actually used in this definition. Both macros rely on names that are in scope
inside match(): rrc, mstart and rdepth. */

#ifndef NO_RECURSE
#define REGISTER register

#ifdef DEBUG
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra
#endif

#else


/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes.

RMATCH obtains a new frame, records in the current frame which call site (rw)
to resume at, fills in the new frame's argument fields, and jumps to
HEAP_RECURSE. The label L_<rw> is where control comes back to afterwards. If
the new frame cannot be obtained, the current frame is unwound with
PCRE_ERROR_NOMEMORY, exactly as for any other error result, instead of
dereferencing a NULL pointer. */

#define REGISTER

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN frees the current frame and makes its predecessor current. If there
is a predecessor, the result is handed over in rrc and control goes to
HEAP_RETURN; otherwise this was the top-level frame and match() really
returns. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }


/* Structure for remembering the local variables in a private frame. There is
one frame for each level of "recursion"; the frames are chained through
Xprevframe. */

typedef struct heapframe {
  struct heapframe *Xprevframe;   /* Calling frame; NULL marks the top level */

  /* Function arguments that may change */

  const uschar *Xeptr;
  const uschar *Xecode;
  const uschar *Xmstart;
  int Xoffset_top;
  long int Xims;
  eptrblock *Xeptrb;
  int Xflags;
  unsigned int Xrdepth;

  /* Function local variables */

  const uschar *Xcallpat;
  const uschar *Xcharptr;
  const uschar *Xdata;
  const uschar *Xnext;
  const uschar *Xpp;
  const uschar *Xprev;
  const uschar *Xsaved_eptr;

  recursion_info Xnew_recursive;

  BOOL Xcur_is_word;
  BOOL Xcondition;
  BOOL Xprev_is_word;

  unsigned long int Xoriginal_ims;

#ifdef SUPPORT_UCP
  int Xprop_type;
  int Xprop_value;
  int Xprop_fail_result;
  int Xprop_category;
  int Xprop_chartype;
  int Xprop_script;
  int Xoclength;
  uschar Xocchars[8];
#endif

  int Xctype;
  unsigned int Xfc;
  int Xfi;
  int Xlength;
  int Xmax;
  int Xmin;
  int Xnumber;
  int Xoffset;
  int Xop;
  int Xsave_capture_last;
  int Xsave_offset1, Xsave_offset2, Xsave_offset3;
  int Xstacksave[REC_STACK_SAVE_MAX];

  eptrblock Xnewptrb;

  /* Where to jump back to: the RMATCH call-site number stored by RMATCH */

  int Xwhere;

} heapframe;

#endif
|
385 |
|
386 |
|
387 /*************************************************************************** |
|
388 ***************************************************************************/ |
|
389 |
|
390 |
|
391 |
|
392 /************************************************* |
|
393 * Match from current position * |
|
394 *************************************************/ |
|
395 |
|
396 /* This function is called recursively in many circumstances. Whenever it |
|
397 returns a negative (error) response, the outer incarnation must also return the |
|
398 same response. |
|
399 |
|
400 Performance note: It might be tempting to extract commonly used fields from the |
|
401 md structure (e.g. utf8, end_subject) into individual variables to improve |
|
402 performance. Tests using gcc on a SPARC disproved this; in the first case, it |
|
403 made performance worse. |
|
404 |
|
405 Arguments: |
|
406 eptr pointer to current character in subject |
|
407 ecode pointer to current position in compiled code |
|
408 mstart pointer to the current match start position (can be modified |
|
409 by encountering \K) |
|
410 offset_top current top pointer |
|
411 md pointer to "static" info for the match |
|
412 ims current /i, /m, and /s options |
|
413 eptrb pointer to chain of blocks containing eptr at start of |
|
414 brackets - for testing for empty matches |
|
415 flags can contain |
|
416 match_condassert - this is an assertion condition |
|
417 match_cbegroup - this is the start of an unlimited repeat |
|
418 group that can match an empty string |
|
419 rdepth the recursion depth |
|
420 |
|
421 Returns: MATCH_MATCH if matched ) these values are >= 0 |
|
422 MATCH_NOMATCH if failed to match ) |
|
423 a negative PCRE_ERROR_xxx value if aborted by an error condition |
|
424 (e.g. stopped by repeated call or recursion limit) |
|
425 */ |
|
426 |
|
427 static int |
|
428 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart, |
|
429 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb, |
|
430 int flags, unsigned int rdepth) |
|
431 { |
|
432 /* These variables do not need to be preserved over recursion in this function, |
|
433 so they can be ordinary variables in all cases. Mark some of them with |
|
434 "register" because they are used a lot in loops. */ |
|
435 |
|
436 register int rrc; /* Returns from recursive calls */ |
|
437 register int i; /* Used for loops not involving calls to RMATCH() */ |
|
438 register unsigned int c; /* Character values not kept over RMATCH() calls */ |
|
439 register BOOL utf8; /* Local copy of UTF-8 flag for speed */ |
|
440 |
|
441 BOOL minimize, possessive; /* Quantifier options */ |
|
442 |
|
443 /* When recursion is not being used, all "local" variables that have to be |
|
444 preserved over calls to RMATCH() are part of a "frame" which is obtained from |
|
445 heap storage. Set up the top-level frame here; others are obtained from the |
|
446 heap whenever RMATCH() does a "recursion". See the macro definitions above. */ |
|
447 |
|
448 #ifdef NO_RECURSE |
|
449 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe)); |
|
450 frame->Xprevframe = NULL; /* Marks the top level */ |
|
451 |
|
452 /* Copy in the original argument variables */ |
|
453 |
|
454 frame->Xeptr = eptr; |
|
455 frame->Xecode = ecode; |
|
456 frame->Xmstart = mstart; |
|
457 frame->Xoffset_top = offset_top; |
|
458 frame->Xims = ims; |
|
459 frame->Xeptrb = eptrb; |
|
460 frame->Xflags = flags; |
|
461 frame->Xrdepth = rdepth; |
|
462 |
|
463 /* This is where control jumps back to to effect "recursion" */ |
|
464 |
|
465 HEAP_RECURSE: |
|
466 |
|
467 /* Macros make the argument variables come from the current frame */ |
|
468 |
|
469 #define eptr frame->Xeptr |
|
470 #define ecode frame->Xecode |
|
471 #define mstart frame->Xmstart |
|
472 #define offset_top frame->Xoffset_top |
|
473 #define ims frame->Xims |
|
474 #define eptrb frame->Xeptrb |
|
475 #define flags frame->Xflags |
|
476 #define rdepth frame->Xrdepth |
|
477 |
|
478 /* Ditto for the local variables */ |
|
479 |
|
480 #ifdef SUPPORT_UTF8 |
|
481 #define charptr frame->Xcharptr |
|
482 #endif |
|
483 #define callpat frame->Xcallpat |
|
484 #define data frame->Xdata |
|
485 #define next frame->Xnext |
|
486 #define pp frame->Xpp |
|
487 #define prev frame->Xprev |
|
488 #define saved_eptr frame->Xsaved_eptr |
|
489 |
|
490 #define new_recursive frame->Xnew_recursive |
|
491 |
|
492 #define cur_is_word frame->Xcur_is_word |
|
493 #define condition frame->Xcondition |
|
494 #define prev_is_word frame->Xprev_is_word |
|
495 |
|
496 #define original_ims frame->Xoriginal_ims |
|
497 |
|
498 #ifdef SUPPORT_UCP |
|
499 #define prop_type frame->Xprop_type |
|
500 #define prop_value frame->Xprop_value |
|
501 #define prop_fail_result frame->Xprop_fail_result |
|
502 #define prop_category frame->Xprop_category |
|
503 #define prop_chartype frame->Xprop_chartype |
|
504 #define prop_script frame->Xprop_script |
|
505 #define oclength frame->Xoclength |
|
506 #define occhars frame->Xocchars |
|
507 #endif |
|
508 |
|
509 #define ctype frame->Xctype |
|
510 #define fc frame->Xfc |
|
511 #define fi frame->Xfi |
|
512 #define length frame->Xlength |
|
513 #define max frame->Xmax |
|
514 #define min frame->Xmin |
|
515 #define number frame->Xnumber |
|
516 #define offset frame->Xoffset |
|
517 #define op frame->Xop |
|
518 #define save_capture_last frame->Xsave_capture_last |
|
519 #define save_offset1 frame->Xsave_offset1 |
|
520 #define save_offset2 frame->Xsave_offset2 |
|
521 #define save_offset3 frame->Xsave_offset3 |
|
522 #define stacksave frame->Xstacksave |
|
523 |
|
524 #define newptrb frame->Xnewptrb |
|
525 |
|
526 /* When recursion is being used, local variables are allocated on the stack and |
|
527 get preserved during recursion in the normal way. In this environment, fi and |
|
528 i, and fc and c, can be the same variables. */ |
|
529 |
|
530 #else /* NO_RECURSE not defined */ |
|
531 #define fi i |
|
532 #define fc c |
|
533 |
|
534 |
|
535 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */ |
|
536 const uschar *charptr; /* in small blocks of the code. My normal */ |
|
537 #endif /* style of coding would have declared */ |
|
538 const uschar *callpat; /* them within each of those blocks. */ |
|
539 const uschar *data; /* However, in order to accommodate the */ |
|
540 const uschar *next; /* version of this code that uses an */ |
|
541 USPTR pp; /* external "stack" implemented on the */ |
|
542 const uschar *prev; /* heap, it is easier to declare them all */ |
|
543 USPTR saved_eptr; /* here, so the declarations can be cut */ |
|
544 /* out in a block. The only declarations */ |
|
545 recursion_info new_recursive; /* within blocks below are for variables */ |
|
546 /* that do not have to be preserved over */ |
|
547 BOOL cur_is_word; /* a recursive call to RMATCH(). */ |
|
548 BOOL condition; |
|
549 BOOL prev_is_word; |
|
550 |
|
551 unsigned long int original_ims; |
|
552 |
|
553 #ifdef SUPPORT_UCP |
|
554 int prop_type; |
|
555 int prop_value; |
|
556 int prop_fail_result; |
|
557 int prop_category; |
|
558 int prop_chartype; |
|
559 int prop_script; |
|
560 int oclength; |
|
561 uschar occhars[8]; |
|
562 #endif |
|
563 |
|
564 int ctype; |
|
565 int length; |
|
566 int max; |
|
567 int min; |
|
568 int number; |
|
569 int offset; |
|
570 int op; |
|
571 int save_capture_last; |
|
572 int save_offset1, save_offset2, save_offset3; |
|
573 int stacksave[REC_STACK_SAVE_MAX]; |
|
574 |
|
575 eptrblock newptrb; |
|
576 #endif /* NO_RECURSE */ |
|
577 |
|
578 /* These statements are here to stop the compiler complaining about unitialized |
|
579 variables. */ |
|
580 |
|
581 #ifdef SUPPORT_UCP |
|
582 prop_value = 0; |
|
583 prop_fail_result = 0; |
|
584 #endif |
|
585 |
|
586 |
|
587 /* This label is used for tail recursion, which is used in a few cases even |
|
588 when NO_RECURSE is not defined, in order to reduce the amount of stack that is |
|
589 used. Thanks to Ian Taylor for noticing this possibility and sending the |
|
590 original patch. */ |
|
591 |
|
592 TAIL_RECURSE: |
|
593 |
|
594 /* OK, now we can get on with the real code of the function. Recursive calls |
|
595 are specified by the macro RMATCH and RRETURN is used to return. When |
|
596 NO_RECURSE is *not* defined, these just turn into a recursive call to match() |
|
597 and a "return", respectively (possibly with some debugging if DEBUG is |
|
598 defined). However, RMATCH isn't like a function call because it's quite a |
|
599 complicated macro. It has to be used in one particular way. This shouldn't, |
|
600 however, impact performance when true recursion is being used. */ |
|
601 |
|
602 #ifdef SUPPORT_UTF8 |
|
603 utf8 = md->utf8; /* Local copy of the flag */ |
|
604 #else |
|
605 utf8 = FALSE; |
|
606 #endif |
|
607 |
|
608 /* First check that we haven't called match() too many times, or that we |
|
609 haven't exceeded the recursive call limit. */ |
|
610 |
|
611 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT); |
|
612 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT); |
|
613 |
|
614 original_ims = ims; /* Save for resetting on ')' */ |
|
615 |
|
616 /* At the start of a group with an unlimited repeat that may match an empty |
|
617 string, the match_cbegroup flag is set. When this is the case, add the current |
|
618 subject pointer to the chain of such remembered pointers, to be checked when we |
|
619 hit the closing ket, in order to break infinite loops that match no characters. |
|
620 When match() is called in other circumstances, don't add to the chain. The |
|
621 match_cbegroup flag must NOT be used with tail recursion, because the memory |
|
622 block that is used is on the stack, so a new one may be required for each |
|
623 match(). */ |
|
624 |
|
625 if ((flags & match_cbegroup) != 0) |
|
626 { |
|
627 newptrb.epb_saved_eptr = eptr; |
|
628 newptrb.epb_prev = eptrb; |
|
629 eptrb = &newptrb; |
|
630 } |
|
631 |
|
632 /* Now start processing the opcodes. */ |
|
633 |
|
634 for (;;) |
|
635 { |
|
636 minimize = possessive = FALSE; |
|
637 op = *ecode; |
|
638 |
|
639 /* For partial matching, remember if we ever hit the end of the subject after |
|
640 matching at least one subject character. */ |
|
641 |
|
642 if (md->partial && |
|
643 eptr >= md->end_subject && |
|
644 eptr > mstart) |
|
645 md->hitend = TRUE; |
|
646 |
|
647 switch(op) |
|
648 { |
|
649 case OP_FAIL: |
|
650 RRETURN(MATCH_NOMATCH); |
|
651 |
|
652 case OP_PRUNE: |
|
653 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, |
|
654 ims, eptrb, flags, RM51); |
|
655 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
656 RRETURN(MATCH_PRUNE); |
|
657 |
|
658 case OP_COMMIT: |
|
659 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, |
|
660 ims, eptrb, flags, RM52); |
|
661 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
662 RRETURN(MATCH_COMMIT); |
|
663 |
|
664 case OP_SKIP: |
|
665 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, |
|
666 ims, eptrb, flags, RM53); |
|
667 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
668 md->start_match_ptr = eptr; /* Pass back current position */ |
|
669 RRETURN(MATCH_SKIP); |
|
670 |
|
671 case OP_THEN: |
|
672 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, |
|
673 ims, eptrb, flags, RM54); |
|
674 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
675 RRETURN(MATCH_THEN); |
|
676 |
|
677 /* Handle a capturing bracket. If there is space in the offset vector, save |
|
678 the current subject position in the working slot at the top of the vector. |
|
679 We mustn't change the current values of the data slot, because they may be |
|
680 set from a previous iteration of this group, and be referred to by a |
|
681 reference inside the group. |
|
682 |
|
683 If the bracket fails to match, we need to restore this value and also the |
|
684 values of the final offsets, in case they were set by a previous iteration |
|
685 of the same bracket. |
|
686 |
|
687 If there isn't enough space in the offset vector, treat this as if it were |
|
688 a non-capturing bracket. Don't worry about setting the flag for the error |
|
689 case here; that is handled in the code for KET. */ |
|
690 |
|
691 case OP_CBRA: |
|
692 case OP_SCBRA: |
|
693 number = GET2(ecode, 1+LINK_SIZE); |
|
694 offset = number << 1; |
|
695 |
|
696 #ifdef DEBUG |
|
697 printf("start bracket %d\n", number); |
|
698 printf("subject="); |
|
699 pchars(eptr, 16, TRUE, md); |
|
700 printf("\n"); |
|
701 #endif |
|
702 |
|
703 if (offset < md->offset_max) |
|
704 { |
|
705 save_offset1 = md->offset_vector[offset]; |
|
706 save_offset2 = md->offset_vector[offset+1]; |
|
707 save_offset3 = md->offset_vector[md->offset_end - number]; |
|
708 save_capture_last = md->capture_last; |
|
709 |
|
710 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); |
|
711 md->offset_vector[md->offset_end - number] = eptr - md->start_subject; |
|
712 |
|
713 flags = (op == OP_SCBRA)? match_cbegroup : 0; |
|
714 do |
|
715 { |
|
716 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, |
|
717 ims, eptrb, flags, RM1); |
|
718 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); |
|
719 md->capture_last = save_capture_last; |
|
720 ecode += GET(ecode, 1); |
|
721 } |
|
722 while (*ecode == OP_ALT); |
|
723 |
|
724 DPRINTF(("bracket %d failed\n", number)); |
|
725 |
|
726 md->offset_vector[offset] = save_offset1; |
|
727 md->offset_vector[offset+1] = save_offset2; |
|
728 md->offset_vector[md->offset_end - number] = save_offset3; |
|
729 |
|
730 RRETURN(MATCH_NOMATCH); |
|
731 } |
|
732 |
|
733 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat |
|
734 as a non-capturing bracket. */ |
|
735 |
|
736 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ |
|
737 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ |
|
738 |
|
739 DPRINTF(("insufficient capture room: treat as non-capturing\n")); |
|
740 |
|
741 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ |
|
742 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ |
|
743 |
|
744 /* Non-capturing bracket. Loop for all the alternatives. When we get to the |
|
745 final alternative within the brackets, we would return the result of a |
|
746 recursive call to match() whatever happened. We can reduce stack usage by |
|
747 turning this into a tail recursion, except in the case when match_cbegroup |
|
748 is set.*/ |
|
749 |
|
750 case OP_BRA: |
|
751 case OP_SBRA: |
|
752 DPRINTF(("start non-capturing bracket\n")); |
|
753 flags = (op >= OP_SBRA)? match_cbegroup : 0; |
|
754 for (;;) |
|
755 { |
|
756 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */ |
|
757 { |
|
758 if (flags == 0) /* Not a possibly empty group */ |
|
759 { |
|
760 ecode += _pcre_OP_lengths[*ecode]; |
|
761 DPRINTF(("bracket 0 tail recursion\n")); |
|
762 goto TAIL_RECURSE; |
|
763 } |
|
764 |
|
765 /* Possibly empty group; can't use tail recursion. */ |
|
766 |
|
767 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, |
|
768 eptrb, flags, RM48); |
|
769 RRETURN(rrc); |
|
770 } |
|
771 |
|
772 /* For non-final alternatives, continue the loop for a NOMATCH result; |
|
773 otherwise return. */ |
|
774 |
|
775 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, |
|
776 eptrb, flags, RM2); |
|
777 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); |
|
778 ecode += GET(ecode, 1); |
|
779 } |
|
780 /* Control never reaches here. */ |
|
781 |
|
782 /* Conditional group: compilation checked that there are no more than |
|
783 two branches. If the condition is false, skipping the first branch takes us |
|
784 past the end if there is only one branch, but that's OK because that is |
|
785 exactly what going to the ket would do. As there is only one branch to be |
|
786 obeyed, we can use tail recursion to avoid using another stack frame. */ |
|
787 |
|
788 case OP_COND: |
|
789 case OP_SCOND: |
|
790 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */ |
|
791 { |
|
792 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ |
|
793 condition = md->recursive != NULL && |
|
794 (offset == RREF_ANY || offset == md->recursive->group_num); |
|
795 ecode += condition? 3 : GET(ecode, 1); |
|
796 } |
|
797 |
|
798 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */ |
|
799 { |
|
800 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ |
|
801 condition = offset < offset_top && md->offset_vector[offset] >= 0; |
|
802 ecode += condition? 3 : GET(ecode, 1); |
|
803 } |
|
804 |
|
805 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */ |
|
806 { |
|
807 condition = FALSE; |
|
808 ecode += GET(ecode, 1); |
|
809 } |
|
810 |
|
811 /* The condition is an assertion. Call match() to evaluate it - setting |
|
812 the final argument match_condassert causes it to stop at the end of an |
|
813 assertion. */ |
|
814 |
|
815 else |
|
816 { |
|
817 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, |
|
818 match_condassert, RM3); |
|
819 if (rrc == MATCH_MATCH) |
|
820 { |
|
821 condition = TRUE; |
|
822 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2); |
|
823 while (*ecode == OP_ALT) ecode += GET(ecode, 1); |
|
824 } |
|
825 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) |
|
826 { |
|
827 RRETURN(rrc); /* Need braces because of following else */ |
|
828 } |
|
829 else |
|
830 { |
|
831 condition = FALSE; |
|
832 ecode += GET(ecode, 1); |
|
833 } |
|
834 } |
|
835 |
|
836 /* We are now at the branch that is to be obeyed. As there is only one, |
|
837 we can use tail recursion to avoid using another stack frame, except when |
|
838 match_cbegroup is required for an unlimited repeat of a possibly empty |
|
839 group. If the second alternative doesn't exist, we can just plough on. */ |
|
840 |
|
841 if (condition || *ecode == OP_ALT) |
|
842 { |
|
843 ecode += 1 + LINK_SIZE; |
|
844 if (op == OP_SCOND) /* Possibly empty group */ |
|
845 { |
|
846 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49); |
|
847 RRETURN(rrc); |
|
848 } |
|
849 else /* Group must match something */ |
|
850 { |
|
851 flags = 0; |
|
852 goto TAIL_RECURSE; |
|
853 } |
|
854 } |
|
855 else /* Condition false & no 2nd alternative */ |
|
856 { |
|
857 ecode += 1 + LINK_SIZE; |
|
858 } |
|
859 break; |
|
860 |
|
861 |
|
862 /* End of the pattern, either real or forced. If we are in a top-level |
|
863 recursion, we should restore the offsets appropriately and continue from |
|
864 after the call. */ |
|
865 |
|
866 case OP_ACCEPT: |
|
867 case OP_END: |
|
868 if (md->recursive != NULL && md->recursive->group_num == 0) |
|
869 { |
|
870 recursion_info *rec = md->recursive; |
|
871 DPRINTF(("End of pattern in a (?0) recursion\n")); |
|
872 md->recursive = rec->prevrec; |
|
873 memmove(md->offset_vector, rec->offset_save, |
|
874 rec->saved_max * sizeof(int)); |
|
875 mstart = rec->save_start; |
|
876 ims = original_ims; |
|
877 ecode = rec->after_call; |
|
878 break; |
|
879 } |
|
880 |
|
881 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty |
|
882 string - backtracking will then try other alternatives, if any. */ |
|
883 |
|
884 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH); |
|
885 md->end_match_ptr = eptr; /* Record where we ended */ |
|
886 md->end_offset_top = offset_top; /* and how many extracts were taken */ |
|
887 md->start_match_ptr = mstart; /* and the start (\K can modify) */ |
|
888 RRETURN(MATCH_MATCH); |
|
889 |
|
890 /* Change option settings */ |
|
891 |
|
892 case OP_OPT: |
|
893 ims = ecode[1]; |
|
894 ecode += 2; |
|
895 DPRINTF(("ims set to %02lx\n", ims)); |
|
896 break; |
|
897 |
|
898 /* Assertion brackets. Check the alternative branches in turn - the |
|
899 matching won't pass the KET for an assertion. If any one branch matches, |
|
900 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the |
|
901 start of each branch to move the current point backwards, so the code at |
|
902 this level is identical to the lookahead case. */ |
|
903 |
|
904 case OP_ASSERT: |
|
905 case OP_ASSERTBACK: |
|
906 do |
|
907 { |
|
908 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0, |
|
909 RM4); |
|
910 if (rrc == MATCH_MATCH) break; |
|
911 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); |
|
912 ecode += GET(ecode, 1); |
|
913 } |
|
914 while (*ecode == OP_ALT); |
|
915 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); |
|
916 |
|
917 /* If checking an assertion for a condition, return MATCH_MATCH. */ |
|
918 |
|
919 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); |
|
920 |
|
921 /* Continue from after the assertion, updating the offsets high water |
|
922 mark, since extracts may have been taken during the assertion. */ |
|
923 |
|
924 do ecode += GET(ecode,1); while (*ecode == OP_ALT); |
|
925 ecode += 1 + LINK_SIZE; |
|
926 offset_top = md->end_offset_top; |
|
927 continue; |
|
928 |
|
929 /* Negative assertion: all branches must fail to match */ |
|
930 |
|
931 case OP_ASSERT_NOT: |
|
932 case OP_ASSERTBACK_NOT: |
|
933 do |
|
934 { |
|
935 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0, |
|
936 RM5); |
|
937 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH); |
|
938 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); |
|
939 ecode += GET(ecode,1); |
|
940 } |
|
941 while (*ecode == OP_ALT); |
|
942 |
|
943 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); |
|
944 |
|
945 ecode += 1 + LINK_SIZE; |
|
946 continue; |
|
947 |
|
948 /* Move the subject pointer back. This occurs only at the start of |
|
949 each branch of a lookbehind assertion. If we are too close to the start to |
|
950 move back, this match function fails. When working with UTF-8 we move |
|
951 back a number of characters, not bytes. */ |
|
952 |
|
953 case OP_REVERSE: |
|
954 #ifdef SUPPORT_UTF8 |
|
955 if (utf8) |
|
956 { |
|
957 i = GET(ecode, 1); |
|
958 while (i-- > 0) |
|
959 { |
|
960 eptr--; |
|
961 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); |
|
962 BACKCHAR(eptr); |
|
963 } |
|
964 } |
|
965 else |
|
966 #endif |
|
967 |
|
968 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ |
|
969 |
|
970 { |
|
971 eptr -= GET(ecode, 1); |
|
972 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); |
|
973 } |
|
974 |
|
975 /* Skip to next op code */ |
|
976 |
|
977 ecode += 1 + LINK_SIZE; |
|
978 break; |
|
979 |
|
980 /* The callout item calls an external function, if one is provided, passing |
|
981 details of the match so far. This is mainly for debugging, though the |
|
982 function is able to force a failure. */ |
|
983 |
|
984 case OP_CALLOUT: |
|
985 if (pcre_callout != NULL) |
|
986 { |
|
987 pcre_callout_block cb; |
|
988 cb.version = 1; /* Version 1 of the callout block */ |
|
989 cb.callout_number = ecode[1]; |
|
990 cb.offset_vector = md->offset_vector; |
|
991 cb.subject = (PCRE_SPTR)md->start_subject; |
|
992 cb.subject_length = md->end_subject - md->start_subject; |
|
993 cb.start_match = mstart - md->start_subject; |
|
994 cb.current_position = eptr - md->start_subject; |
|
995 cb.pattern_position = GET(ecode, 2); |
|
996 cb.next_item_length = GET(ecode, 2 + LINK_SIZE); |
|
997 cb.capture_top = offset_top/2; |
|
998 cb.capture_last = md->capture_last; |
|
999 cb.callout_data = md->callout_data; |
|
1000 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH); |
|
1001 if (rrc < 0) RRETURN(rrc); |
|
1002 } |
|
1003 ecode += 2 + 2*LINK_SIZE; |
|
1004 break; |
|
1005 |
|
1006 /* Recursion either matches the current regex, or some subexpression. The |
|
1007 offset data is the offset to the starting bracket from the start of the |
|
1008 whole pattern. (This is so that it works from duplicated subpatterns.) |
|
1009 |
|
1010 If there are any capturing brackets started but not finished, we have to |
|
1011 save their starting points and reinstate them after the recursion. However, |
|
1012 we don't know how many such there are (offset_top records the completed |
|
1013 total) so we just have to save all the potential data. There may be up to |
|
1014 65535 such values, which is too large to put on the stack, but using malloc |
|
1015 for small numbers seems expensive. As a compromise, the stack is used when |
|
1016 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc |
|
1017 is used. If the malloc fails, there is nowhere to save the offset values, |
|
1018 so matching is abandoned at this point by returning the error code |
|
1019 PCRE_ERROR_NOMEMORY. |
|
1020 |
|
1021 There are also other values that have to be saved. We use a chained |
|
1022 sequence of blocks that actually live on the stack. Thanks to Robin Houston |
|
1023 for the original version of this logic. */ |
|
1024 |
|
1025 case OP_RECURSE: |
|
1026 { |
|
1027 callpat = md->start_code + GET(ecode, 1); |
|
1028 new_recursive.group_num = (callpat == md->start_code)? 0 : |
|
1029 GET2(callpat, 1 + LINK_SIZE); |
|
1030 |
|
1031 /* Add to "recursing stack" */ |
|
1032 |
|
1033 new_recursive.prevrec = md->recursive; |
|
1034 md->recursive = &new_recursive; |
|
1035 |
|
1036 /* Find where to continue from afterwards */ |
|
1037 |
|
1038 ecode += 1 + LINK_SIZE; |
|
1039 new_recursive.after_call = ecode; |
|
1040 |
|
1041 /* Now save the offset data. */ |
|
1042 |
|
1043 new_recursive.saved_max = md->offset_end; |
|
1044 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX) |
|
1045 new_recursive.offset_save = stacksave; |
|
1046 else |
|
1047 { |
|
1048 new_recursive.offset_save = |
|
1049 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int)); |
|
1050 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY); |
|
1051 } |
|
1052 |
|
1053 memcpy(new_recursive.offset_save, md->offset_vector, |
|
1054 new_recursive.saved_max * sizeof(int)); |
|
1055 new_recursive.save_start = mstart; |
|
1056 mstart = eptr; |
|
1057 |
|
1058 /* OK, now we can do the recursion. For each top-level alternative we |
|
1059 restore the offset and recursion data. */ |
|
1060 |
|
1061 DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); |
|
1062 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0; |
|
1063 do |
|
1064 { |
|
1065 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top, |
|
1066 md, ims, eptrb, flags, RM6); |
|
1067 if (rrc == MATCH_MATCH) |
|
1068 { |
|
1069 DPRINTF(("Recursion matched\n")); |
|
1070 md->recursive = new_recursive.prevrec; |
|
1071 if (new_recursive.offset_save != stacksave) |
|
1072 (pcre_free)(new_recursive.offset_save); |
|
1073 RRETURN(MATCH_MATCH); |
|
1074 } |
|
1075 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) |
|
1076 { |
|
1077 DPRINTF(("Recursion gave error %d\n", rrc)); |
|
1078 RRETURN(rrc); |
|
1079 } |
|
1080 |
|
1081 md->recursive = &new_recursive; |
|
1082 memcpy(md->offset_vector, new_recursive.offset_save, |
|
1083 new_recursive.saved_max * sizeof(int)); |
|
1084 callpat += GET(callpat, 1); |
|
1085 } |
|
1086 while (*callpat == OP_ALT); |
|
1087 |
|
1088 DPRINTF(("Recursion didn't match\n")); |
|
1089 md->recursive = new_recursive.prevrec; |
|
1090 if (new_recursive.offset_save != stacksave) |
|
1091 (pcre_free)(new_recursive.offset_save); |
|
1092 RRETURN(MATCH_NOMATCH); |
|
1093 } |
|
1094 /* Control never reaches here */ |
|
1095 |
|
1096 /* "Once" brackets are like assertion brackets except that after a match, |
|
1097 the point in the subject string is not moved back. Thus there can never be |
|
1098 a move back into the brackets. Friedl calls these "atomic" subpatterns. |
|
1099 Check the alternative branches in turn - the matching won't pass the KET |
|
1100 for this kind of subpattern. If any one branch matches, we carry on as at |
|
1101 the end of a normal bracket, leaving the subject pointer. */ |
|
1102 |
|
1103 case OP_ONCE: |
|
1104 prev = ecode; |
|
1105 saved_eptr = eptr; |
|
1106 |
|
1107 do |
|
1108 { |
|
1109 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7); |
|
1110 if (rrc == MATCH_MATCH) break; |
|
1111 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); |
|
1112 ecode += GET(ecode,1); |
|
1113 } |
|
1114 while (*ecode == OP_ALT); |
|
1115 |
|
1116 /* If we hit the end of the group (which could be repeated), fail */ |
|
1117 |
|
1118 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH); |
|
1119 |
|
1120 /* Continue from after the atomic group, updating the offsets high water |
|
1121 mark, since extracts may have been taken. */ |
|
1122 |
|
1123 do ecode += GET(ecode, 1); while (*ecode == OP_ALT); |
|
1124 |
|
1125 offset_top = md->end_offset_top; |
|
1126 eptr = md->end_match_ptr; |
|
1127 |
|
1128 /* For a non-repeating ket, just continue at this level. This also |
|
1129 happens for a repeating ket if no characters were matched in the group. |
|
1130 This is the forcible breaking of infinite loops as implemented in Perl |
|
1131 5.005. If there is an options reset, it will get obeyed in the normal |
|
1132 course of events. */ |
|
1133 |
|
1134 if (*ecode == OP_KET || eptr == saved_eptr) |
|
1135 { |
|
1136 ecode += 1+LINK_SIZE; |
|
1137 break; |
|
1138 } |
|
1139 |
|
1140 /* The repeating kets try the rest of the pattern or restart from the |
|
1141 preceding bracket, in the appropriate order. The second "call" of match() |
|
1142 uses tail recursion, to avoid using another stack frame. We need to reset |
|
1143 any options that changed within the bracket before re-running it, so |
|
1144 check the next opcode. */ |
|
1145 |
|
1146 if (ecode[1+LINK_SIZE] == OP_OPT) |
|
1147 { |
|
1148 ims = (ims & ~PCRE_IMS) | ecode[4]; |
|
1149 DPRINTF(("ims set to %02lx at group repeat\n", ims)); |
|
1150 } |
|
1151 |
|
1152 if (*ecode == OP_KETRMIN) |
|
1153 { |
|
1154 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8); |
|
1155 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1156 ecode = prev; |
|
1157 flags = 0; |
|
1158 goto TAIL_RECURSE; |
|
1159 } |
|
1160 else /* OP_KETRMAX */ |
|
1161 { |
|
1162 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9); |
|
1163 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1164 ecode += 1 + LINK_SIZE; |
|
1165 flags = 0; |
|
1166 goto TAIL_RECURSE; |
|
1167 } |
|
1168 /* Control never gets here */ |
|
1169 |
|
1170 /* An alternation is the end of a branch; scan along to find the end of the |
|
1171 bracketed group and go to there. */ |
|
1172 |
|
1173 case OP_ALT: |
|
1174 do ecode += GET(ecode,1); while (*ecode == OP_ALT); |
|
1175 break; |
|
1176 |
|
1177 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group, |
|
1178 indicating that it may occur zero times. It may repeat infinitely, or not |
|
1179 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets |
|
1180 with fixed upper repeat limits are compiled as a number of copies, with the |
|
1181 optional ones preceded by BRAZERO or BRAMINZERO. */ |
|
1182 |
|
1183 case OP_BRAZERO: |
|
1184 { |
|
1185 next = ecode+1; |
|
1186 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10); |
|
1187 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1188 do next += GET(next,1); while (*next == OP_ALT); |
|
1189 ecode = next + 1 + LINK_SIZE; |
|
1190 } |
|
1191 break; |
|
1192 |
|
1193 case OP_BRAMINZERO: |
|
1194 { |
|
1195 next = ecode+1; |
|
1196 do next += GET(next, 1); while (*next == OP_ALT); |
|
1197 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11); |
|
1198 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1199 ecode++; |
|
1200 } |
|
1201 break; |
|
1202 |
|
1203 case OP_SKIPZERO: |
|
1204 { |
|
1205 next = ecode+1; |
|
1206 do next += GET(next,1); while (*next == OP_ALT); |
|
1207 ecode = next + 1 + LINK_SIZE; |
|
1208 } |
|
1209 break; |
|
1210 |
|
1211 /* End of a group, repeated or non-repeating. */ |
|
1212 |
|
1213 case OP_KET: |
|
1214 case OP_KETRMIN: |
|
1215 case OP_KETRMAX: |
|
1216 prev = ecode - GET(ecode, 1); |
|
1217 |
|
1218 /* If this was a group that remembered the subject start, in order to break |
|
1219 infinite repeats of empty string matches, retrieve the subject start from |
|
1220 the chain. Otherwise, set it NULL. */ |
|
1221 |
|
1222 if (*prev >= OP_SBRA) |
|
1223 { |
|
1224 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */ |
|
1225 eptrb = eptrb->epb_prev; /* Backup to previous group */ |
|
1226 } |
|
1227 else saved_eptr = NULL; |
|
1228 |
|
1229 /* If we are at the end of an assertion group, stop matching and return |
|
1230 MATCH_MATCH, but record the current high water mark for use by positive |
|
1231 assertions. Do this also for the "once" (atomic) groups. */ |
|
1232 |
|
1233 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || |
|
1234 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT || |
|
1235 *prev == OP_ONCE) |
|
1236 { |
|
1237 md->end_match_ptr = eptr; /* For ONCE */ |
|
1238 md->end_offset_top = offset_top; |
|
1239 RRETURN(MATCH_MATCH); |
|
1240 } |
|
1241 |
|
1242 /* For capturing groups we have to check the group number back at the start |
|
1243 and if necessary complete handling an extraction by setting the offsets and |
|
1244 bumping the high water mark. Note that whole-pattern recursion is coded as |
|
1245 a recurse into group 0, so it won't be picked up here. Instead, we catch it |
|
1246 when the OP_END is reached. Other recursion is handled here. */ |
|
1247 |
|
1248 if (*prev == OP_CBRA || *prev == OP_SCBRA) |
|
1249 { |
|
1250 number = GET2(prev, 1+LINK_SIZE); |
|
1251 offset = number << 1; |
|
1252 |
|
1253 #ifdef DEBUG |
|
1254 printf("end bracket %d", number); |
|
1255 printf("\n"); |
|
1256 #endif |
|
1257 |
|
1258 md->capture_last = number; |
|
1259 if (offset >= md->offset_max) md->offset_overflow = TRUE; else |
|
1260 { |
|
1261 md->offset_vector[offset] = |
|
1262 md->offset_vector[md->offset_end - number]; |
|
1263 md->offset_vector[offset+1] = eptr - md->start_subject; |
|
1264 if (offset_top <= offset) offset_top = offset + 2; |
|
1265 } |
|
1266 |
|
1267 /* Handle a recursively called group. Restore the offsets |
|
1268 appropriately and continue from after the call. */ |
|
1269 |
|
1270 if (md->recursive != NULL && md->recursive->group_num == number) |
|
1271 { |
|
1272 recursion_info *rec = md->recursive; |
|
1273 DPRINTF(("Recursion (%d) succeeded - continuing\n", number)); |
|
1274 md->recursive = rec->prevrec; |
|
1275 mstart = rec->save_start; |
|
1276 memcpy(md->offset_vector, rec->offset_save, |
|
1277 rec->saved_max * sizeof(int)); |
|
1278 ecode = rec->after_call; |
|
1279 ims = original_ims; |
|
1280 break; |
|
1281 } |
|
1282 } |
|
1283 |
|
1284 /* For both capturing and non-capturing groups, reset the value of the ims |
|
1285 flags, in case they got changed during the group. */ |
|
1286 |
|
1287 ims = original_ims; |
|
1288 DPRINTF(("ims reset to %02lx\n", ims)); |
|
1289 |
|
1290 /* For a non-repeating ket, just continue at this level. This also |
|
1291 happens for a repeating ket if no characters were matched in the group. |
|
1292 This is the forcible breaking of infinite loops as implemented in Perl |
|
1293 5.005. If there is an options reset, it will get obeyed in the normal |
|
1294 course of events. */ |
|
1295 |
|
1296 if (*ecode == OP_KET || eptr == saved_eptr) |
|
1297 { |
|
1298 ecode += 1 + LINK_SIZE; |
|
1299 break; |
|
1300 } |
|
1301 |
|
1302 /* The repeating kets try the rest of the pattern or restart from the |
|
1303 preceding bracket, in the appropriate order. In the second case, we can use |
|
1304 tail recursion to avoid using another stack frame, unless we have an |
|
1305 unlimited repeat of a group that can match an empty string. */ |
|
1306 |
|
1307 flags = (*prev >= OP_SBRA)? match_cbegroup : 0; |
|
1308 |
|
1309 if (*ecode == OP_KETRMIN) |
|
1310 { |
|
1311 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12); |
|
1312 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1313 if (flags != 0) /* Could match an empty string */ |
|
1314 { |
|
1315 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50); |
|
1316 RRETURN(rrc); |
|
1317 } |
|
1318 ecode = prev; |
|
1319 goto TAIL_RECURSE; |
|
1320 } |
|
1321 else /* OP_KETRMAX */ |
|
1322 { |
|
1323 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13); |
|
1324 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1325 ecode += 1 + LINK_SIZE; |
|
1326 flags = 0; |
|
1327 goto TAIL_RECURSE; |
|
1328 } |
|
1329 /* Control never gets here */ |
|
1330 |
|
1331 /* Start of subject unless notbol, or after internal newline if multiline */ |
|
1332 |
|
1333 case OP_CIRC: |
|
1334 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); |
|
1335 if ((ims & PCRE_MULTILINE) != 0) |
|
1336 { |
|
1337 if (eptr != md->start_subject && |
|
1338 (eptr == md->end_subject || !WAS_NEWLINE(eptr))) |
|
1339 RRETURN(MATCH_NOMATCH); |
|
1340 ecode++; |
|
1341 break; |
|
1342 } |
|
1343 /* ... else fall through */ |
|
1344 |
|
1345 /* Start of subject assertion */ |
|
1346 |
|
1347 case OP_SOD: |
|
1348 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH); |
|
1349 ecode++; |
|
1350 break; |
|
1351 |
|
1352 /* Start of match assertion */ |
|
1353 |
|
1354 case OP_SOM: |
|
1355 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH); |
|
1356 ecode++; |
|
1357 break; |
|
1358 |
|
1359 /* Reset the start of match point */ |
|
1360 |
|
1361 case OP_SET_SOM: |
|
1362 mstart = eptr; |
|
1363 ecode++; |
|
1364 break; |
|
1365 |
|
1366 /* Assert before internal newline if multiline, or before a terminating |
|
1367 newline unless endonly is set, else end of subject unless noteol is set. */ |
|
1368 |
|
1369 case OP_DOLL: |
|
1370 if ((ims & PCRE_MULTILINE) != 0) |
|
1371 { |
|
1372 if (eptr < md->end_subject) |
|
1373 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); } |
|
1374 else |
|
1375 { if (md->noteol) RRETURN(MATCH_NOMATCH); } |
|
1376 ecode++; |
|
1377 break; |
|
1378 } |
|
1379 else |
|
1380 { |
|
1381 if (md->noteol) RRETURN(MATCH_NOMATCH); |
|
1382 if (!md->endonly) |
|
1383 { |
|
1384 if (eptr != md->end_subject && |
|
1385 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) |
|
1386 RRETURN(MATCH_NOMATCH); |
|
1387 ecode++; |
|
1388 break; |
|
1389 } |
|
1390 } |
|
1391 /* ... else fall through for endonly */ |
|
1392 |
|
1393 /* End of subject assertion (\z) */ |
|
1394 |
|
1395 case OP_EOD: |
|
1396 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1397 ecode++; |
|
1398 break; |
|
1399 |
|
1400 /* End of subject or ending \n assertion (\Z) */ |
|
1401 |
|
1402 case OP_EODN: |
|
1403 if (eptr != md->end_subject && |
|
1404 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) |
|
1405 RRETURN(MATCH_NOMATCH); |
|
1406 ecode++; |
|
1407 break; |
|
1408 |
|
1409 /* Word boundary assertions */ |
|
1410 |
|
1411 case OP_NOT_WORD_BOUNDARY: |
|
1412 case OP_WORD_BOUNDARY: |
|
1413 { |
|
1414 |
|
1415 /* Find out if the previous and current characters are "word" characters. |
|
1416 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to |
|
1417 be "non-word" characters. */ |
|
1418 |
|
1419 #ifdef SUPPORT_UTF8 |
|
1420 if (utf8) |
|
1421 { |
|
1422 if (eptr == md->start_subject) prev_is_word = FALSE; else |
|
1423 { |
|
1424 const uschar *lastptr = eptr - 1; |
|
1425 while((*lastptr & 0xc0) == 0x80) lastptr--; |
|
1426 GETCHAR(c, lastptr); |
|
1427 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; |
|
1428 } |
|
1429 if (eptr >= md->end_subject) cur_is_word = FALSE; else |
|
1430 { |
|
1431 GETCHAR(c, eptr); |
|
1432 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; |
|
1433 } |
|
1434 } |
|
1435 else |
|
1436 #endif |
|
1437 |
|
1438 /* More streamlined when not in UTF-8 mode */ |
|
1439 |
|
1440 { |
|
1441 prev_is_word = (eptr != md->start_subject) && |
|
1442 ((md->ctypes[eptr[-1]] & ctype_word) != 0); |
|
1443 cur_is_word = (eptr < md->end_subject) && |
|
1444 ((md->ctypes[*eptr] & ctype_word) != 0); |
|
1445 } |
|
1446 |
|
1447 /* Now see if the situation is what we want */ |
|
1448 |
|
1449 if ((*ecode++ == OP_WORD_BOUNDARY)? |
|
1450 cur_is_word == prev_is_word : cur_is_word != prev_is_word) |
|
1451 RRETURN(MATCH_NOMATCH); |
|
1452 } |
|
1453 break; |
|
1454 |
|
1455 /* Match a single character type; inline for speed */ |
|
1456 |
|
1457 case OP_ANY: |
|
1458 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); |
|
1459 /* Fall through */ |
|
1460 |
|
1461 case OP_ALLANY: |
|
1462 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1463 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; |
|
1464 ecode++; |
|
1465 break; |
|
1466 |
|
1467 /* Match a single byte, even in UTF-8 mode. This opcode really does match |
|
1468 any byte, even newline, independent of the setting of PCRE_DOTALL. */ |
|
1469 |
|
1470 case OP_ANYBYTE: |
|
1471 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1472 ecode++; |
|
1473 break; |
|
1474 |
|
1475 case OP_NOT_DIGIT: |
|
1476 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1477 GETCHARINCTEST(c, eptr); |
|
1478 if ( |
|
1479 #ifdef SUPPORT_UTF8 |
|
1480 c < 256 && |
|
1481 #endif |
|
1482 (md->ctypes[c] & ctype_digit) != 0 |
|
1483 ) |
|
1484 RRETURN(MATCH_NOMATCH); |
|
1485 ecode++; |
|
1486 break; |
|
1487 |
|
1488 case OP_DIGIT: |
|
1489 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1490 GETCHARINCTEST(c, eptr); |
|
1491 if ( |
|
1492 #ifdef SUPPORT_UTF8 |
|
1493 c >= 256 || |
|
1494 #endif |
|
1495 (md->ctypes[c] & ctype_digit) == 0 |
|
1496 ) |
|
1497 RRETURN(MATCH_NOMATCH); |
|
1498 ecode++; |
|
1499 break; |
|
1500 |
|
1501 case OP_NOT_WHITESPACE: |
|
1502 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1503 GETCHARINCTEST(c, eptr); |
|
1504 if ( |
|
1505 #ifdef SUPPORT_UTF8 |
|
1506 c < 256 && |
|
1507 #endif |
|
1508 (md->ctypes[c] & ctype_space) != 0 |
|
1509 ) |
|
1510 RRETURN(MATCH_NOMATCH); |
|
1511 ecode++; |
|
1512 break; |
|
1513 |
|
1514 case OP_WHITESPACE: |
|
1515 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1516 GETCHARINCTEST(c, eptr); |
|
1517 if ( |
|
1518 #ifdef SUPPORT_UTF8 |
|
1519 c >= 256 || |
|
1520 #endif |
|
1521 (md->ctypes[c] & ctype_space) == 0 |
|
1522 ) |
|
1523 RRETURN(MATCH_NOMATCH); |
|
1524 ecode++; |
|
1525 break; |
|
1526 |
|
1527 case OP_NOT_WORDCHAR: |
|
1528 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1529 GETCHARINCTEST(c, eptr); |
|
1530 if ( |
|
1531 #ifdef SUPPORT_UTF8 |
|
1532 c < 256 && |
|
1533 #endif |
|
1534 (md->ctypes[c] & ctype_word) != 0 |
|
1535 ) |
|
1536 RRETURN(MATCH_NOMATCH); |
|
1537 ecode++; |
|
1538 break; |
|
1539 |
|
1540 case OP_WORDCHAR: |
|
1541 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1542 GETCHARINCTEST(c, eptr); |
|
1543 if ( |
|
1544 #ifdef SUPPORT_UTF8 |
|
1545 c >= 256 || |
|
1546 #endif |
|
1547 (md->ctypes[c] & ctype_word) == 0 |
|
1548 ) |
|
1549 RRETURN(MATCH_NOMATCH); |
|
1550 ecode++; |
|
1551 break; |
|
1552 |
|
1553 case OP_ANYNL: |
|
1554 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1555 GETCHARINCTEST(c, eptr); |
|
1556 switch(c) |
|
1557 { |
|
1558 default: RRETURN(MATCH_NOMATCH); |
|
1559 case 0x000d: |
|
1560 if (eptr < md->end_subject && *eptr == 0x0a) eptr++; |
|
1561 break; |
|
1562 |
|
1563 case 0x000a: |
|
1564 break; |
|
1565 |
|
1566 case 0x000b: |
|
1567 case 0x000c: |
|
1568 case 0x0085: |
|
1569 case 0x2028: |
|
1570 case 0x2029: |
|
1571 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); |
|
1572 break; |
|
1573 } |
|
1574 ecode++; |
|
1575 break; |
|
1576 |
|
1577 case OP_NOT_HSPACE: |
|
1578 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1579 GETCHARINCTEST(c, eptr); |
|
1580 switch(c) |
|
1581 { |
|
1582 default: break; |
|
1583 case 0x09: /* HT */ |
|
1584 case 0x20: /* SPACE */ |
|
1585 case 0xa0: /* NBSP */ |
|
1586 case 0x1680: /* OGHAM SPACE MARK */ |
|
1587 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ |
|
1588 case 0x2000: /* EN QUAD */ |
|
1589 case 0x2001: /* EM QUAD */ |
|
1590 case 0x2002: /* EN SPACE */ |
|
1591 case 0x2003: /* EM SPACE */ |
|
1592 case 0x2004: /* THREE-PER-EM SPACE */ |
|
1593 case 0x2005: /* FOUR-PER-EM SPACE */ |
|
1594 case 0x2006: /* SIX-PER-EM SPACE */ |
|
1595 case 0x2007: /* FIGURE SPACE */ |
|
1596 case 0x2008: /* PUNCTUATION SPACE */ |
|
1597 case 0x2009: /* THIN SPACE */ |
|
1598 case 0x200A: /* HAIR SPACE */ |
|
1599 case 0x202f: /* NARROW NO-BREAK SPACE */ |
|
1600 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ |
|
1601 case 0x3000: /* IDEOGRAPHIC SPACE */ |
|
1602 RRETURN(MATCH_NOMATCH); |
|
1603 } |
|
1604 ecode++; |
|
1605 break; |
|
1606 |
|
1607 case OP_HSPACE: |
|
1608 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1609 GETCHARINCTEST(c, eptr); |
|
1610 switch(c) |
|
1611 { |
|
1612 default: RRETURN(MATCH_NOMATCH); |
|
1613 case 0x09: /* HT */ |
|
1614 case 0x20: /* SPACE */ |
|
1615 case 0xa0: /* NBSP */ |
|
1616 case 0x1680: /* OGHAM SPACE MARK */ |
|
1617 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ |
|
1618 case 0x2000: /* EN QUAD */ |
|
1619 case 0x2001: /* EM QUAD */ |
|
1620 case 0x2002: /* EN SPACE */ |
|
1621 case 0x2003: /* EM SPACE */ |
|
1622 case 0x2004: /* THREE-PER-EM SPACE */ |
|
1623 case 0x2005: /* FOUR-PER-EM SPACE */ |
|
1624 case 0x2006: /* SIX-PER-EM SPACE */ |
|
1625 case 0x2007: /* FIGURE SPACE */ |
|
1626 case 0x2008: /* PUNCTUATION SPACE */ |
|
1627 case 0x2009: /* THIN SPACE */ |
|
1628 case 0x200A: /* HAIR SPACE */ |
|
1629 case 0x202f: /* NARROW NO-BREAK SPACE */ |
|
1630 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ |
|
1631 case 0x3000: /* IDEOGRAPHIC SPACE */ |
|
1632 break; |
|
1633 } |
|
1634 ecode++; |
|
1635 break; |
|
1636 |
|
1637 case OP_NOT_VSPACE: |
|
1638 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1639 GETCHARINCTEST(c, eptr); |
|
1640 switch(c) |
|
1641 { |
|
1642 default: break; |
|
1643 case 0x0a: /* LF */ |
|
1644 case 0x0b: /* VT */ |
|
1645 case 0x0c: /* FF */ |
|
1646 case 0x0d: /* CR */ |
|
1647 case 0x85: /* NEL */ |
|
1648 case 0x2028: /* LINE SEPARATOR */ |
|
1649 case 0x2029: /* PARAGRAPH SEPARATOR */ |
|
1650 RRETURN(MATCH_NOMATCH); |
|
1651 } |
|
1652 ecode++; |
|
1653 break; |
|
1654 |
|
1655 case OP_VSPACE: |
|
1656 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1657 GETCHARINCTEST(c, eptr); |
|
1658 switch(c) |
|
1659 { |
|
1660 default: RRETURN(MATCH_NOMATCH); |
|
1661 case 0x0a: /* LF */ |
|
1662 case 0x0b: /* VT */ |
|
1663 case 0x0c: /* FF */ |
|
1664 case 0x0d: /* CR */ |
|
1665 case 0x85: /* NEL */ |
|
1666 case 0x2028: /* LINE SEPARATOR */ |
|
1667 case 0x2029: /* PARAGRAPH SEPARATOR */ |
|
1668 break; |
|
1669 } |
|
1670 ecode++; |
|
1671 break; |
|
1672 |
|
1673 #ifdef SUPPORT_UCP |
|
1674 /* Check the next character by Unicode property. We will get here only |
|
1675 if the support is in the binary; otherwise a compile-time error occurs. */ |
|
1676 |
|
1677 case OP_PROP: |
|
1678 case OP_NOTPROP: |
|
1679 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1680 GETCHARINCTEST(c, eptr); |
|
1681 { |
|
1682 const ucd_record * prop = GET_UCD(c); |
|
1683 |
|
1684 switch(ecode[1]) |
|
1685 { |
|
1686 case PT_ANY: |
|
1687 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); |
|
1688 break; |
|
1689 |
|
1690 case PT_LAMP: |
|
1691 if ((prop->chartype == ucp_Lu || |
|
1692 prop->chartype == ucp_Ll || |
|
1693 prop->chartype == ucp_Lt) == (op == OP_NOTPROP)) |
|
1694 RRETURN(MATCH_NOMATCH); |
|
1695 break; |
|
1696 |
|
1697 case PT_GC: |
|
1698 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP)) |
|
1699 RRETURN(MATCH_NOMATCH); |
|
1700 break; |
|
1701 |
|
1702 case PT_PC: |
|
1703 if ((ecode[2] != prop->chartype) == (op == OP_PROP)) |
|
1704 RRETURN(MATCH_NOMATCH); |
|
1705 break; |
|
1706 |
|
1707 case PT_SC: |
|
1708 if ((ecode[2] != prop->script) == (op == OP_PROP)) |
|
1709 RRETURN(MATCH_NOMATCH); |
|
1710 break; |
|
1711 |
|
1712 default: |
|
1713 RRETURN(PCRE_ERROR_INTERNAL); |
|
1714 } |
|
1715 |
|
1716 ecode += 3; |
|
1717 } |
|
1718 break; |
|
1719 |
|
1720 /* Match an extended Unicode sequence. We will get here only if the support |
|
1721 is in the binary; otherwise a compile-time error occurs. */ |
|
1722 |
|
1723 case OP_EXTUNI: |
|
1724 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1725 GETCHARINCTEST(c, eptr); |
|
1726 { |
|
1727 int category = UCD_CATEGORY(c); |
|
1728 if (category == ucp_M) RRETURN(MATCH_NOMATCH); |
|
1729 while (eptr < md->end_subject) |
|
1730 { |
|
1731 int len = 1; |
|
1732 if (!utf8) c = *eptr; else |
|
1733 { |
|
1734 GETCHARLEN(c, eptr, len); |
|
1735 } |
|
1736 category = UCD_CATEGORY(c); |
|
1737 if (category != ucp_M) break; |
|
1738 eptr += len; |
|
1739 } |
|
1740 } |
|
1741 ecode++; |
|
1742 break; |
|
1743 #endif |
|
1744 |
|
1745 |
|
1746 /* Match a back reference, possibly repeatedly. Look past the end of the |
|
1747 item to see if there is repeat information following. The code is similar |
|
1748 to that for character classes, but repeated for efficiency. Then obey |
|
1749 similar code to character type repeats - written out again for speed. |
|
1750 However, if the referenced string is the empty string, always treat |
|
1751 it as matched, any number of times (otherwise there could be infinite |
|
1752 loops). */ |
|
1753 |
|
1754 case OP_REF: |
|
1755 { |
|
1756 offset = GET2(ecode, 1) << 1; /* Doubled ref number */ |
|
1757 ecode += 3; |
|
1758 |
|
1759 /* If the reference is unset, there are two possibilities: |
|
1760 |
|
1761 (a) In the default, Perl-compatible state, set the length to be longer |
|
1762 than the amount of subject left; this ensures that every attempt at a |
|
1763 match fails. We can't just fail here, because of the possibility of |
|
1764 quantifiers with zero minima. |
|
1765 |
|
1766 (b) If the JavaScript compatibility flag is set, set the length to zero |
|
1767 so that the back reference matches an empty string. |
|
1768 |
|
1769 Otherwise, set the length to the length of what was matched by the |
|
1770 referenced subpattern. */ |
|
1771 |
|
1772 if (offset >= offset_top || md->offset_vector[offset] < 0) |
|
1773 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1; |
|
1774 else |
|
1775 length = md->offset_vector[offset+1] - md->offset_vector[offset]; |
|
1776 |
|
1777 /* Set up for repetition, or handle the non-repeated case */ |
|
1778 |
|
1779 switch (*ecode) |
|
1780 { |
|
1781 case OP_CRSTAR: |
|
1782 case OP_CRMINSTAR: |
|
1783 case OP_CRPLUS: |
|
1784 case OP_CRMINPLUS: |
|
1785 case OP_CRQUERY: |
|
1786 case OP_CRMINQUERY: |
|
1787 c = *ecode++ - OP_CRSTAR; |
|
1788 minimize = (c & 1) != 0; |
|
1789 min = rep_min[c]; /* Pick up values from tables; */ |
|
1790 max = rep_max[c]; /* zero for max => infinity */ |
|
1791 if (max == 0) max = INT_MAX; |
|
1792 break; |
|
1793 |
|
1794 case OP_CRRANGE: |
|
1795 case OP_CRMINRANGE: |
|
1796 minimize = (*ecode == OP_CRMINRANGE); |
|
1797 min = GET2(ecode, 1); |
|
1798 max = GET2(ecode, 3); |
|
1799 if (max == 0) max = INT_MAX; |
|
1800 ecode += 5; |
|
1801 break; |
|
1802 |
|
1803 default: /* No repeat follows */ |
|
1804 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH); |
|
1805 eptr += length; |
|
1806 continue; /* With the main loop */ |
|
1807 } |
|
1808 |
|
1809 /* If the length of the reference is zero, just continue with the |
|
1810 main loop. */ |
|
1811 |
|
1812 if (length == 0) continue; |
|
1813 |
|
1814 /* First, ensure the minimum number of matches are present. We get back |
|
1815 the length of the reference string explicitly rather than passing the |
|
1816 address of eptr, so that eptr can be a register variable. */ |
|
1817 |
|
1818 for (i = 1; i <= min; i++) |
|
1819 { |
|
1820 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH); |
|
1821 eptr += length; |
|
1822 } |
|
1823 |
|
1824 /* If min = max, continue at the same level without recursion. |
|
1825 They are not both allowed to be zero. */ |
|
1826 |
|
1827 if (min == max) continue; |
|
1828 |
|
1829 /* If minimizing, keep trying and advancing the pointer */ |
|
1830 |
|
1831 if (minimize) |
|
1832 { |
|
1833 for (fi = min;; fi++) |
|
1834 { |
|
1835 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14); |
|
1836 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1837 if (fi >= max || !match_ref(offset, eptr, length, md, ims)) |
|
1838 RRETURN(MATCH_NOMATCH); |
|
1839 eptr += length; |
|
1840 } |
|
1841 /* Control never gets here */ |
|
1842 } |
|
1843 |
|
1844 /* If maximizing, find the longest string and work backwards */ |
|
1845 |
|
1846 else |
|
1847 { |
|
1848 pp = eptr; |
|
1849 for (i = min; i < max; i++) |
|
1850 { |
|
1851 if (!match_ref(offset, eptr, length, md, ims)) break; |
|
1852 eptr += length; |
|
1853 } |
|
1854 while (eptr >= pp) |
|
1855 { |
|
1856 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15); |
|
1857 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1858 eptr -= length; |
|
1859 } |
|
1860 RRETURN(MATCH_NOMATCH); |
|
1861 } |
|
1862 } |
|
1863 /* Control never gets here */ |
|
1864 |
|
1865 |
|
1866 |
|
1867 /* Match a bit-mapped character class, possibly repeatedly. This op code is |
|
1868 used when all the characters in the class have values in the range 0-255, |
|
1869 and either the matching is caseful, or the characters are in the range |
|
1870 0-127 when UTF-8 processing is enabled. The only difference between |
|
1871 OP_CLASS and OP_NCLASS occurs when a data character outside the range is |
|
1872 encountered. |
|
1873 |
|
1874 First, look past the end of the item to see if there is repeat information |
|
1875 following. Then obey similar code to character type repeats - written out |
|
1876 again for speed. */ |
|
1877 |
|
1878 case OP_NCLASS: |
|
1879 case OP_CLASS: |
|
1880 { |
|
1881 data = ecode + 1; /* Save for matching */ |
|
1882 ecode += 33; /* Advance past the item */ |
|
1883 |
|
1884 switch (*ecode) |
|
1885 { |
|
1886 case OP_CRSTAR: |
|
1887 case OP_CRMINSTAR: |
|
1888 case OP_CRPLUS: |
|
1889 case OP_CRMINPLUS: |
|
1890 case OP_CRQUERY: |
|
1891 case OP_CRMINQUERY: |
|
1892 c = *ecode++ - OP_CRSTAR; |
|
1893 minimize = (c & 1) != 0; |
|
1894 min = rep_min[c]; /* Pick up values from tables; */ |
|
1895 max = rep_max[c]; /* zero for max => infinity */ |
|
1896 if (max == 0) max = INT_MAX; |
|
1897 break; |
|
1898 |
|
1899 case OP_CRRANGE: |
|
1900 case OP_CRMINRANGE: |
|
1901 minimize = (*ecode == OP_CRMINRANGE); |
|
1902 min = GET2(ecode, 1); |
|
1903 max = GET2(ecode, 3); |
|
1904 if (max == 0) max = INT_MAX; |
|
1905 ecode += 5; |
|
1906 break; |
|
1907 |
|
1908 default: /* No repeat follows */ |
|
1909 min = max = 1; |
|
1910 break; |
|
1911 } |
|
1912 |
|
1913 /* First, ensure the minimum number of matches are present. */ |
|
1914 |
|
1915 #ifdef SUPPORT_UTF8 |
|
1916 /* UTF-8 mode */ |
|
1917 if (utf8) |
|
1918 { |
|
1919 for (i = 1; i <= min; i++) |
|
1920 { |
|
1921 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1922 GETCHARINC(c, eptr); |
|
1923 if (c > 255) |
|
1924 { |
|
1925 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); |
|
1926 } |
|
1927 else |
|
1928 { |
|
1929 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); |
|
1930 } |
|
1931 } |
|
1932 } |
|
1933 else |
|
1934 #endif |
|
1935 /* Not UTF-8 mode */ |
|
1936 { |
|
1937 for (i = 1; i <= min; i++) |
|
1938 { |
|
1939 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1940 c = *eptr++; |
|
1941 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); |
|
1942 } |
|
1943 } |
|
1944 |
|
1945 /* If max == min we can continue with the main loop without the |
|
1946 need to recurse. */ |
|
1947 |
|
1948 if (min == max) continue; |
|
1949 |
|
1950 /* If minimizing, keep testing the rest of the expression and advancing |
|
1951 the pointer while it matches the class. */ |
|
1952 |
|
1953 if (minimize) |
|
1954 { |
|
1955 #ifdef SUPPORT_UTF8 |
|
1956 /* UTF-8 mode */ |
|
1957 if (utf8) |
|
1958 { |
|
1959 for (fi = min;; fi++) |
|
1960 { |
|
1961 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16); |
|
1962 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1963 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1964 GETCHARINC(c, eptr); |
|
1965 if (c > 255) |
|
1966 { |
|
1967 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); |
|
1968 } |
|
1969 else |
|
1970 { |
|
1971 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); |
|
1972 } |
|
1973 } |
|
1974 } |
|
1975 else |
|
1976 #endif |
|
1977 /* Not UTF-8 mode */ |
|
1978 { |
|
1979 for (fi = min;; fi++) |
|
1980 { |
|
1981 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17); |
|
1982 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
1983 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
1984 c = *eptr++; |
|
1985 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); |
|
1986 } |
|
1987 } |
|
1988 /* Control never gets here */ |
|
1989 } |
|
1990 |
|
1991 /* If maximizing, find the longest possible run, then work backwards. */ |
|
1992 |
|
1993 else |
|
1994 { |
|
1995 pp = eptr; |
|
1996 |
|
1997 #ifdef SUPPORT_UTF8 |
|
1998 /* UTF-8 mode */ |
|
1999 if (utf8) |
|
2000 { |
|
2001 for (i = min; i < max; i++) |
|
2002 { |
|
2003 int len = 1; |
|
2004 if (eptr >= md->end_subject) break; |
|
2005 GETCHARLEN(c, eptr, len); |
|
2006 if (c > 255) |
|
2007 { |
|
2008 if (op == OP_CLASS) break; |
|
2009 } |
|
2010 else |
|
2011 { |
|
2012 if ((data[c/8] & (1 << (c&7))) == 0) break; |
|
2013 } |
|
2014 eptr += len; |
|
2015 } |
|
2016 for (;;) |
|
2017 { |
|
2018 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18); |
|
2019 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2020 if (eptr-- == pp) break; /* Stop if tried at original pos */ |
|
2021 BACKCHAR(eptr); |
|
2022 } |
|
2023 } |
|
2024 else |
|
2025 #endif |
|
2026 /* Not UTF-8 mode */ |
|
2027 { |
|
2028 for (i = min; i < max; i++) |
|
2029 { |
|
2030 if (eptr >= md->end_subject) break; |
|
2031 c = *eptr; |
|
2032 if ((data[c/8] & (1 << (c&7))) == 0) break; |
|
2033 eptr++; |
|
2034 } |
|
2035 while (eptr >= pp) |
|
2036 { |
|
2037 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19); |
|
2038 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2039 eptr--; |
|
2040 } |
|
2041 } |
|
2042 |
|
2043 RRETURN(MATCH_NOMATCH); |
|
2044 } |
|
2045 } |
|
2046 /* Control never gets here */ |
|
2047 |
|
2048 |
|
2049 /* Match an extended character class. This opcode is encountered only |
|
2050 in UTF-8 mode, because that's the only time it is compiled. */ |
|
2051 |
|
2052 #ifdef SUPPORT_UTF8 |
|
2053 case OP_XCLASS: |
|
2054 { |
|
2055 data = ecode + 1 + LINK_SIZE; /* Save for matching */ |
|
2056 ecode += GET(ecode, 1); /* Advance past the item */ |
|
2057 |
|
2058 switch (*ecode) |
|
2059 { |
|
2060 case OP_CRSTAR: |
|
2061 case OP_CRMINSTAR: |
|
2062 case OP_CRPLUS: |
|
2063 case OP_CRMINPLUS: |
|
2064 case OP_CRQUERY: |
|
2065 case OP_CRMINQUERY: |
|
2066 c = *ecode++ - OP_CRSTAR; |
|
2067 minimize = (c & 1) != 0; |
|
2068 min = rep_min[c]; /* Pick up values from tables; */ |
|
2069 max = rep_max[c]; /* zero for max => infinity */ |
|
2070 if (max == 0) max = INT_MAX; |
|
2071 break; |
|
2072 |
|
2073 case OP_CRRANGE: |
|
2074 case OP_CRMINRANGE: |
|
2075 minimize = (*ecode == OP_CRMINRANGE); |
|
2076 min = GET2(ecode, 1); |
|
2077 max = GET2(ecode, 3); |
|
2078 if (max == 0) max = INT_MAX; |
|
2079 ecode += 5; |
|
2080 break; |
|
2081 |
|
2082 default: /* No repeat follows */ |
|
2083 min = max = 1; |
|
2084 break; |
|
2085 } |
|
2086 |
|
2087 /* First, ensure the minimum number of matches are present. */ |
|
2088 |
|
2089 for (i = 1; i <= min; i++) |
|
2090 { |
|
2091 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2092 GETCHARINC(c, eptr); |
|
2093 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); |
|
2094 } |
|
2095 |
|
2096 /* If max == min we can continue with the main loop without the |
|
2097 need to recurse. */ |
|
2098 |
|
2099 if (min == max) continue; |
|
2100 |
|
2101 /* If minimizing, keep testing the rest of the expression and advancing |
|
2102 the pointer while it matches the class. */ |
|
2103 |
|
2104 if (minimize) |
|
2105 { |
|
2106 for (fi = min;; fi++) |
|
2107 { |
|
2108 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20); |
|
2109 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2110 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2111 GETCHARINC(c, eptr); |
|
2112 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); |
|
2113 } |
|
2114 /* Control never gets here */ |
|
2115 } |
|
2116 |
|
2117 /* If maximizing, find the longest possible run, then work backwards. */ |
|
2118 |
|
2119 else |
|
2120 { |
|
2121 pp = eptr; |
|
2122 for (i = min; i < max; i++) |
|
2123 { |
|
2124 int len = 1; |
|
2125 if (eptr >= md->end_subject) break; |
|
2126 GETCHARLEN(c, eptr, len); |
|
2127 if (!_pcre_xclass(c, data)) break; |
|
2128 eptr += len; |
|
2129 } |
|
2130 for(;;) |
|
2131 { |
|
2132 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21); |
|
2133 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2134 if (eptr-- == pp) break; /* Stop if tried at original pos */ |
|
2135 if (utf8) BACKCHAR(eptr); |
|
2136 } |
|
2137 RRETURN(MATCH_NOMATCH); |
|
2138 } |
|
2139 |
|
2140 /* Control never gets here */ |
|
2141 } |
|
2142 #endif /* End of XCLASS */ |
|
2143 |
|
2144 /* Match a single character, casefully */ |
|
2145 |
|
2146 case OP_CHAR: |
|
2147 #ifdef SUPPORT_UTF8 |
|
2148 if (utf8) |
|
2149 { |
|
2150 length = 1; |
|
2151 ecode++; |
|
2152 GETCHARLEN(fc, ecode, length); |
|
2153 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); |
|
2154 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH); |
|
2155 } |
|
2156 else |
|
2157 #endif |
|
2158 |
|
2159 /* Non-UTF-8 mode */ |
|
2160 { |
|
2161 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH); |
|
2162 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH); |
|
2163 ecode += 2; |
|
2164 } |
|
2165 break; |
|
2166 |
|
2167 /* Match a single character, caselessly */ |
|
2168 |
|
2169 case OP_CHARNC: |
|
2170 #ifdef SUPPORT_UTF8 |
|
2171 if (utf8) |
|
2172 { |
|
2173 length = 1; |
|
2174 ecode++; |
|
2175 GETCHARLEN(fc, ecode, length); |
|
2176 |
|
2177 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); |
|
2178 |
|
2179 /* If the pattern character's value is < 128, we have only one byte, and |
|
2180 can use the fast lookup table. */ |
|
2181 |
|
2182 if (fc < 128) |
|
2183 { |
|
2184 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); |
|
2185 } |
|
2186 |
|
2187 /* Otherwise we must pick up the subject character */ |
|
2188 |
|
2189 else |
|
2190 { |
|
2191 unsigned int dc; |
|
2192 GETCHARINC(dc, eptr); |
|
2193 ecode += length; |
|
2194 |
|
2195 /* If we have Unicode property support, we can use it to test the other |
|
2196 case of the character, if there is one. */ |
|
2197 |
|
2198 if (fc != dc) |
|
2199 { |
|
2200 #ifdef SUPPORT_UCP |
|
2201 if (dc != UCD_OTHERCASE(fc)) |
|
2202 #endif |
|
2203 RRETURN(MATCH_NOMATCH); |
|
2204 } |
|
2205 } |
|
2206 } |
|
2207 else |
|
2208 #endif /* SUPPORT_UTF8 */ |
|
2209 |
|
2210 /* Non-UTF-8 mode */ |
|
2211 { |
|
2212 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH); |
|
2213 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); |
|
2214 ecode += 2; |
|
2215 } |
|
2216 break; |
|
2217 |
|
2218 /* Match a single character repeatedly. */ |
|
2219 |
|
2220 case OP_EXACT: |
|
2221 min = max = GET2(ecode, 1); |
|
2222 ecode += 3; |
|
2223 goto REPEATCHAR; |
|
2224 |
|
2225 case OP_POSUPTO: |
|
2226 possessive = TRUE; |
|
2227 /* Fall through */ |
|
2228 |
|
2229 case OP_UPTO: |
|
2230 case OP_MINUPTO: |
|
2231 min = 0; |
|
2232 max = GET2(ecode, 1); |
|
2233 minimize = *ecode == OP_MINUPTO; |
|
2234 ecode += 3; |
|
2235 goto REPEATCHAR; |
|
2236 |
|
2237 case OP_POSSTAR: |
|
2238 possessive = TRUE; |
|
2239 min = 0; |
|
2240 max = INT_MAX; |
|
2241 ecode++; |
|
2242 goto REPEATCHAR; |
|
2243 |
|
2244 case OP_POSPLUS: |
|
2245 possessive = TRUE; |
|
2246 min = 1; |
|
2247 max = INT_MAX; |
|
2248 ecode++; |
|
2249 goto REPEATCHAR; |
|
2250 |
|
2251 case OP_POSQUERY: |
|
2252 possessive = TRUE; |
|
2253 min = 0; |
|
2254 max = 1; |
|
2255 ecode++; |
|
2256 goto REPEATCHAR; |
|
2257 |
|
2258 case OP_STAR: |
|
2259 case OP_MINSTAR: |
|
2260 case OP_PLUS: |
|
2261 case OP_MINPLUS: |
|
2262 case OP_QUERY: |
|
2263 case OP_MINQUERY: |
|
2264 c = *ecode++ - OP_STAR; |
|
2265 minimize = (c & 1) != 0; |
|
2266 min = rep_min[c]; /* Pick up values from tables; */ |
|
2267 max = rep_max[c]; /* zero for max => infinity */ |
|
2268 if (max == 0) max = INT_MAX; |
|
2269 |
|
2270 /* Common code for all repeated single-character matches. We can give |
|
2271 up quickly if there are fewer than the minimum number of characters left in |
|
2272 the subject. */ |
|
2273 |
|
2274 REPEATCHAR: |
|
2275 #ifdef SUPPORT_UTF8 |
|
2276 if (utf8) |
|
2277 { |
|
2278 length = 1; |
|
2279 charptr = ecode; |
|
2280 GETCHARLEN(fc, ecode, length); |
|
2281 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); |
|
2282 ecode += length; |
|
2283 |
|
2284 /* Handle multibyte character matching specially here. There is |
|
2285 support for caseless matching if UCP support is present. */ |
|
2286 |
|
2287 if (length > 1) |
|
2288 { |
|
2289 #ifdef SUPPORT_UCP |
|
2290 unsigned int othercase; |
|
2291 if ((ims & PCRE_CASELESS) != 0 && |
|
2292 (othercase = UCD_OTHERCASE(fc)) != fc) |
|
2293 oclength = _pcre_ord2utf8(othercase, occhars); |
|
2294 else oclength = 0; |
|
2295 #endif /* SUPPORT_UCP */ |
|
2296 |
|
2297 for (i = 1; i <= min; i++) |
|
2298 { |
|
2299 if (memcmp(eptr, charptr, length) == 0) eptr += length; |
|
2300 #ifdef SUPPORT_UCP |
|
2301 /* Need braces because of following else */ |
|
2302 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); } |
|
2303 else |
|
2304 { |
|
2305 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH); |
|
2306 eptr += oclength; |
|
2307 } |
|
2308 #else /* without SUPPORT_UCP */ |
|
2309 else { RRETURN(MATCH_NOMATCH); } |
|
2310 #endif /* SUPPORT_UCP */ |
|
2311 } |
|
2312 |
|
2313 if (min == max) continue; |
|
2314 |
|
2315 if (minimize) |
|
2316 { |
|
2317 for (fi = min;; fi++) |
|
2318 { |
|
2319 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22); |
|
2320 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2321 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2322 if (memcmp(eptr, charptr, length) == 0) eptr += length; |
|
2323 #ifdef SUPPORT_UCP |
|
2324 /* Need braces because of following else */ |
|
2325 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); } |
|
2326 else |
|
2327 { |
|
2328 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH); |
|
2329 eptr += oclength; |
|
2330 } |
|
2331 #else /* without SUPPORT_UCP */ |
|
2332 else { RRETURN (MATCH_NOMATCH); } |
|
2333 #endif /* SUPPORT_UCP */ |
|
2334 } |
|
2335 /* Control never gets here */ |
|
2336 } |
|
2337 |
|
2338 else /* Maximize */ |
|
2339 { |
|
2340 pp = eptr; |
|
2341 for (i = min; i < max; i++) |
|
2342 { |
|
2343 if (eptr > md->end_subject - length) break; |
|
2344 if (memcmp(eptr, charptr, length) == 0) eptr += length; |
|
2345 #ifdef SUPPORT_UCP |
|
2346 else if (oclength == 0) break; |
|
2347 else |
|
2348 { |
|
2349 if (memcmp(eptr, occhars, oclength) != 0) break; |
|
2350 eptr += oclength; |
|
2351 } |
|
2352 #else /* without SUPPORT_UCP */ |
|
2353 else break; |
|
2354 #endif /* SUPPORT_UCP */ |
|
2355 } |
|
2356 |
|
2357 if (possessive) continue; |
|
2358 for(;;) |
|
2359 { |
|
2360 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23); |
|
2361 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2362 if (eptr == pp) RRETURN(MATCH_NOMATCH); |
|
2363 #ifdef SUPPORT_UCP |
|
2364 eptr--; |
|
2365 BACKCHAR(eptr); |
|
2366 #else /* without SUPPORT_UCP */ |
|
2367 eptr -= length; |
|
2368 #endif /* SUPPORT_UCP */ |
|
2369 } |
|
2370 } |
|
2371 /* Control never gets here */ |
|
2372 } |
|
2373 |
|
2374 /* If the length of a UTF-8 character is 1, we fall through here, and |
|
2375 obey the code as for non-UTF-8 characters below, though in this case the |
|
2376 value of fc will always be < 128. */ |
|
2377 } |
|
2378 else |
|
2379 #endif /* SUPPORT_UTF8 */ |
|
2380 |
|
2381 /* When not in UTF-8 mode, load a single-byte character. */ |
|
2382 { |
|
2383 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); |
|
2384 fc = *ecode++; |
|
2385 } |
|
2386 |
|
2387 /* The value of fc at this point is always less than 256, though we may or |
|
2388 may not be in UTF-8 mode. The code is duplicated for the caseless and |
|
2389 caseful cases, for speed, since matching characters is likely to be quite |
|
2390 common. First, ensure the minimum number of matches are present. If min = |
|
2391 max, continue at the same level without recursing. Otherwise, if |
|
2392 minimizing, keep trying the rest of the expression and advancing one |
|
2393 matching character if failing, up to the maximum. Alternatively, if |
|
2394 maximizing, find the maximum number of characters and work backwards. */ |
|
2395 |
|
2396 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max, |
|
2397 max, eptr)); |
|
2398 |
|
2399 if ((ims & PCRE_CASELESS) != 0) |
|
2400 { |
|
2401 fc = md->lcc[fc]; |
|
2402 for (i = 1; i <= min; i++) |
|
2403 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); |
|
2404 if (min == max) continue; |
|
2405 if (minimize) |
|
2406 { |
|
2407 for (fi = min;; fi++) |
|
2408 { |
|
2409 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24); |
|
2410 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2411 if (fi >= max || eptr >= md->end_subject || |
|
2412 fc != md->lcc[*eptr++]) |
|
2413 RRETURN(MATCH_NOMATCH); |
|
2414 } |
|
2415 /* Control never gets here */ |
|
2416 } |
|
2417 else /* Maximize */ |
|
2418 { |
|
2419 pp = eptr; |
|
2420 for (i = min; i < max; i++) |
|
2421 { |
|
2422 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break; |
|
2423 eptr++; |
|
2424 } |
|
2425 if (possessive) continue; |
|
2426 while (eptr >= pp) |
|
2427 { |
|
2428 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25); |
|
2429 eptr--; |
|
2430 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2431 } |
|
2432 RRETURN(MATCH_NOMATCH); |
|
2433 } |
|
2434 /* Control never gets here */ |
|
2435 } |
|
2436 |
|
2437 /* Caseful comparisons (includes all multi-byte characters) */ |
|
2438 |
|
2439 else |
|
2440 { |
|
2441 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH); |
|
2442 if (min == max) continue; |
|
2443 if (minimize) |
|
2444 { |
|
2445 for (fi = min;; fi++) |
|
2446 { |
|
2447 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26); |
|
2448 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2449 if (fi >= max || eptr >= md->end_subject || fc != *eptr++) |
|
2450 RRETURN(MATCH_NOMATCH); |
|
2451 } |
|
2452 /* Control never gets here */ |
|
2453 } |
|
2454 else /* Maximize */ |
|
2455 { |
|
2456 pp = eptr; |
|
2457 for (i = min; i < max; i++) |
|
2458 { |
|
2459 if (eptr >= md->end_subject || fc != *eptr) break; |
|
2460 eptr++; |
|
2461 } |
|
2462 if (possessive) continue; |
|
2463 while (eptr >= pp) |
|
2464 { |
|
2465 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27); |
|
2466 eptr--; |
|
2467 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2468 } |
|
2469 RRETURN(MATCH_NOMATCH); |
|
2470 } |
|
2471 } |
|
2472 /* Control never gets here */ |
|
2473 |
|
2474 /* Match a negated single one-byte character. The character we are |
|
2475 checking can be multibyte. */ |
|
2476 |
|
2477 case OP_NOT: |
|
2478 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2479 ecode++; |
|
2480 GETCHARINCTEST(c, eptr); |
|
2481 if ((ims & PCRE_CASELESS) != 0) |
|
2482 { |
|
2483 #ifdef SUPPORT_UTF8 |
|
2484 if (c < 256) |
|
2485 #endif |
|
2486 c = md->lcc[c]; |
|
2487 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH); |
|
2488 } |
|
2489 else |
|
2490 { |
|
2491 if (*ecode++ == c) RRETURN(MATCH_NOMATCH); |
|
2492 } |
|
2493 break; |
|
2494 |
|
2495 /* Match a negated single one-byte character repeatedly. This is almost a |
|
2496 repeat of the code for a repeated single character, but I haven't found a |
|
2497 nice way of commoning these up that doesn't require a test of the |
|
2498 positive/negative option for each character match. Maybe that wouldn't add |
|
2499 very much to the time taken, but character matching *is* what this is all |
|
2500 about... */ |
|
2501 |
|
2502 case OP_NOTEXACT: |
|
2503 min = max = GET2(ecode, 1); |
|
2504 ecode += 3; |
|
2505 goto REPEATNOTCHAR; |
|
2506 |
|
2507 case OP_NOTUPTO: |
|
2508 case OP_NOTMINUPTO: |
|
2509 min = 0; |
|
2510 max = GET2(ecode, 1); |
|
2511 minimize = *ecode == OP_NOTMINUPTO; |
|
2512 ecode += 3; |
|
2513 goto REPEATNOTCHAR; |
|
2514 |
|
2515 case OP_NOTPOSSTAR: |
|
2516 possessive = TRUE; |
|
2517 min = 0; |
|
2518 max = INT_MAX; |
|
2519 ecode++; |
|
2520 goto REPEATNOTCHAR; |
|
2521 |
|
2522 case OP_NOTPOSPLUS: |
|
2523 possessive = TRUE; |
|
2524 min = 1; |
|
2525 max = INT_MAX; |
|
2526 ecode++; |
|
2527 goto REPEATNOTCHAR; |
|
2528 |
|
2529 case OP_NOTPOSQUERY: |
|
2530 possessive = TRUE; |
|
2531 min = 0; |
|
2532 max = 1; |
|
2533 ecode++; |
|
2534 goto REPEATNOTCHAR; |
|
2535 |
|
2536 case OP_NOTPOSUPTO: |
|
2537 possessive = TRUE; |
|
2538 min = 0; |
|
2539 max = GET2(ecode, 1); |
|
2540 ecode += 3; |
|
2541 goto REPEATNOTCHAR; |
|
2542 |
|
2543 case OP_NOTSTAR: |
|
2544 case OP_NOTMINSTAR: |
|
2545 case OP_NOTPLUS: |
|
2546 case OP_NOTMINPLUS: |
|
2547 case OP_NOTQUERY: |
|
2548 case OP_NOTMINQUERY: |
|
2549 c = *ecode++ - OP_NOTSTAR; |
|
2550 minimize = (c & 1) != 0; |
|
2551 min = rep_min[c]; /* Pick up values from tables; */ |
|
2552 max = rep_max[c]; /* zero for max => infinity */ |
|
2553 if (max == 0) max = INT_MAX; |
|
2554 |
|
2555 /* Common code for all repeated single-byte matches. We can give up quickly |
|
2556 if there are fewer than the minimum number of bytes left in the |
|
2557 subject. */ |
|
2558 |
|
2559 REPEATNOTCHAR: |
|
2560 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); |
|
2561 fc = *ecode++; |
|
2562 |
|
2563 /* The code is duplicated for the caseless and caseful cases, for speed, |
|
2564 since matching characters is likely to be quite common. First, ensure the |
|
2565 minimum number of matches are present. If min = max, continue at the same |
|
2566 level without recursing. Otherwise, if minimizing, keep trying the rest of |
|
2567 the expression and advancing one matching character if failing, up to the |
|
2568 maximum. Alternatively, if maximizing, find the maximum number of |
|
2569 characters and work backwards. */ |
|
2570 |
|
2571 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max, |
|
2572 max, eptr)); |
|
2573 |
|
2574 if ((ims & PCRE_CASELESS) != 0) |
|
2575 { |
|
2576 fc = md->lcc[fc]; |
|
2577 |
|
2578 #ifdef SUPPORT_UTF8 |
|
2579 /* UTF-8 mode */ |
|
2580 if (utf8) |
|
2581 { |
|
2582 register unsigned int d; |
|
2583 for (i = 1; i <= min; i++) |
|
2584 { |
|
2585 GETCHARINC(d, eptr); |
|
2586 if (d < 256) d = md->lcc[d]; |
|
2587 if (fc == d) RRETURN(MATCH_NOMATCH); |
|
2588 } |
|
2589 } |
|
2590 else |
|
2591 #endif |
|
2592 |
|
2593 /* Not UTF-8 mode */ |
|
2594 { |
|
2595 for (i = 1; i <= min; i++) |
|
2596 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); |
|
2597 } |
|
2598 |
|
2599 if (min == max) continue; |
|
2600 |
|
2601 if (minimize) |
|
2602 { |
|
2603 #ifdef SUPPORT_UTF8 |
|
2604 /* UTF-8 mode */ |
|
2605 if (utf8) |
|
2606 { |
|
2607 register unsigned int d; |
|
2608 for (fi = min;; fi++) |
|
2609 { |
|
2610 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28); |
|
2611 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2612 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2613 GETCHARINC(d, eptr); |
|
2614 if (d < 256) d = md->lcc[d]; |
|
2615 if (fc == d) RRETURN(MATCH_NOMATCH); |
|
2616 |
|
2617 } |
|
2618 } |
|
2619 else |
|
2620 #endif |
|
2621 /* Not UTF-8 mode */ |
|
2622 { |
|
2623 for (fi = min;; fi++) |
|
2624 { |
|
2625 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29); |
|
2626 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2627 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++]) |
|
2628 RRETURN(MATCH_NOMATCH); |
|
2629 } |
|
2630 } |
|
2631 /* Control never gets here */ |
|
2632 } |
|
2633 |
|
2634 /* Maximize case */ |
|
2635 |
|
2636 else |
|
2637 { |
|
2638 pp = eptr; |
|
2639 |
|
2640 #ifdef SUPPORT_UTF8 |
|
2641 /* UTF-8 mode */ |
|
2642 if (utf8) |
|
2643 { |
|
2644 register unsigned int d; |
|
2645 for (i = min; i < max; i++) |
|
2646 { |
|
2647 int len = 1; |
|
2648 if (eptr >= md->end_subject) break; |
|
2649 GETCHARLEN(d, eptr, len); |
|
2650 if (d < 256) d = md->lcc[d]; |
|
2651 if (fc == d) break; |
|
2652 eptr += len; |
|
2653 } |
|
2654 if (possessive) continue; |
|
2655 for(;;) |
|
2656 { |
|
2657 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30); |
|
2658 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2659 if (eptr-- == pp) break; /* Stop if tried at original pos */ |
|
2660 BACKCHAR(eptr); |
|
2661 } |
|
2662 } |
|
2663 else |
|
2664 #endif |
|
2665 /* Not UTF-8 mode */ |
|
2666 { |
|
2667 for (i = min; i < max; i++) |
|
2668 { |
|
2669 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break; |
|
2670 eptr++; |
|
2671 } |
|
2672 if (possessive) continue; |
|
2673 while (eptr >= pp) |
|
2674 { |
|
2675 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31); |
|
2676 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2677 eptr--; |
|
2678 } |
|
2679 } |
|
2680 |
|
2681 RRETURN(MATCH_NOMATCH); |
|
2682 } |
|
2683 /* Control never gets here */ |
|
2684 } |
|
2685 |
|
2686 /* Caseful comparisons */ |
|
2687 |
|
2688 else |
|
2689 { |
|
2690 #ifdef SUPPORT_UTF8 |
|
2691 /* UTF-8 mode */ |
|
2692 if (utf8) |
|
2693 { |
|
2694 register unsigned int d; |
|
2695 for (i = 1; i <= min; i++) |
|
2696 { |
|
2697 GETCHARINC(d, eptr); |
|
2698 if (fc == d) RRETURN(MATCH_NOMATCH); |
|
2699 } |
|
2700 } |
|
2701 else |
|
2702 #endif |
|
2703 /* Not UTF-8 mode */ |
|
2704 { |
|
2705 for (i = 1; i <= min; i++) |
|
2706 if (fc == *eptr++) RRETURN(MATCH_NOMATCH); |
|
2707 } |
|
2708 |
|
2709 if (min == max) continue; |
|
2710 |
|
2711 if (minimize) |
|
2712 { |
|
2713 #ifdef SUPPORT_UTF8 |
|
2714 /* UTF-8 mode */ |
|
2715 if (utf8) |
|
2716 { |
|
2717 register unsigned int d; |
|
2718 for (fi = min;; fi++) |
|
2719 { |
|
2720 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32); |
|
2721 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2722 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2723 GETCHARINC(d, eptr); |
|
2724 if (fc == d) RRETURN(MATCH_NOMATCH); |
|
2725 } |
|
2726 } |
|
2727 else |
|
2728 #endif |
|
2729 /* Not UTF-8 mode */ |
|
2730 { |
|
2731 for (fi = min;; fi++) |
|
2732 { |
|
2733 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33); |
|
2734 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2735 if (fi >= max || eptr >= md->end_subject || fc == *eptr++) |
|
2736 RRETURN(MATCH_NOMATCH); |
|
2737 } |
|
2738 } |
|
2739 /* Control never gets here */ |
|
2740 } |
|
2741 |
|
2742 /* Maximize case */ |
|
2743 |
|
2744 else |
|
2745 { |
|
2746 pp = eptr; |
|
2747 |
|
2748 #ifdef SUPPORT_UTF8 |
|
2749 /* UTF-8 mode */ |
|
2750 if (utf8) |
|
2751 { |
|
2752 register unsigned int d; |
|
2753 for (i = min; i < max; i++) |
|
2754 { |
|
2755 int len = 1; |
|
2756 if (eptr >= md->end_subject) break; |
|
2757 GETCHARLEN(d, eptr, len); |
|
2758 if (fc == d) break; |
|
2759 eptr += len; |
|
2760 } |
|
2761 if (possessive) continue; |
|
2762 for(;;) |
|
2763 { |
|
2764 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34); |
|
2765 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2766 if (eptr-- == pp) break; /* Stop if tried at original pos */ |
|
2767 BACKCHAR(eptr); |
|
2768 } |
|
2769 } |
|
2770 else |
|
2771 #endif |
|
2772 /* Not UTF-8 mode */ |
|
2773 { |
|
2774 for (i = min; i < max; i++) |
|
2775 { |
|
2776 if (eptr >= md->end_subject || fc == *eptr) break; |
|
2777 eptr++; |
|
2778 } |
|
2779 if (possessive) continue; |
|
2780 while (eptr >= pp) |
|
2781 { |
|
2782 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35); |
|
2783 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
2784 eptr--; |
|
2785 } |
|
2786 } |
|
2787 |
|
2788 RRETURN(MATCH_NOMATCH); |
|
2789 } |
|
2790 } |
|
2791 /* Control never gets here */ |
|
2792 |
|
2793 /* Match a single character type repeatedly; several different opcodes |
|
2794 share code. This is very similar to the code for single characters, but we |
|
2795 repeat it in the interests of efficiency. */ |
|
2796 |
|
2797 case OP_TYPEEXACT: |
|
2798 min = max = GET2(ecode, 1); |
|
2799 minimize = TRUE; |
|
2800 ecode += 3; |
|
2801 goto REPEATTYPE; |
|
2802 |
|
2803 case OP_TYPEUPTO: |
|
2804 case OP_TYPEMINUPTO: |
|
2805 min = 0; |
|
2806 max = GET2(ecode, 1); |
|
2807 minimize = *ecode == OP_TYPEMINUPTO; |
|
2808 ecode += 3; |
|
2809 goto REPEATTYPE; |
|
2810 |
|
2811 case OP_TYPEPOSSTAR: |
|
2812 possessive = TRUE; |
|
2813 min = 0; |
|
2814 max = INT_MAX; |
|
2815 ecode++; |
|
2816 goto REPEATTYPE; |
|
2817 |
|
2818 case OP_TYPEPOSPLUS: |
|
2819 possessive = TRUE; |
|
2820 min = 1; |
|
2821 max = INT_MAX; |
|
2822 ecode++; |
|
2823 goto REPEATTYPE; |
|
2824 |
|
2825 case OP_TYPEPOSQUERY: |
|
2826 possessive = TRUE; |
|
2827 min = 0; |
|
2828 max = 1; |
|
2829 ecode++; |
|
2830 goto REPEATTYPE; |
|
2831 |
|
2832 case OP_TYPEPOSUPTO: |
|
2833 possessive = TRUE; |
|
2834 min = 0; |
|
2835 max = GET2(ecode, 1); |
|
2836 ecode += 3; |
|
2837 goto REPEATTYPE; |
|
2838 |
|
2839 case OP_TYPESTAR: |
|
2840 case OP_TYPEMINSTAR: |
|
2841 case OP_TYPEPLUS: |
|
2842 case OP_TYPEMINPLUS: |
|
2843 case OP_TYPEQUERY: |
|
2844 case OP_TYPEMINQUERY: |
|
2845 c = *ecode++ - OP_TYPESTAR; |
|
2846 minimize = (c & 1) != 0; |
|
2847 min = rep_min[c]; /* Pick up values from tables; */ |
|
2848 max = rep_max[c]; /* zero for max => infinity */ |
|
2849 if (max == 0) max = INT_MAX; |
|
2850 |
|
2851 /* Common code for all repeated single character type matches. Note that |
|
2852 in UTF-8 mode, '.' matches a character of any length, but for the other |
|
2853 character types, the valid characters are all one-byte long. */ |
|
2854 |
|
2855 REPEATTYPE: |
|
2856 ctype = *ecode++; /* Code for the character type */ |
|
2857 |
|
2858 #ifdef SUPPORT_UCP |
|
2859 if (ctype == OP_PROP || ctype == OP_NOTPROP) |
|
2860 { |
|
2861 prop_fail_result = ctype == OP_NOTPROP; |
|
2862 prop_type = *ecode++; |
|
2863 prop_value = *ecode++; |
|
2864 } |
|
2865 else prop_type = -1; |
|
2866 #endif |
|
2867 |
|
2868 /* First, ensure the minimum number of matches are present. Use inline |
|
2869 code for maximizing the speed, and do the type test once at the start |
|
2870 (i.e. keep it out of the loop). Also we can test that there are at least |
|
2871 the minimum number of bytes before we start. This isn't as effective in |
|
2872 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that |
|
2873 is tidier. Also separate the UCP code, which can be the same for both UTF-8 |
|
2874 and single-bytes. */ |
|
2875 |
|
2876 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); |
|
2877 if (min > 0) |
|
2878 { |
|
2879 #ifdef SUPPORT_UCP |
|
2880 if (prop_type >= 0) |
|
2881 { |
|
2882 switch(prop_type) |
|
2883 { |
|
2884 case PT_ANY: |
|
2885 if (prop_fail_result) RRETURN(MATCH_NOMATCH); |
|
2886 for (i = 1; i <= min; i++) |
|
2887 { |
|
2888 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2889 GETCHARINCTEST(c, eptr); |
|
2890 } |
|
2891 break; |
|
2892 |
|
2893 case PT_LAMP: |
|
2894 for (i = 1; i <= min; i++) |
|
2895 { |
|
2896 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2897 GETCHARINCTEST(c, eptr); |
|
2898 prop_chartype = UCD_CHARTYPE(c); |
|
2899 if ((prop_chartype == ucp_Lu || |
|
2900 prop_chartype == ucp_Ll || |
|
2901 prop_chartype == ucp_Lt) == prop_fail_result) |
|
2902 RRETURN(MATCH_NOMATCH); |
|
2903 } |
|
2904 break; |
|
2905 |
|
2906 case PT_GC: |
|
2907 for (i = 1; i <= min; i++) |
|
2908 { |
|
2909 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2910 GETCHARINCTEST(c, eptr); |
|
2911 prop_category = UCD_CATEGORY(c); |
|
2912 if ((prop_category == prop_value) == prop_fail_result) |
|
2913 RRETURN(MATCH_NOMATCH); |
|
2914 } |
|
2915 break; |
|
2916 |
|
2917 case PT_PC: |
|
2918 for (i = 1; i <= min; i++) |
|
2919 { |
|
2920 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2921 GETCHARINCTEST(c, eptr); |
|
2922 prop_chartype = UCD_CHARTYPE(c); |
|
2923 if ((prop_chartype == prop_value) == prop_fail_result) |
|
2924 RRETURN(MATCH_NOMATCH); |
|
2925 } |
|
2926 break; |
|
2927 |
|
2928 case PT_SC: |
|
2929 for (i = 1; i <= min; i++) |
|
2930 { |
|
2931 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2932 GETCHARINCTEST(c, eptr); |
|
2933 prop_script = UCD_SCRIPT(c); |
|
2934 if ((prop_script == prop_value) == prop_fail_result) |
|
2935 RRETURN(MATCH_NOMATCH); |
|
2936 } |
|
2937 break; |
|
2938 |
|
2939 default: |
|
2940 RRETURN(PCRE_ERROR_INTERNAL); |
|
2941 } |
|
2942 } |
|
2943 |
|
2944 /* Match extended Unicode sequences. We will get here only if the |
|
2945 support is in the binary; otherwise a compile-time error occurs. */ |
|
2946 |
|
2947 else if (ctype == OP_EXTUNI) |
|
2948 { |
|
2949 for (i = 1; i <= min; i++) |
|
2950 { |
|
2951 GETCHARINCTEST(c, eptr); |
|
2952 prop_category = UCD_CATEGORY(c); |
|
2953 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); |
|
2954 while (eptr < md->end_subject) |
|
2955 { |
|
2956 int len = 1; |
|
2957 if (!utf8) c = *eptr; else |
|
2958 { |
|
2959 GETCHARLEN(c, eptr, len); |
|
2960 } |
|
2961 prop_category = UCD_CATEGORY(c); |
|
2962 if (prop_category != ucp_M) break; |
|
2963 eptr += len; |
|
2964 } |
|
2965 } |
|
2966 } |
|
2967 |
|
2968 else |
|
2969 #endif /* SUPPORT_UCP */ |
|
2970 |
|
2971 /* Handle all other cases when the coding is UTF-8 */ |
|
2972 |
|
2973 #ifdef SUPPORT_UTF8 |
|
2974 if (utf8) switch(ctype) |
|
2975 { |
|
2976 case OP_ANY: |
|
2977 for (i = 1; i <= min; i++) |
|
2978 { |
|
2979 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) |
|
2980 RRETURN(MATCH_NOMATCH); |
|
2981 eptr++; |
|
2982 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; |
|
2983 } |
|
2984 break; |
|
2985 |
|
2986 case OP_ALLANY: |
|
2987 for (i = 1; i <= min; i++) |
|
2988 { |
|
2989 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
2990 eptr++; |
|
2991 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; |
|
2992 } |
|
2993 break; |
|
2994 |
|
2995 case OP_ANYBYTE: |
|
2996 eptr += min; |
|
2997 break; |
|
2998 |
|
2999 case OP_ANYNL: |
|
3000 for (i = 1; i <= min; i++) |
|
3001 { |
|
3002 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
3003 GETCHARINC(c, eptr); |
|
3004 switch(c) |
|
3005 { |
|
3006 default: RRETURN(MATCH_NOMATCH); |
|
3007 case 0x000d: |
|
3008 if (eptr < md->end_subject && *eptr == 0x0a) eptr++; |
|
3009 break; |
|
3010 |
|
3011 case 0x000a: |
|
3012 break; |
|
3013 |
|
3014 case 0x000b: |
|
3015 case 0x000c: |
|
3016 case 0x0085: |
|
3017 case 0x2028: |
|
3018 case 0x2029: |
|
3019 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); |
|
3020 break; |
|
3021 } |
|
3022 } |
|
3023 break; |
|
3024 |
|
3025 case OP_NOT_HSPACE: |
|
3026 for (i = 1; i <= min; i++) |
|
3027 { |
|
3028 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
3029 GETCHARINC(c, eptr); |
|
3030 switch(c) |
|
3031 { |
|
3032 default: break; |
|
3033 case 0x09: /* HT */ |
|
3034 case 0x20: /* SPACE */ |
|
3035 case 0xa0: /* NBSP */ |
|
3036 case 0x1680: /* OGHAM SPACE MARK */ |
|
3037 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ |
|
3038 case 0x2000: /* EN QUAD */ |
|
3039 case 0x2001: /* EM QUAD */ |
|
3040 case 0x2002: /* EN SPACE */ |
|
3041 case 0x2003: /* EM SPACE */ |
|
3042 case 0x2004: /* THREE-PER-EM SPACE */ |
|
3043 case 0x2005: /* FOUR-PER-EM SPACE */ |
|
3044 case 0x2006: /* SIX-PER-EM SPACE */ |
|
3045 case 0x2007: /* FIGURE SPACE */ |
|
3046 case 0x2008: /* PUNCTUATION SPACE */ |
|
3047 case 0x2009: /* THIN SPACE */ |
|
3048 case 0x200A: /* HAIR SPACE */ |
|
3049 case 0x202f: /* NARROW NO-BREAK SPACE */ |
|
3050 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ |
|
3051 case 0x3000: /* IDEOGRAPHIC SPACE */ |
|
3052 RRETURN(MATCH_NOMATCH); |
|
3053 } |
|
3054 } |
|
3055 break; |
|
3056 |
|
3057 case OP_HSPACE: |
|
3058 for (i = 1; i <= min; i++) |
|
3059 { |
|
3060 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
3061 GETCHARINC(c, eptr); |
|
3062 switch(c) |
|
3063 { |
|
3064 default: RRETURN(MATCH_NOMATCH); |
|
3065 case 0x09: /* HT */ |
|
3066 case 0x20: /* SPACE */ |
|
3067 case 0xa0: /* NBSP */ |
|
3068 case 0x1680: /* OGHAM SPACE MARK */ |
|
3069 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ |
|
3070 case 0x2000: /* EN QUAD */ |
|
3071 case 0x2001: /* EM QUAD */ |
|
3072 case 0x2002: /* EN SPACE */ |
|
3073 case 0x2003: /* EM SPACE */ |
|
3074 case 0x2004: /* THREE-PER-EM SPACE */ |
|
3075 case 0x2005: /* FOUR-PER-EM SPACE */ |
|
3076 case 0x2006: /* SIX-PER-EM SPACE */ |
|
3077 case 0x2007: /* FIGURE SPACE */ |
|
3078 case 0x2008: /* PUNCTUATION SPACE */ |
|
3079 case 0x2009: /* THIN SPACE */ |
|
3080 case 0x200A: /* HAIR SPACE */ |
|
3081 case 0x202f: /* NARROW NO-BREAK SPACE */ |
|
3082 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ |
|
3083 case 0x3000: /* IDEOGRAPHIC SPACE */ |
|
3084 break; |
|
3085 } |
|
3086 } |
|
3087 break; |
|
3088 |
|
3089 case OP_NOT_VSPACE: |
|
3090 for (i = 1; i <= min; i++) |
|
3091 { |
|
3092 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
3093 GETCHARINC(c, eptr); |
|
3094 switch(c) |
|
3095 { |
|
3096 default: break; |
|
3097 case 0x0a: /* LF */ |
|
3098 case 0x0b: /* VT */ |
|
3099 case 0x0c: /* FF */ |
|
3100 case 0x0d: /* CR */ |
|
3101 case 0x85: /* NEL */ |
|
3102 case 0x2028: /* LINE SEPARATOR */ |
|
3103 case 0x2029: /* PARAGRAPH SEPARATOR */ |
|
3104 RRETURN(MATCH_NOMATCH); |
|
3105 } |
|
3106 } |
|
3107 break; |
|
3108 |
|
3109 case OP_VSPACE: |
|
3110 for (i = 1; i <= min; i++) |
|
3111 { |
|
3112 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
3113 GETCHARINC(c, eptr); |
|
3114 switch(c) |
|
3115 { |
|
3116 default: RRETURN(MATCH_NOMATCH); |
|
3117 case 0x0a: /* LF */ |
|
3118 case 0x0b: /* VT */ |
|
3119 case 0x0c: /* FF */ |
|
3120 case 0x0d: /* CR */ |
|
3121 case 0x85: /* NEL */ |
|
3122 case 0x2028: /* LINE SEPARATOR */ |
|
3123 case 0x2029: /* PARAGRAPH SEPARATOR */ |
|
3124 break; |
|
3125 } |
|
3126 } |
|
3127 break; |
|
3128 |
|
3129 case OP_NOT_DIGIT: |
|
3130 for (i = 1; i <= min; i++) |
|
3131 { |
|
3132 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
3133 GETCHARINC(c, eptr); |
|
3134 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0) |
|
3135 RRETURN(MATCH_NOMATCH); |
|
3136 } |
|
3137 break; |
|
3138 |
|
3139 case OP_DIGIT: |
|
3140 for (i = 1; i <= min; i++) |
|
3141 { |
|
3142 if (eptr >= md->end_subject || |
|
3143 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0) |
|
3144 RRETURN(MATCH_NOMATCH); |
|
3145 /* No need to skip more bytes - we know it's a 1-byte character */ |
|
3146 } |
|
3147 break; |
|
3148 |
|
3149 case OP_NOT_WHITESPACE: |
|
3150 for (i = 1; i <= min; i++) |
|
3151 { |
|
3152 if (eptr >= md->end_subject || |
|
3153 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)) |
|
3154 RRETURN(MATCH_NOMATCH); |
|
3155 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); |
|
3156 } |
|
3157 break; |
|
3158 |
|
3159 case OP_WHITESPACE: |
|
3160 for (i = 1; i <= min; i++) |
|
3161 { |
|
3162 if (eptr >= md->end_subject || |
|
3163 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0) |
|
3164 RRETURN(MATCH_NOMATCH); |
|
3165 /* No need to skip more bytes - we know it's a 1-byte character */ |
|
3166 } |
|
3167 break; |
|
3168 |
|
3169 case OP_NOT_WORDCHAR: |
|
3170 for (i = 1; i <= min; i++) |
|
3171 { |
|
3172 if (eptr >= md->end_subject || |
|
3173 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)) |
|
3174 RRETURN(MATCH_NOMATCH); |
|
3175 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); |
|
3176 } |
|
3177 break; |
|
3178 |
|
3179 case OP_WORDCHAR: |
|
3180 for (i = 1; i <= min; i++) |
|
3181 { |
|
3182 if (eptr >= md->end_subject || |
|
3183 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0) |
|
3184 RRETURN(MATCH_NOMATCH); |
|
3185 /* No need to skip more bytes - we know it's a 1-byte character */ |
|
3186 } |
|
3187 break; |
|
3188 |
|
3189 default: |
|
3190 RRETURN(PCRE_ERROR_INTERNAL); |
|
3191 } /* End switch(ctype) */ |
|
3192 |
|
3193 else |
|
3194 #endif /* SUPPORT_UTF8 */ |
|
3195 |
|
3196 /* Code for the non-UTF-8 case for minimum matching of operators other |
|
3197 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum |
|
3198 number of bytes present, as this was tested above. */ |
|
3199 |
|
3200 switch(ctype) |
|
3201 { |
|
3202 case OP_ANY: |
|
3203 for (i = 1; i <= min; i++) |
|
3204 { |
|
3205 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); |
|
3206 eptr++; |
|
3207 } |
|
3208 break; |
|
3209 |
|
3210 case OP_ALLANY: |
|
3211 eptr += min; |
|
3212 break; |
|
3213 |
|
3214 case OP_ANYBYTE: |
|
3215 eptr += min; |
|
3216 break; |
|
3217 |
|
3218 /* Because of the CRLF case, we can't assume the minimum number of |
|
3219 bytes are present in this case. */ |
|
3220 |
|
3221 case OP_ANYNL: |
|
3222 for (i = 1; i <= min; i++) |
|
3223 { |
|
3224 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
3225 switch(*eptr++) |
|
3226 { |
|
3227 default: RRETURN(MATCH_NOMATCH); |
|
3228 case 0x000d: |
|
3229 if (eptr < md->end_subject && *eptr == 0x0a) eptr++; |
|
3230 break; |
|
3231 case 0x000a: |
|
3232 break; |
|
3233 |
|
3234 case 0x000b: |
|
3235 case 0x000c: |
|
3236 case 0x0085: |
|
3237 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); |
|
3238 break; |
|
3239 } |
|
3240 } |
|
3241 break; |
|
3242 |
|
3243 case OP_NOT_HSPACE: |
|
3244 for (i = 1; i <= min; i++) |
|
3245 { |
|
3246 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
3247 switch(*eptr++) |
|
3248 { |
|
3249 default: break; |
|
3250 case 0x09: /* HT */ |
|
3251 case 0x20: /* SPACE */ |
|
3252 case 0xa0: /* NBSP */ |
|
3253 RRETURN(MATCH_NOMATCH); |
|
3254 } |
|
3255 } |
|
3256 break; |
|
3257 |
|
3258 case OP_HSPACE: |
|
3259 for (i = 1; i <= min; i++) |
|
3260 { |
|
3261 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
3262 switch(*eptr++) |
|
3263 { |
|
3264 default: RRETURN(MATCH_NOMATCH); |
|
3265 case 0x09: /* HT */ |
|
3266 case 0x20: /* SPACE */ |
|
3267 case 0xa0: /* NBSP */ |
|
3268 break; |
|
3269 } |
|
3270 } |
|
3271 break; |
|
3272 |
|
3273 case OP_NOT_VSPACE: |
|
3274 for (i = 1; i <= min; i++) |
|
3275 { |
|
3276 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
3277 switch(*eptr++) |
|
3278 { |
|
3279 default: break; |
|
3280 case 0x0a: /* LF */ |
|
3281 case 0x0b: /* VT */ |
|
3282 case 0x0c: /* FF */ |
|
3283 case 0x0d: /* CR */ |
|
3284 case 0x85: /* NEL */ |
|
3285 RRETURN(MATCH_NOMATCH); |
|
3286 } |
|
3287 } |
|
3288 break; |
|
3289 |
|
3290 case OP_VSPACE: |
|
3291 for (i = 1; i <= min; i++) |
|
3292 { |
|
3293 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
3294 switch(*eptr++) |
|
3295 { |
|
3296 default: RRETURN(MATCH_NOMATCH); |
|
3297 case 0x0a: /* LF */ |
|
3298 case 0x0b: /* VT */ |
|
3299 case 0x0c: /* FF */ |
|
3300 case 0x0d: /* CR */ |
|
3301 case 0x85: /* NEL */ |
|
3302 break; |
|
3303 } |
|
3304 } |
|
3305 break; |
|
3306 |
|
3307 case OP_NOT_DIGIT: |
|
3308 for (i = 1; i <= min; i++) |
|
3309 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); |
|
3310 break; |
|
3311 |
|
3312 case OP_DIGIT: |
|
3313 for (i = 1; i <= min; i++) |
|
3314 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); |
|
3315 break; |
|
3316 |
|
3317 case OP_NOT_WHITESPACE: |
|
3318 for (i = 1; i <= min; i++) |
|
3319 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); |
|
3320 break; |
|
3321 |
|
3322 case OP_WHITESPACE: |
|
3323 for (i = 1; i <= min; i++) |
|
3324 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); |
|
3325 break; |
|
3326 |
|
3327 case OP_NOT_WORDCHAR: |
|
3328 for (i = 1; i <= min; i++) |
|
3329 if ((md->ctypes[*eptr++] & ctype_word) != 0) |
|
3330 RRETURN(MATCH_NOMATCH); |
|
3331 break; |
|
3332 |
|
3333 case OP_WORDCHAR: |
|
3334 for (i = 1; i <= min; i++) |
|
3335 if ((md->ctypes[*eptr++] & ctype_word) == 0) |
|
3336 RRETURN(MATCH_NOMATCH); |
|
3337 break; |
|
3338 |
|
3339 default: |
|
3340 RRETURN(PCRE_ERROR_INTERNAL); |
|
3341 } |
|
3342 } |
|
3343 |
|
3344 /* If min = max, continue at the same level without recursing */ |
|
3345 |
|
3346 if (min == max) continue; |
|
3347 |
|
3348 /* If minimizing, we have to test the rest of the pattern before each |
|
3349 subsequent match. Again, separate the UTF-8 case for speed, and also |
|
3350 separate the UCP cases. */ |
|
3351 |
|
3352 if (minimize) |
|
3353 { |
|
3354 #ifdef SUPPORT_UCP |
|
3355 if (prop_type >= 0) |
|
3356 { |
|
3357 switch(prop_type) |
|
3358 { |
|
3359 case PT_ANY: |
|
3360 for (fi = min;; fi++) |
|
3361 { |
|
3362 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36); |
|
3363 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
3364 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
3365 GETCHARINC(c, eptr); |
|
3366 if (prop_fail_result) RRETURN(MATCH_NOMATCH); |
|
3367 } |
|
3368 /* Control never gets here */ |
|
3369 |
|
3370 case PT_LAMP: |
|
3371 for (fi = min;; fi++) |
|
3372 { |
|
3373 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37); |
|
3374 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
3375 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
3376 GETCHARINC(c, eptr); |
|
3377 prop_chartype = UCD_CHARTYPE(c); |
|
3378 if ((prop_chartype == ucp_Lu || |
|
3379 prop_chartype == ucp_Ll || |
|
3380 prop_chartype == ucp_Lt) == prop_fail_result) |
|
3381 RRETURN(MATCH_NOMATCH); |
|
3382 } |
|
3383 /* Control never gets here */ |
|
3384 |
|
3385 case PT_GC: |
|
3386 for (fi = min;; fi++) |
|
3387 { |
|
3388 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38); |
|
3389 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
3390 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
3391 GETCHARINC(c, eptr); |
|
3392 prop_category = UCD_CATEGORY(c); |
|
3393 if ((prop_category == prop_value) == prop_fail_result) |
|
3394 RRETURN(MATCH_NOMATCH); |
|
3395 } |
|
3396 /* Control never gets here */ |
|
3397 |
|
3398 case PT_PC: |
|
3399 for (fi = min;; fi++) |
|
3400 { |
|
3401 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39); |
|
3402 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
3403 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
3404 GETCHARINC(c, eptr); |
|
3405 prop_chartype = UCD_CHARTYPE(c); |
|
3406 if ((prop_chartype == prop_value) == prop_fail_result) |
|
3407 RRETURN(MATCH_NOMATCH); |
|
3408 } |
|
3409 /* Control never gets here */ |
|
3410 |
|
3411 case PT_SC: |
|
3412 for (fi = min;; fi++) |
|
3413 { |
|
3414 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40); |
|
3415 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
3416 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
3417 GETCHARINC(c, eptr); |
|
3418 prop_script = UCD_SCRIPT(c); |
|
3419 if ((prop_script == prop_value) == prop_fail_result) |
|
3420 RRETURN(MATCH_NOMATCH); |
|
3421 } |
|
3422 /* Control never gets here */ |
|
3423 |
|
3424 default: |
|
3425 RRETURN(PCRE_ERROR_INTERNAL); |
|
3426 } |
|
3427 } |
|
3428 |
|
3429 /* Match extended Unicode sequences. We will get here only if the |
|
3430 support is in the binary; otherwise a compile-time error occurs. */ |
|
3431 |
|
3432 else if (ctype == OP_EXTUNI) |
|
3433 { |
|
3434 for (fi = min;; fi++) |
|
3435 { |
|
3436 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41); |
|
3437 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
3438 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); |
|
3439 GETCHARINCTEST(c, eptr); |
|
3440 prop_category = UCD_CATEGORY(c); |
|
3441 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); |
|
3442 while (eptr < md->end_subject) |
|
3443 { |
|
3444 int len = 1; |
|
3445 if (!utf8) c = *eptr; else |
|
3446 { |
|
3447 GETCHARLEN(c, eptr, len); |
|
3448 } |
|
3449 prop_category = UCD_CATEGORY(c); |
|
3450 if (prop_category != ucp_M) break; |
|
3451 eptr += len; |
|
3452 } |
|
3453 } |
|
3454 } |
|
3455 |
|
3456 else |
|
3457 #endif /* SUPPORT_UCP */ |
|
3458 |
|
3459 #ifdef SUPPORT_UTF8 |
|
3460 /* UTF-8 mode */ |
|
3461 if (utf8) |
|
3462 { |
|
3463 for (fi = min;; fi++) |
|
3464 { |
|
3465 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42); |
|
3466 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
3467 if (fi >= max || eptr >= md->end_subject || |
|
3468 (ctype == OP_ANY && IS_NEWLINE(eptr))) |
|
3469 RRETURN(MATCH_NOMATCH); |
|
3470 |
|
3471 GETCHARINC(c, eptr); |
|
3472 switch(ctype) |
|
3473 { |
|
3474 case OP_ANY: /* This is the non-NL case */ |
|
3475 case OP_ALLANY: |
|
3476 case OP_ANYBYTE: |
|
3477 break; |
|
3478 |
|
3479 case OP_ANYNL: |
|
3480 switch(c) |
|
3481 { |
|
3482 default: RRETURN(MATCH_NOMATCH); |
|
3483 case 0x000d: |
|
3484 if (eptr < md->end_subject && *eptr == 0x0a) eptr++; |
|
3485 break; |
|
3486 case 0x000a: |
|
3487 break; |
|
3488 |
|
3489 case 0x000b: |
|
3490 case 0x000c: |
|
3491 case 0x0085: |
|
3492 case 0x2028: |
|
3493 case 0x2029: |
|
3494 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); |
|
3495 break; |
|
3496 } |
|
3497 break; |
|
3498 |
|
3499 case OP_NOT_HSPACE: |
|
3500 switch(c) |
|
3501 { |
|
3502 default: break; |
|
3503 case 0x09: /* HT */ |
|
3504 case 0x20: /* SPACE */ |
|
3505 case 0xa0: /* NBSP */ |
|
3506 case 0x1680: /* OGHAM SPACE MARK */ |
|
3507 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ |
|
3508 case 0x2000: /* EN QUAD */ |
|
3509 case 0x2001: /* EM QUAD */ |
|
3510 case 0x2002: /* EN SPACE */ |
|
3511 case 0x2003: /* EM SPACE */ |
|
3512 case 0x2004: /* THREE-PER-EM SPACE */ |
|
3513 case 0x2005: /* FOUR-PER-EM SPACE */ |
|
3514 case 0x2006: /* SIX-PER-EM SPACE */ |
|
3515 case 0x2007: /* FIGURE SPACE */ |
|
3516 case 0x2008: /* PUNCTUATION SPACE */ |
|
3517 case 0x2009: /* THIN SPACE */ |
|
3518 case 0x200A: /* HAIR SPACE */ |
|
3519 case 0x202f: /* NARROW NO-BREAK SPACE */ |
|
3520 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ |
|
3521 case 0x3000: /* IDEOGRAPHIC SPACE */ |
|
3522 RRETURN(MATCH_NOMATCH); |
|
3523 } |
|
3524 break; |
|
3525 |
|
3526 case OP_HSPACE: |
|
3527 switch(c) |
|
3528 { |
|
3529 default: RRETURN(MATCH_NOMATCH); |
|
3530 case 0x09: /* HT */ |
|
3531 case 0x20: /* SPACE */ |
|
3532 case 0xa0: /* NBSP */ |
|
3533 case 0x1680: /* OGHAM SPACE MARK */ |
|
3534 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ |
|
3535 case 0x2000: /* EN QUAD */ |
|
3536 case 0x2001: /* EM QUAD */ |
|
3537 case 0x2002: /* EN SPACE */ |
|
3538 case 0x2003: /* EM SPACE */ |
|
3539 case 0x2004: /* THREE-PER-EM SPACE */ |
|
3540 case 0x2005: /* FOUR-PER-EM SPACE */ |
|
3541 case 0x2006: /* SIX-PER-EM SPACE */ |
|
3542 case 0x2007: /* FIGURE SPACE */ |
|
3543 case 0x2008: /* PUNCTUATION SPACE */ |
|
3544 case 0x2009: /* THIN SPACE */ |
|
3545 case 0x200A: /* HAIR SPACE */ |
|
3546 case 0x202f: /* NARROW NO-BREAK SPACE */ |
|
3547 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ |
|
3548 case 0x3000: /* IDEOGRAPHIC SPACE */ |
|
3549 break; |
|
3550 } |
|
3551 break; |
|
3552 |
|
3553 case OP_NOT_VSPACE: |
|
3554 switch(c) |
|
3555 { |
|
3556 default: break; |
|
3557 case 0x0a: /* LF */ |
|
3558 case 0x0b: /* VT */ |
|
3559 case 0x0c: /* FF */ |
|
3560 case 0x0d: /* CR */ |
|
3561 case 0x85: /* NEL */ |
|
3562 case 0x2028: /* LINE SEPARATOR */ |
|
3563 case 0x2029: /* PARAGRAPH SEPARATOR */ |
|
3564 RRETURN(MATCH_NOMATCH); |
|
3565 } |
|
3566 break; |
|
3567 |
|
3568 case OP_VSPACE: |
|
3569 switch(c) |
|
3570 { |
|
3571 default: RRETURN(MATCH_NOMATCH); |
|
3572 case 0x0a: /* LF */ |
|
3573 case 0x0b: /* VT */ |
|
3574 case 0x0c: /* FF */ |
|
3575 case 0x0d: /* CR */ |
|
3576 case 0x85: /* NEL */ |
|
3577 case 0x2028: /* LINE SEPARATOR */ |
|
3578 case 0x2029: /* PARAGRAPH SEPARATOR */ |
|
3579 break; |
|
3580 } |
|
3581 break; |
|
3582 |
|
3583 case OP_NOT_DIGIT: |
|
3584 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) |
|
3585 RRETURN(MATCH_NOMATCH); |
|
3586 break; |
|
3587 |
|
3588 case OP_DIGIT: |
|
3589 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0) |
|
3590 RRETURN(MATCH_NOMATCH); |
|
3591 break; |
|
3592 |
|
3593 case OP_NOT_WHITESPACE: |
|
3594 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) |
|
3595 RRETURN(MATCH_NOMATCH); |
|
3596 break; |
|
3597 |
|
3598 case OP_WHITESPACE: |
|
3599 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0) |
|
3600 RRETURN(MATCH_NOMATCH); |
|
3601 break; |
|
3602 |
|
3603 case OP_NOT_WORDCHAR: |
|
3604 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) |
|
3605 RRETURN(MATCH_NOMATCH); |
|
3606 break; |
|
3607 |
|
3608 case OP_WORDCHAR: |
|
3609 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) |
|
3610 RRETURN(MATCH_NOMATCH); |
|
3611 break; |
|
3612 |
|
3613 default: |
|
3614 RRETURN(PCRE_ERROR_INTERNAL); |
|
3615 } |
|
3616 } |
|
3617 } |
|
3618 else |
|
3619 #endif |
|
3620 /* Not UTF-8 mode */ |
|
3621 { |
|
3622 for (fi = min;; fi++) |
|
3623 { |
|
3624 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43); |
|
3625 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
3626 if (fi >= max || eptr >= md->end_subject || |
|
3627 (ctype == OP_ANY && IS_NEWLINE(eptr))) |
|
3628 RRETURN(MATCH_NOMATCH); |
|
3629 |
|
3630 c = *eptr++; |
|
3631 switch(ctype) |
|
3632 { |
|
3633 case OP_ANY: /* This is the non-NL case */ |
|
3634 case OP_ALLANY: |
|
3635 case OP_ANYBYTE: |
|
3636 break; |
|
3637 |
|
3638 case OP_ANYNL: |
|
3639 switch(c) |
|
3640 { |
|
3641 default: RRETURN(MATCH_NOMATCH); |
|
3642 case 0x000d: |
|
3643 if (eptr < md->end_subject && *eptr == 0x0a) eptr++; |
|
3644 break; |
|
3645 |
|
3646 case 0x000a: |
|
3647 break; |
|
3648 |
|
3649 case 0x000b: |
|
3650 case 0x000c: |
|
3651 case 0x0085: |
|
3652 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); |
|
3653 break; |
|
3654 } |
|
3655 break; |
|
3656 |
|
3657 case OP_NOT_HSPACE: |
|
3658 switch(c) |
|
3659 { |
|
3660 default: break; |
|
3661 case 0x09: /* HT */ |
|
3662 case 0x20: /* SPACE */ |
|
3663 case 0xa0: /* NBSP */ |
|
3664 RRETURN(MATCH_NOMATCH); |
|
3665 } |
|
3666 break; |
|
3667 |
|
3668 case OP_HSPACE: |
|
3669 switch(c) |
|
3670 { |
|
3671 default: RRETURN(MATCH_NOMATCH); |
|
3672 case 0x09: /* HT */ |
|
3673 case 0x20: /* SPACE */ |
|
3674 case 0xa0: /* NBSP */ |
|
3675 break; |
|
3676 } |
|
3677 break; |
|
3678 |
|
3679 case OP_NOT_VSPACE: |
|
3680 switch(c) |
|
3681 { |
|
3682 default: break; |
|
3683 case 0x0a: /* LF */ |
|
3684 case 0x0b: /* VT */ |
|
3685 case 0x0c: /* FF */ |
|
3686 case 0x0d: /* CR */ |
|
3687 case 0x85: /* NEL */ |
|
3688 RRETURN(MATCH_NOMATCH); |
|
3689 } |
|
3690 break; |
|
3691 |
|
3692 case OP_VSPACE: |
|
3693 switch(c) |
|
3694 { |
|
3695 default: RRETURN(MATCH_NOMATCH); |
|
3696 case 0x0a: /* LF */ |
|
3697 case 0x0b: /* VT */ |
|
3698 case 0x0c: /* FF */ |
|
3699 case 0x0d: /* CR */ |
|
3700 case 0x85: /* NEL */ |
|
3701 break; |
|
3702 } |
|
3703 break; |
|
3704 |
|
3705 case OP_NOT_DIGIT: |
|
3706 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); |
|
3707 break; |
|
3708 |
|
3709 case OP_DIGIT: |
|
3710 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); |
|
3711 break; |
|
3712 |
|
3713 case OP_NOT_WHITESPACE: |
|
3714 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); |
|
3715 break; |
|
3716 |
|
3717 case OP_WHITESPACE: |
|
3718 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); |
|
3719 break; |
|
3720 |
|
3721 case OP_NOT_WORDCHAR: |
|
3722 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); |
|
3723 break; |
|
3724 |
|
3725 case OP_WORDCHAR: |
|
3726 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); |
|
3727 break; |
|
3728 |
|
3729 default: |
|
3730 RRETURN(PCRE_ERROR_INTERNAL); |
|
3731 } |
|
3732 } |
|
3733 } |
|
3734 /* Control never gets here */ |
|
3735 } |
|
3736 |
|
3737 /* If maximizing, it is worth using inline code for speed, doing the type |
|
3738 test once at the start (i.e. keep it out of the loop). Again, keep the |
|
3739 UTF-8 and UCP stuff separate. */ |
|
3740 |
|
3741 else |
|
3742 { |
|
3743 pp = eptr; /* Remember where we started */ |
|
3744 |
|
3745 #ifdef SUPPORT_UCP |
|
3746 if (prop_type >= 0) |
|
3747 { |
|
3748 switch(prop_type) |
|
3749 { |
|
3750 case PT_ANY: |
|
3751 for (i = min; i < max; i++) |
|
3752 { |
|
3753 int len = 1; |
|
3754 if (eptr >= md->end_subject) break; |
|
3755 GETCHARLEN(c, eptr, len); |
|
3756 if (prop_fail_result) break; |
|
3757 eptr+= len; |
|
3758 } |
|
3759 break; |
|
3760 |
|
3761 case PT_LAMP: |
|
3762 for (i = min; i < max; i++) |
|
3763 { |
|
3764 int len = 1; |
|
3765 if (eptr >= md->end_subject) break; |
|
3766 GETCHARLEN(c, eptr, len); |
|
3767 prop_chartype = UCD_CHARTYPE(c); |
|
3768 if ((prop_chartype == ucp_Lu || |
|
3769 prop_chartype == ucp_Ll || |
|
3770 prop_chartype == ucp_Lt) == prop_fail_result) |
|
3771 break; |
|
3772 eptr+= len; |
|
3773 } |
|
3774 break; |
|
3775 |
|
3776 case PT_GC: |
|
3777 for (i = min; i < max; i++) |
|
3778 { |
|
3779 int len = 1; |
|
3780 if (eptr >= md->end_subject) break; |
|
3781 GETCHARLEN(c, eptr, len); |
|
3782 prop_category = UCD_CATEGORY(c); |
|
3783 if ((prop_category == prop_value) == prop_fail_result) |
|
3784 break; |
|
3785 eptr+= len; |
|
3786 } |
|
3787 break; |
|
3788 |
|
3789 case PT_PC: |
|
3790 for (i = min; i < max; i++) |
|
3791 { |
|
3792 int len = 1; |
|
3793 if (eptr >= md->end_subject) break; |
|
3794 GETCHARLEN(c, eptr, len); |
|
3795 prop_chartype = UCD_CHARTYPE(c); |
|
3796 if ((prop_chartype == prop_value) == prop_fail_result) |
|
3797 break; |
|
3798 eptr+= len; |
|
3799 } |
|
3800 break; |
|
3801 |
|
3802 case PT_SC: |
|
3803 for (i = min; i < max; i++) |
|
3804 { |
|
3805 int len = 1; |
|
3806 if (eptr >= md->end_subject) break; |
|
3807 GETCHARLEN(c, eptr, len); |
|
3808 prop_script = UCD_SCRIPT(c); |
|
3809 if ((prop_script == prop_value) == prop_fail_result) |
|
3810 break; |
|
3811 eptr+= len; |
|
3812 } |
|
3813 break; |
|
3814 } |
|
3815 |
|
3816 /* eptr is now past the end of the maximum run */ |
|
3817 |
|
3818 if (possessive) continue; |
|
3819 for(;;) |
|
3820 { |
|
3821 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44); |
|
3822 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
3823 if (eptr-- == pp) break; /* Stop if tried at original pos */ |
|
3824 if (utf8) BACKCHAR(eptr); |
|
3825 } |
|
3826 } |
|
3827 |
|
3828 /* Match extended Unicode sequences. We will get here only if the |
|
3829 support is in the binary; otherwise a compile-time error occurs. */ |
|
3830 |
|
3831 else if (ctype == OP_EXTUNI) |
|
3832 { |
|
3833 for (i = min; i < max; i++) |
|
3834 { |
|
3835 if (eptr >= md->end_subject) break; |
|
3836 GETCHARINCTEST(c, eptr); |
|
3837 prop_category = UCD_CATEGORY(c); |
|
3838 if (prop_category == ucp_M) break; |
|
3839 while (eptr < md->end_subject) |
|
3840 { |
|
3841 int len = 1; |
|
3842 if (!utf8) c = *eptr; else |
|
3843 { |
|
3844 GETCHARLEN(c, eptr, len); |
|
3845 } |
|
3846 prop_category = UCD_CATEGORY(c); |
|
3847 if (prop_category != ucp_M) break; |
|
3848 eptr += len; |
|
3849 } |
|
3850 } |
|
3851 |
|
3852 /* eptr is now past the end of the maximum run */ |
|
3853 |
|
3854 if (possessive) continue; |
|
3855 for(;;) |
|
3856 { |
|
3857 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45); |
|
3858 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
3859 if (eptr-- == pp) break; /* Stop if tried at original pos */ |
|
3860 for (;;) /* Move back over one extended */ |
|
3861 { |
|
3862 int len = 1; |
|
3863 if (!utf8) c = *eptr; else |
|
3864 { |
|
3865 BACKCHAR(eptr); |
|
3866 GETCHARLEN(c, eptr, len); |
|
3867 } |
|
3868 prop_category = UCD_CATEGORY(c); |
|
3869 if (prop_category != ucp_M) break; |
|
3870 eptr--; |
|
3871 } |
|
3872 } |
|
3873 } |
|
3874 |
|
3875 else |
|
3876 #endif /* SUPPORT_UCP */ |
|
3877 |
|
3878 #ifdef SUPPORT_UTF8 |
|
3879 /* UTF-8 mode */ |
|
3880 |
|
3881 if (utf8) |
|
3882 { |
|
3883 switch(ctype) |
|
3884 { |
|
3885 case OP_ANY: |
|
3886 if (max < INT_MAX) |
|
3887 { |
|
3888 for (i = min; i < max; i++) |
|
3889 { |
|
3890 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; |
|
3891 eptr++; |
|
3892 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; |
|
3893 } |
|
3894 } |
|
3895 |
|
3896 /* Handle unlimited UTF-8 repeat */ |
|
3897 |
|
3898 else |
|
3899 { |
|
3900 for (i = min; i < max; i++) |
|
3901 { |
|
3902 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; |
|
3903 eptr++; |
|
3904 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; |
|
3905 } |
|
3906 } |
|
3907 break; |
|
3908 |
|
3909 case OP_ALLANY: |
|
3910 if (max < INT_MAX) |
|
3911 { |
|
3912 for (i = min; i < max; i++) |
|
3913 { |
|
3914 if (eptr >= md->end_subject) break; |
|
3915 eptr++; |
|
3916 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; |
|
3917 } |
|
3918 } |
|
3919 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */ |
|
3920 break; |
|
3921 |
|
3922 /* The byte case is the same as non-UTF8 */ |
|
3923 |
|
3924 case OP_ANYBYTE: |
|
3925 c = max - min; |
|
3926 if (c > (unsigned int)(md->end_subject - eptr)) |
|
3927 c = md->end_subject - eptr; |
|
3928 eptr += c; |
|
3929 break; |
|
3930 |
|
3931 case OP_ANYNL: |
|
3932 for (i = min; i < max; i++) |
|
3933 { |
|
3934 int len = 1; |
|
3935 if (eptr >= md->end_subject) break; |
|
3936 GETCHARLEN(c, eptr, len); |
|
3937 if (c == 0x000d) |
|
3938 { |
|
3939 if (++eptr >= md->end_subject) break; |
|
3940 if (*eptr == 0x000a) eptr++; |
|
3941 } |
|
3942 else |
|
3943 { |
|
3944 if (c != 0x000a && |
|
3945 (md->bsr_anycrlf || |
|
3946 (c != 0x000b && c != 0x000c && |
|
3947 c != 0x0085 && c != 0x2028 && c != 0x2029))) |
|
3948 break; |
|
3949 eptr += len; |
|
3950 } |
|
3951 } |
|
3952 break; |
|
3953 |
|
3954 case OP_NOT_HSPACE: |
|
3955 case OP_HSPACE: |
|
3956 for (i = min; i < max; i++) |
|
3957 { |
|
3958 BOOL gotspace; |
|
3959 int len = 1; |
|
3960 if (eptr >= md->end_subject) break; |
|
3961 GETCHARLEN(c, eptr, len); |
|
3962 switch(c) |
|
3963 { |
|
3964 default: gotspace = FALSE; break; |
|
3965 case 0x09: /* HT */ |
|
3966 case 0x20: /* SPACE */ |
|
3967 case 0xa0: /* NBSP */ |
|
3968 case 0x1680: /* OGHAM SPACE MARK */ |
|
3969 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ |
|
3970 case 0x2000: /* EN QUAD */ |
|
3971 case 0x2001: /* EM QUAD */ |
|
3972 case 0x2002: /* EN SPACE */ |
|
3973 case 0x2003: /* EM SPACE */ |
|
3974 case 0x2004: /* THREE-PER-EM SPACE */ |
|
3975 case 0x2005: /* FOUR-PER-EM SPACE */ |
|
3976 case 0x2006: /* SIX-PER-EM SPACE */ |
|
3977 case 0x2007: /* FIGURE SPACE */ |
|
3978 case 0x2008: /* PUNCTUATION SPACE */ |
|
3979 case 0x2009: /* THIN SPACE */ |
|
3980 case 0x200A: /* HAIR SPACE */ |
|
3981 case 0x202f: /* NARROW NO-BREAK SPACE */ |
|
3982 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ |
|
3983 case 0x3000: /* IDEOGRAPHIC SPACE */ |
|
3984 gotspace = TRUE; |
|
3985 break; |
|
3986 } |
|
3987 if (gotspace == (ctype == OP_NOT_HSPACE)) break; |
|
3988 eptr += len; |
|
3989 } |
|
3990 break; |
|
3991 |
|
3992 case OP_NOT_VSPACE: |
|
3993 case OP_VSPACE: |
|
3994 for (i = min; i < max; i++) |
|
3995 { |
|
3996 BOOL gotspace; |
|
3997 int len = 1; |
|
3998 if (eptr >= md->end_subject) break; |
|
3999 GETCHARLEN(c, eptr, len); |
|
4000 switch(c) |
|
4001 { |
|
4002 default: gotspace = FALSE; break; |
|
4003 case 0x0a: /* LF */ |
|
4004 case 0x0b: /* VT */ |
|
4005 case 0x0c: /* FF */ |
|
4006 case 0x0d: /* CR */ |
|
4007 case 0x85: /* NEL */ |
|
4008 case 0x2028: /* LINE SEPARATOR */ |
|
4009 case 0x2029: /* PARAGRAPH SEPARATOR */ |
|
4010 gotspace = TRUE; |
|
4011 break; |
|
4012 } |
|
4013 if (gotspace == (ctype == OP_NOT_VSPACE)) break; |
|
4014 eptr += len; |
|
4015 } |
|
4016 break; |
|
4017 |
|
4018 case OP_NOT_DIGIT: |
|
4019 for (i = min; i < max; i++) |
|
4020 { |
|
4021 int len = 1; |
|
4022 if (eptr >= md->end_subject) break; |
|
4023 GETCHARLEN(c, eptr, len); |
|
4024 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break; |
|
4025 eptr+= len; |
|
4026 } |
|
4027 break; |
|
4028 |
|
4029 case OP_DIGIT: |
|
4030 for (i = min; i < max; i++) |
|
4031 { |
|
4032 int len = 1; |
|
4033 if (eptr >= md->end_subject) break; |
|
4034 GETCHARLEN(c, eptr, len); |
|
4035 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break; |
|
4036 eptr+= len; |
|
4037 } |
|
4038 break; |
|
4039 |
|
4040 case OP_NOT_WHITESPACE: |
|
4041 for (i = min; i < max; i++) |
|
4042 { |
|
4043 int len = 1; |
|
4044 if (eptr >= md->end_subject) break; |
|
4045 GETCHARLEN(c, eptr, len); |
|
4046 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break; |
|
4047 eptr+= len; |
|
4048 } |
|
4049 break; |
|
4050 |
|
4051 case OP_WHITESPACE: |
|
4052 for (i = min; i < max; i++) |
|
4053 { |
|
4054 int len = 1; |
|
4055 if (eptr >= md->end_subject) break; |
|
4056 GETCHARLEN(c, eptr, len); |
|
4057 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break; |
|
4058 eptr+= len; |
|
4059 } |
|
4060 break; |
|
4061 |
|
4062 case OP_NOT_WORDCHAR: |
|
4063 for (i = min; i < max; i++) |
|
4064 { |
|
4065 int len = 1; |
|
4066 if (eptr >= md->end_subject) break; |
|
4067 GETCHARLEN(c, eptr, len); |
|
4068 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break; |
|
4069 eptr+= len; |
|
4070 } |
|
4071 break; |
|
4072 |
|
4073 case OP_WORDCHAR: |
|
4074 for (i = min; i < max; i++) |
|
4075 { |
|
4076 int len = 1; |
|
4077 if (eptr >= md->end_subject) break; |
|
4078 GETCHARLEN(c, eptr, len); |
|
4079 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break; |
|
4080 eptr+= len; |
|
4081 } |
|
4082 break; |
|
4083 |
|
4084 default: |
|
4085 RRETURN(PCRE_ERROR_INTERNAL); |
|
4086 } |
|
4087 |
|
4088 /* eptr is now past the end of the maximum run */ |
|
4089 |
|
4090 if (possessive) continue; |
|
4091 for(;;) |
|
4092 { |
|
4093 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46); |
|
4094 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
4095 if (eptr-- == pp) break; /* Stop if tried at original pos */ |
|
4096 BACKCHAR(eptr); |
|
4097 } |
|
4098 } |
|
4099 else |
|
4100 #endif /* SUPPORT_UTF8 */ |
|
4101 |
|
4102 /* Not UTF-8 mode */ |
|
4103 { |
|
4104 switch(ctype) |
|
4105 { |
|
4106 case OP_ANY: |
|
4107 for (i = min; i < max; i++) |
|
4108 { |
|
4109 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; |
|
4110 eptr++; |
|
4111 } |
|
4112 break; |
|
4113 |
|
4114 case OP_ALLANY: |
|
4115 case OP_ANYBYTE: |
|
4116 c = max - min; |
|
4117 if (c > (unsigned int)(md->end_subject - eptr)) |
|
4118 c = md->end_subject - eptr; |
|
4119 eptr += c; |
|
4120 break; |
|
4121 |
|
4122 case OP_ANYNL: |
|
4123 for (i = min; i < max; i++) |
|
4124 { |
|
4125 if (eptr >= md->end_subject) break; |
|
4126 c = *eptr; |
|
4127 if (c == 0x000d) |
|
4128 { |
|
4129 if (++eptr >= md->end_subject) break; |
|
4130 if (*eptr == 0x000a) eptr++; |
|
4131 } |
|
4132 else |
|
4133 { |
|
4134 if (c != 0x000a && |
|
4135 (md->bsr_anycrlf || |
|
4136 (c != 0x000b && c != 0x000c && c != 0x0085))) |
|
4137 break; |
|
4138 eptr++; |
|
4139 } |
|
4140 } |
|
4141 break; |
|
4142 |
|
4143 case OP_NOT_HSPACE: |
|
4144 for (i = min; i < max; i++) |
|
4145 { |
|
4146 if (eptr >= md->end_subject) break; |
|
4147 c = *eptr; |
|
4148 if (c == 0x09 || c == 0x20 || c == 0xa0) break; |
|
4149 eptr++; |
|
4150 } |
|
4151 break; |
|
4152 |
|
4153 case OP_HSPACE: |
|
4154 for (i = min; i < max; i++) |
|
4155 { |
|
4156 if (eptr >= md->end_subject) break; |
|
4157 c = *eptr; |
|
4158 if (c != 0x09 && c != 0x20 && c != 0xa0) break; |
|
4159 eptr++; |
|
4160 } |
|
4161 break; |
|
4162 |
|
4163 case OP_NOT_VSPACE: |
|
4164 for (i = min; i < max; i++) |
|
4165 { |
|
4166 if (eptr >= md->end_subject) break; |
|
4167 c = *eptr; |
|
4168 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85) |
|
4169 break; |
|
4170 eptr++; |
|
4171 } |
|
4172 break; |
|
4173 |
|
4174 case OP_VSPACE: |
|
4175 for (i = min; i < max; i++) |
|
4176 { |
|
4177 if (eptr >= md->end_subject) break; |
|
4178 c = *eptr; |
|
4179 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85) |
|
4180 break; |
|
4181 eptr++; |
|
4182 } |
|
4183 break; |
|
4184 |
|
4185 case OP_NOT_DIGIT: |
|
4186 for (i = min; i < max; i++) |
|
4187 { |
|
4188 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0) |
|
4189 break; |
|
4190 eptr++; |
|
4191 } |
|
4192 break; |
|
4193 |
|
4194 case OP_DIGIT: |
|
4195 for (i = min; i < max; i++) |
|
4196 { |
|
4197 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0) |
|
4198 break; |
|
4199 eptr++; |
|
4200 } |
|
4201 break; |
|
4202 |
|
4203 case OP_NOT_WHITESPACE: |
|
4204 for (i = min; i < max; i++) |
|
4205 { |
|
4206 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0) |
|
4207 break; |
|
4208 eptr++; |
|
4209 } |
|
4210 break; |
|
4211 |
|
4212 case OP_WHITESPACE: |
|
4213 for (i = min; i < max; i++) |
|
4214 { |
|
4215 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0) |
|
4216 break; |
|
4217 eptr++; |
|
4218 } |
|
4219 break; |
|
4220 |
|
4221 case OP_NOT_WORDCHAR: |
|
4222 for (i = min; i < max; i++) |
|
4223 { |
|
4224 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0) |
|
4225 break; |
|
4226 eptr++; |
|
4227 } |
|
4228 break; |
|
4229 |
|
4230 case OP_WORDCHAR: |
|
4231 for (i = min; i < max; i++) |
|
4232 { |
|
4233 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0) |
|
4234 break; |
|
4235 eptr++; |
|
4236 } |
|
4237 break; |
|
4238 |
|
4239 default: |
|
4240 RRETURN(PCRE_ERROR_INTERNAL); |
|
4241 } |
|
4242 |
|
4243 /* eptr is now past the end of the maximum run */ |
|
4244 |
|
4245 if (possessive) continue; |
|
4246 while (eptr >= pp) |
|
4247 { |
|
4248 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47); |
|
4249 eptr--; |
|
4250 if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|
4251 } |
|
4252 } |
|
4253 |
|
4254 /* Get here if we can't make it match with any permitted repetitions */ |
|
4255 |
|
4256 RRETURN(MATCH_NOMATCH); |
|
4257 } |
|
4258 /* Control never gets here */ |
|
4259 |
|
4260 /* There's been some horrible disaster. Arrival here can only mean there is |
|
4261 something seriously wrong in the code above or the OP_xxx definitions. */ |
|
4262 |
|
4263 default: |
|
4264 DPRINTF(("Unknown opcode %d\n", *ecode)); |
|
4265 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE); |
|
4266 } |
|
4267 |
|
4268 /* Do not stick any code in here without much thought; it is assumed |
|
4269 that "continue" in the code above comes out to here to repeat the main |
|
4270 loop. */ |
|
4271 |
|
4272 } /* End of main loop */ |
|
4273 /* Control never reaches here */ |
|
4274 |
|
4275 |
|
4276 /* When compiling to use the heap rather than the stack for recursive calls to |
|
4277 match(), the RRETURN() macro jumps here. The number that is saved in |
|
4278 frame->Xwhere indicates which label we actually want to return to. */ |
|
4279 |
|
4280 #ifdef NO_RECURSE |
|
4281 #define LBL(val) case val: goto L_RM##val; |
|
4282 HEAP_RETURN: |
|
4283 switch (frame->Xwhere) |
|
4284 { |
|
4285 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8) |
|
4286 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17) |
|
4287 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33) |
|
4288 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52) |
|
4289 LBL(53) LBL(54) |
|
4290 #ifdef SUPPORT_UTF8 |
|
4291 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30) |
|
4292 LBL(32) LBL(34) LBL(42) LBL(46) |
|
4293 #ifdef SUPPORT_UCP |
|
4294 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45) |
|
4295 #endif /* SUPPORT_UCP */ |
|
4296 #endif /* SUPPORT_UTF8 */ |
|
4297 default: |
|
4298 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere)); |
|
4299 return PCRE_ERROR_INTERNAL; |
|
4300 } |
|
4301 #undef LBL |
|
4302 #endif /* NO_RECURSE */ |
|
4303 } |
|
4304 |
|
4305 |
|
4306 /*************************************************************************** |
|
4307 **************************************************************************** |
|
4308 RECURSION IN THE match() FUNCTION |
|
4309 |
|
4310 Undefine all the macros that were defined above to handle this. */ |
|
4311 |
|
4312 #ifdef NO_RECURSE /* heap-recursion build of match(): the names below are macros only in this configuration */ |
|
4313 #undef eptr |
|
4314 #undef ecode |
|
4315 #undef mstart |
|
4316 #undef offset_top |
|
4317 #undef ims |
|
4318 #undef eptrb |
|
4319 #undef flags |
|
4320 |
|
4321 #undef callpat |
|
4322 #undef charptr |
|
4323 #undef data |
|
4324 #undef next |
|
4325 #undef pp |
|
4326 #undef prev |
|
4327 #undef saved_eptr |
|
4328 |
|
4329 #undef new_recursive |
|
4330 |
|
4331 #undef cur_is_word |
|
4332 #undef condition |
|
4333 #undef prev_is_word |
|
4334 |
|
4335 #undef original_ims |
|
4336 |
|
4337 #undef ctype |
|
4338 #undef length |
|
4339 #undef max |
|
4340 #undef min |
|
4341 #undef number |
|
4342 #undef offset |
|
4343 #undef op |
|
4344 #undef save_capture_last |
|
4345 #undef save_offset1 |
|
4346 #undef save_offset2 |
|
4347 #undef save_offset3 |
|
4348 #undef stacksave |
|
4349 |
|
4350 #undef newptrb |
|
4351 |
|
4352 #endif /* NO_RECURSE */ |
|
4353 /* From here on these names are plain identifiers again; pcre_exec() below declares variables called ims, flags, length and pp. */ |
|
4354 /* fc and fi are macros in both build variants (NO_RECURSE defined or not), |
|
4355 which is why they are undefined here, outside the #ifdef above. */ |
|
4356 #undef fc |
|
4357 #undef fi |
|
4358 |
|
4359 /*************************************************************************** |
|
4360 ***************************************************************************/ |
|
4361 |
|
4362 |
|
4363 |
|
4364 /************************************************* |
|
4365 * Execute a Regular Expression * |
|
4366 *************************************************/ |
|
4367 |
|
4368 /* This function applies a compiled re to a subject string and picks out |
|
4369 portions of the string if it matches. Two elements in the vector are set for |
|
4370 each substring: the offsets to the start and end of the substring. |
|
4371 |
|
4372 Arguments: |
|
4373 argument_re points to the compiled expression |
|
4374 extra_data points to extra data or is NULL |
|
4375 subject points to the subject string |
|
4376 length length of subject string (may contain binary zeros) |
|
4377 start_offset where to start in the subject string |
|
4378 options option bits |
|
4379 offsets points to a vector of ints to be filled in with offsets |
|
4380 offsetcount the number of elements in the vector |
|
4381 |
|
4382 Returns: > 0 => success; value is the number of offset pairs filled in |
|
4383 = 0 => success, but offsets is not big enough |
|
4384 -1 => failed to match |
|
4385 < -1 => some kind of unexpected problem |
|
4386 */ |
|
4387 |
|
4388 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION |
|
4389 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, |
|
4390 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, |
|
4391 int offsetcount) |
|
4392 { |
|
4393 int rc, resetcount, ocount; |
|
4394 int first_byte = -1; |
|
4395 int req_byte = -1; |
|
4396 int req_byte2 = -1; |
|
4397 int newline; |
|
4398 unsigned long int ims; |
|
4399 BOOL using_temporary_offsets = FALSE; |
|
4400 BOOL anchored; |
|
4401 BOOL startline; |
|
4402 BOOL firstline; |
|
4403 BOOL first_byte_caseless = FALSE; |
|
4404 BOOL req_byte_caseless = FALSE; |
|
4405 BOOL utf8; |
|
4406 match_data match_block; |
|
4407 match_data *md = &match_block; |
|
4408 const uschar *tables; |
|
4409 const uschar *start_bits = NULL; |
|
4410 USPTR start_match = (USPTR)subject + start_offset; |
|
4411 USPTR end_subject; |
|
4412 USPTR req_byte_ptr = start_match - 1; |
|
4413 |
|
4414 pcre_study_data internal_study; |
|
4415 const pcre_study_data *study; |
|
4416 |
|
4417 real_pcre internal_re; |
|
4418 const real_pcre *external_re = (const real_pcre *)argument_re; |
|
4419 const real_pcre *re = external_re; |
|
4420 |
|
4421 /* Plausibility checks */ |
|
4422 |
|
4423 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; |
|
4424 if (re == NULL || subject == NULL || |
|
4425 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; |
|
4426 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; |
|
4427 |
|
4428 /* Fish out the optional data from the extra_data structure, first setting |
|
4429 the default values. */ |
|
4430 |
|
4431 study = NULL; |
|
4432 md->match_limit = MATCH_LIMIT; |
|
4433 md->match_limit_recursion = MATCH_LIMIT_RECURSION; |
|
4434 md->callout_data = NULL; |
|
4435 |
|
4436 /* The table pointer is always in native byte order. */ |
|
4437 |
|
4438 tables = external_re->tables; |
|
4439 |
|
4440 if (extra_data != NULL) |
|
4441 { |
|
4442 register unsigned int flags = extra_data->flags; |
|
4443 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) |
|
4444 study = (const pcre_study_data *)extra_data->study_data; |
|
4445 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) |
|
4446 md->match_limit = extra_data->match_limit; |
|
4447 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) |
|
4448 md->match_limit_recursion = extra_data->match_limit_recursion; |
|
4449 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) |
|
4450 md->callout_data = extra_data->callout_data; |
|
4451 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables; |
|
4452 } |
|
4453 |
|
4454 /* If the exec call supplied NULL for tables, use the inbuilt ones. This |
|
4455 is a feature that makes it possible to save compiled regexes and re-use them |
|
4456 in other programs later. */ |
|
4457 |
|
4458 if (tables == NULL) tables = _pcre_default_tables; |
|
4459 |
|
4460 /* Check that the first field in the block is the magic number. If it is not, |
|
4461 test for a regex that was compiled on a host of opposite endianness. If this is |
|
4462 the case, flipped values are put in internal_re and internal_study if there was |
|
4463 study data too. */ |
|
4464 |
|
4465 if (re->magic_number != MAGIC_NUMBER) |
|
4466 { |
|
4467 re = _pcre_try_flipped(re, &internal_re, study, &internal_study); |
|
4468 if (re == NULL) return PCRE_ERROR_BADMAGIC; |
|
4469 if (study != NULL) study = &internal_study; |
|
4470 } |
|
4471 |
|
4472 /* Set up other data */ |
|
4473 |
|
4474 anchored = ((re->options | options) & PCRE_ANCHORED) != 0; |
|
4475 startline = (re->flags & PCRE_STARTLINE) != 0; |
|
4476 firstline = (re->options & PCRE_FIRSTLINE) != 0; |
|
4477 |
|
4478 /* The code starts after the real_pcre block and the capture name table. */ |
|
4479 |
|
4480 md->start_code = (const uschar *)external_re + re->name_table_offset + |
|
4481 re->name_count * re->name_entry_size; |
|
4482 |
|
4483 md->start_subject = (USPTR)subject; |
|
4484 md->start_offset = start_offset; |
|
4485 md->end_subject = md->start_subject + length; |
|
4486 end_subject = md->end_subject; |
|
4487 |
|
4488 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; |
|
4489 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0; |
|
4490 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; |
|
4491 |
|
4492 md->notbol = (options & PCRE_NOTBOL) != 0; |
|
4493 md->noteol = (options & PCRE_NOTEOL) != 0; |
|
4494 md->notempty = (options & PCRE_NOTEMPTY) != 0; |
|
4495 md->partial = (options & PCRE_PARTIAL) != 0; |
|
4496 md->hitend = FALSE; |
|
4497 |
|
4498 md->recursive = NULL; /* No recursion at top level */ |
|
4499 |
|
4500 md->lcc = tables + lcc_offset; |
|
4501 md->ctypes = tables + ctypes_offset; |
|
4502 |
|
4503 /* Handle different \R options. */ |
|
4504 |
|
4505 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) |
|
4506 { |
|
4507 case 0: |
|
4508 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0) |
|
4509 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0; |
|
4510 else |
|
4511 #ifdef BSR_ANYCRLF |
|
4512 md->bsr_anycrlf = TRUE; |
|
4513 #else |
|
4514 md->bsr_anycrlf = FALSE; |
|
4515 #endif |
|
4516 break; |
|
4517 |
|
4518 case PCRE_BSR_ANYCRLF: |
|
4519 md->bsr_anycrlf = TRUE; |
|
4520 break; |
|
4521 |
|
4522 case PCRE_BSR_UNICODE: |
|
4523 md->bsr_anycrlf = FALSE; |
|
4524 break; |
|
4525 |
|
4526 default: return PCRE_ERROR_BADNEWLINE; |
|
4527 } |
|
4528 |
|
4529 /* Handle different types of newline. The three bits give eight cases. If |
|
4530 nothing is set at run time, whatever was used at compile time applies. */ |
|
4531 |
|
4532 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : |
|
4533 (pcre_uint32)options) & PCRE_NEWLINE_BITS) |
|
4534 { |
|
4535 case 0: newline = NEWLINE; break; /* Compile-time default */ |
|
4536 case PCRE_NEWLINE_CR: newline = '\r'; break; |
|
4537 case PCRE_NEWLINE_LF: newline = '\n'; break; |
|
4538 case PCRE_NEWLINE_CR+ |
|
4539 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; |
|
4540 case PCRE_NEWLINE_ANY: newline = -1; break; |
|
4541 case PCRE_NEWLINE_ANYCRLF: newline = -2; break; |
|
4542 default: return PCRE_ERROR_BADNEWLINE; |
|
4543 } |
|
4544 |
|
4545 if (newline == -2) |
|
4546 { |
|
4547 md->nltype = NLTYPE_ANYCRLF; |
|
4548 } |
|
4549 else if (newline < 0) |
|
4550 { |
|
4551 md->nltype = NLTYPE_ANY; |
|
4552 } |
|
4553 else |
|
4554 { |
|
4555 md->nltype = NLTYPE_FIXED; |
|
4556 if (newline > 255) |
|
4557 { |
|
4558 md->nllen = 2; |
|
4559 md->nl[0] = (newline >> 8) & 255; |
|
4560 md->nl[1] = newline & 255; |
|
4561 } |
|
4562 else |
|
4563 { |
|
4564 md->nllen = 1; |
|
4565 md->nl[0] = newline; |
|
4566 } |
|
4567 } |
|
4568 |
|
4569 /* Partial matching is supported only for a restricted set of regexes at the |
|
4570 moment. */ |
|
4571 |
|
4572 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0) |
|
4573 return PCRE_ERROR_BADPARTIAL; |
|
4574 |
|
4575 /* Check a UTF-8 string if required. Unfortunately there's no way of passing |
|
4576 back the character offset. */ |
|
4577 |
|
4578 #ifdef SUPPORT_UTF8 |
|
4579 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) |
|
4580 { |
|
4581 if (_pcre_valid_utf8((uschar *)subject, length) >= 0) |
|
4582 return PCRE_ERROR_BADUTF8; |
|
4583 if (start_offset > 0 && start_offset < length) |
|
4584 { |
|
4585 int tb = ((uschar *)subject)[start_offset]; |
|
4586 if (tb > 127) |
|
4587 { |
|
4588 tb &= 0xc0; |
|
4589 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET; |
|
4590 } |
|
4591 } |
|
4592 } |
|
4593 #endif |
|
4594 |
|
4595 /* The ims options can vary during the matching as a result of the presence |
|
4596 of (?ims) items in the pattern. They are kept in a local variable so that |
|
4597 restoring at the exit of a group is easy. */ |
|
4598 |
|
4599 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL); |
|
4600 |
|
4601 /* If the expression has got more back references than the offsets supplied can |
|
4602 hold, we get a temporary chunk of working store to use during the matching. |
|
4603 Otherwise, we can use the vector supplied, rounding down its size to a multiple |
|
4604 of 3. */ |
|
4605 |
|
4606 ocount = offsetcount - (offsetcount % 3); |
|
4607 |
|
4608 if (re->top_backref > 0 && re->top_backref >= ocount/3) |
|
4609 { |
|
4610 ocount = re->top_backref * 3 + 3; |
|
4611 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int)); |
|
4612 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY; |
|
4613 using_temporary_offsets = TRUE; |
|
4614 DPRINTF(("Got memory to hold back references\n")); |
|
4615 } |
|
4616 else md->offset_vector = offsets; |
|
4617 |
|
4618 md->offset_end = ocount; |
|
4619 md->offset_max = (2*ocount)/3; |
|
4620 md->offset_overflow = FALSE; |
|
4621 md->capture_last = -1; |
|
4622 |
|
4623 /* Compute the minimum number of offsets that we need to reset each time. Doing |
|
4624 this makes a huge difference to execution time when there aren't many brackets |
|
4625 in the pattern. */ |
|
4626 |
|
4627 resetcount = 2 + re->top_bracket * 2; |
|
4628 if (resetcount > offsetcount) resetcount = ocount; |
|
4629 |
|
4630 /* Reset the working variable associated with each extraction. These should |
|
4631 never be used unless previously set, but they get saved and restored, and so we |
|
4632 initialize them to avoid reading uninitialized locations. */ |
|
4633 |
|
4634 if (md->offset_vector != NULL) |
|
4635 { |
|
4636 register int *iptr = md->offset_vector + ocount; |
|
4637 register int *iend = iptr - resetcount/2 + 1; |
|
4638 while (--iptr >= iend) *iptr = -1; |
|
4639 } |
|
4640 |
|
4641 /* Set up the first character to match, if available. The first_byte value is |
|
4642 never set for an anchored regular expression, but the anchoring may be forced |
|
4643 at run time, so we have to test for anchoring. The first char may be unset for |
|
4644 an unanchored pattern, of course. If there's no first char and the pattern was |
|
4645 studied, there may be a bitmap of possible first characters. */ |
|
4646 |
|
4647 if (!anchored) |
|
4648 { |
|
4649 if ((re->flags & PCRE_FIRSTSET) != 0) |
|
4650 { |
|
4651 first_byte = re->first_byte & 255; |
|
4652 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE) |
|
4653 first_byte = md->lcc[first_byte]; |
|
4654 } |
|
4655 else |
|
4656 if (!startline && study != NULL && |
|
4657 (study->options & PCRE_STUDY_MAPPED) != 0) |
|
4658 start_bits = study->start_bits; |
|
4659 } |
|
4660 |
|
4661 /* For anchored or unanchored matches, there may be a "last known required |
|
4662 character" set. */ |
|
4663 |
|
4664 if ((re->flags & PCRE_REQCHSET) != 0) |
|
4665 { |
|
4666 req_byte = re->req_byte & 255; |
|
4667 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; |
|
4668 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */ |
|
4669 } |
|
4670 |
|
4671 |
|
4672 /* ==========================================================================*/ |
|
4673 |
|
4674 /* Loop for handling unanchored repeated matching attempts; for anchored regexes |
|
4675 the loop runs just once. */ |
|
4676 |
|
4677 for(;;) |
|
4678 { |
|
4679 USPTR save_end_subject = end_subject; |
|
4680 USPTR new_start_match; |
|
4681 |
|
4682 /* Reset the maximum number of extractions we might see. */ |
|
4683 |
|
4684 if (md->offset_vector != NULL) |
|
4685 { |
|
4686 register int *iptr = md->offset_vector; |
|
4687 register int *iend = iptr + resetcount; |
|
4688 while (iptr < iend) *iptr++ = -1; |
|
4689 } |
|
4690 |
|
4691 /* Advance to a unique first char if possible. If firstline is TRUE, the |
|
4692 start of the match is constrained to the first line of a multiline string. |
|
4693 That is, the match must be before or at the first newline. Implement this by |
|
4694 temporarily adjusting end_subject so that we stop scanning at a newline. If |
|
4695 the match fails at the newline, later code breaks this loop. */ |
|
4696 |
|
4697 if (firstline) |
|
4698 { |
|
4699 USPTR t = start_match; |
|
4700 #ifdef SUPPORT_UTF8 |
|
4701 if (utf8) |
|
4702 { |
|
4703 while (t < md->end_subject && !IS_NEWLINE(t)) |
|
4704 { |
|
4705 t++; |
|
4706 while (t < end_subject && (*t & 0xc0) == 0x80) t++; |
|
4707 } |
|
4708 } |
|
4709 else |
|
4710 #endif |
|
4711 while (t < md->end_subject && !IS_NEWLINE(t)) t++; |
|
4712 end_subject = t; |
|
4713 } |
|
4714 |
|
4715 /* Now advance to a unique first byte if there is one. */ |
|
4716 |
|
4717 if (first_byte >= 0) |
|
4718 { |
|
4719 if (first_byte_caseless) |
|
4720 while (start_match < end_subject && md->lcc[*start_match] != first_byte) |
|
4721 start_match++; |
|
4722 else |
|
4723 while (start_match < end_subject && *start_match != first_byte) |
|
4724 start_match++; |
|
4725 } |
|
4726 |
|
4727 /* Or to just after a linebreak for a multiline match */ |
|
4728 |
|
4729 else if (startline) |
|
4730 { |
|
4731 if (start_match > md->start_subject + start_offset) |
|
4732 { |
|
4733 #ifdef SUPPORT_UTF8 |
|
4734 if (utf8) |
|
4735 { |
|
4736 while (start_match < end_subject && !WAS_NEWLINE(start_match)) |
|
4737 { |
|
4738 start_match++; |
|
4739 while(start_match < end_subject && (*start_match & 0xc0) == 0x80) |
|
4740 start_match++; |
|
4741 } |
|
4742 } |
|
4743 else |
|
4744 #endif |
|
4745 while (start_match < end_subject && !WAS_NEWLINE(start_match)) |
|
4746 start_match++; |
|
4747 |
|
4748 /* If we have just passed a CR and the newline option is ANY or ANYCRLF, |
|
4749 and we are now at a LF, advance the match position by one more character. |
|
4750 */ |
|
4751 |
|
4752 if (start_match[-1] == '\r' && |
|
4753 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && |
|
4754 start_match < end_subject && |
|
4755 *start_match == '\n') |
|
4756 start_match++; |
|
4757 } |
|
4758 } |
|
4759 |
|
4760 /* Or to a non-unique first byte after study */ |
|
4761 |
|
4762 else if (start_bits != NULL) |
|
4763 { |
|
4764 while (start_match < end_subject) |
|
4765 { |
|
4766 register unsigned int c = *start_match; |
|
4767 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; |
|
4768 else break; |
|
4769 } |
|
4770 } |
|
4771 |
|
4772 /* Restore fudged end_subject */ |
|
4773 |
|
4774 end_subject = save_end_subject; |
|
4775 |
|
4776 #ifdef DEBUG /* Sigh. Some compilers never learn. */ |
|
4777 printf(">>>> Match against: "); |
|
4778 pchars(start_match, end_subject - start_match, TRUE, md); |
|
4779 printf("\n"); |
|
4780 #endif |
|
4781 |
|
4782 /* If req_byte is set, we know that that character must appear in the subject |
|
4783 for the match to succeed. If the first character is set, req_byte must be |
|
4784 later in the subject; otherwise the test starts at the match point. This |
|
4785 optimization can save a huge amount of backtracking in patterns with nested |
|
4786 unlimited repeats that aren't going to match. Writing separate code for |
|
4787 cased/caseless versions makes it go faster, as does using an autoincrement |
|
4788 and backing off on a match. |
|
4789 |
|
4790 HOWEVER: when the subject string is very, very long, searching to its end can |
|
4791 take a long time, and give bad performance on quite ordinary patterns. This |
|
4792 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte |
|
4793 string... so we don't do this when the string is sufficiently long. |
|
4794 |
|
4795 ALSO: this processing is disabled when partial matching is requested. |
|
4796 */ |
|
4797 |
|
4798 if (req_byte >= 0 && |
|
4799 end_subject - start_match < REQ_BYTE_MAX && |
|
4800 !md->partial) |
|
4801 { |
|
4802 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0); |
|
4803 |
|
4804 /* We don't need to repeat the search if we haven't yet reached the |
|
4805 place we found it at last time. */ |
|
4806 |
|
4807 if (p > req_byte_ptr) |
|
4808 { |
|
4809 if (req_byte_caseless) |
|
4810 { |
|
4811 while (p < end_subject) |
|
4812 { |
|
4813 register int pp = *p++; |
|
4814 if (pp == req_byte || pp == req_byte2) { p--; break; } |
|
4815 } |
|
4816 } |
|
4817 else |
|
4818 { |
|
4819 while (p < end_subject) |
|
4820 { |
|
4821 if (*p++ == req_byte) { p--; break; } |
|
4822 } |
|
4823 } |
|
4824 |
|
4825 /* If we can't find the required character, break the matching loop, |
|
4826 forcing a match failure. */ |
|
4827 |
|
4828 if (p >= end_subject) |
|
4829 { |
|
4830 rc = MATCH_NOMATCH; |
|
4831 break; |
|
4832 } |
|
4833 |
|
4834 /* If we have found the required character, save the point where we |
|
4835 found it, so that we don't search again next time round the loop if |
|
4836 the start hasn't passed this character yet. */ |
|
4837 |
|
4838 req_byte_ptr = p; |
|
4839 } |
|
4840 } |
|
4841 |
|
4842 /* OK, we can now run the match. */ |
|
4843 |
|
4844 md->start_match_ptr = start_match; |
|
4845 md->match_call_count = 0; |
|
4846 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0); |
|
4847 |
|
4848 switch(rc) |
|
4849 { |
|
4850 /* NOMATCH and PRUNE advance by one character. THEN at this level acts |
|
4851 exactly like PRUNE. */ |
|
4852 |
|
4853 case MATCH_NOMATCH: |
|
4854 case MATCH_PRUNE: |
|
4855 case MATCH_THEN: |
|
4856 new_start_match = start_match + 1; |
|
4857 #ifdef SUPPORT_UTF8 |
|
4858 if (utf8) |
|
4859 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80) |
|
4860 new_start_match++; |
|
4861 #endif |
|
4862 break; |
|
4863 |
|
4864 /* SKIP passes back the next starting point explicitly. */ |
|
4865 |
|
4866 case MATCH_SKIP: |
|
4867 new_start_match = md->start_match_ptr; |
|
4868 break; |
|
4869 |
|
4870 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */ |
|
4871 |
|
4872 case MATCH_COMMIT: |
|
4873 rc = MATCH_NOMATCH; |
|
4874 goto ENDLOOP; |
|
4875 |
|
4876 /* Any other return is some kind of error. */ |
|
4877 |
|
4878 default: |
|
4879 goto ENDLOOP; |
|
4880 } |
|
4881 |
|
4882 /* Control reaches here for the various types of "no match at this point" |
|
4883 result. Reset the code to MATCH_NOMATCH for subsequent checking. */ |
|
4884 |
|
4885 rc = MATCH_NOMATCH; |
|
4886 |
|
4887 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first |
|
4888 newline in the subject (though it may continue over the newline). Therefore, |
|
4889 if we have just failed to match, starting at a newline, do not continue. */ |
|
4890 |
|
4891 if (firstline && IS_NEWLINE(start_match)) break; |
|
4892 |
|
4893 /* Advance to new matching position */ |
|
4894 |
|
4895 start_match = new_start_match; |
|
4896 |
|
4897 /* Break the loop if the pattern is anchored or if we have passed the end of |
|
4898 the subject. */ |
|
4899 |
|
4900 if (anchored || start_match > end_subject) break; |
|
4901 |
|
4902 /* If we have just passed a CR and we are now at a LF, and the pattern does |
|
4903 not contain any explicit matches for \r or \n, and the newline option is CRLF |
|
4904 or ANY or ANYCRLF, advance the match position by one more character. */ |
|
4905 |
|
4906 if (start_match[-1] == '\r' && |
|
4907 start_match < end_subject && |
|
4908 *start_match == '\n' && |
|
4909 (re->flags & PCRE_HASCRORLF) == 0 && |
|
4910 (md->nltype == NLTYPE_ANY || |
|
4911 md->nltype == NLTYPE_ANYCRLF || |
|
4912 md->nllen == 2)) |
|
4913 start_match++; |
|
4914 |
|
4915 } /* End of for(;;) "bumpalong" loop */ |
|
4916 |
|
4917 /* ==========================================================================*/ |
|
4918 |
|
4919 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping |
|
4920 conditions is true: |
|
4921 |
|
4922 (1) The pattern is anchored or the match was failed by (*COMMIT); |
|
4923 |
|
4924 (2) We are past the end of the subject; |
|
4925 |
|
4926 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because |
|
4927 this option requests that a match occur at or before the first newline in |
|
4928 the subject. |
|
4929 |
|
4930 When we have a match and the offset vector is big enough to deal with any |
|
4931 backreferences, captured substring offsets will already be set up. In the case |
|
4932 where we had to get some local store to hold offsets for backreference |
|
4933 processing, copy those that we can. In this case there need not be overflow if |
|
4934 certain parts of the pattern were not used, even though there are more |
|
4935 capturing parentheses than vector slots. */ |
|
4936 |
|
4937 ENDLOOP: |
|
4938 |
|
4939 if (rc == MATCH_MATCH) |
|
4940 { |
|
4941 if (using_temporary_offsets) |
|
4942 { |
|
4943 if (offsetcount >= 4) |
|
4944 { |
|
4945 memcpy(offsets + 2, md->offset_vector + 2, |
|
4946 (offsetcount - 2) * sizeof(int)); |
|
4947 DPRINTF(("Copied offsets from temporary memory\n")); |
|
4948 } |
|
4949 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE; |
|
4950 DPRINTF(("Freeing temporary memory\n")); |
|
4951 (pcre_free)(md->offset_vector); |
|
4952 } |
|
4953 |
|
4954 /* Set the return code to the number of captured strings, or 0 if there are |
|
4955 too many to fit into the vector. */ |
|
4956 |
|
4957 rc = md->offset_overflow? 0 : md->end_offset_top/2; |
|
4958 |
|
4959 /* If there is space, set up the whole thing as substring 0. The value of |
|
4960 md->start_match_ptr might be modified if \K was encountered on the successful |
|
4961 matching path. */ |
|
4962 |
|
4963 if (offsetcount < 2) rc = 0; else |
|
4964 { |
|
4965 offsets[0] = md->start_match_ptr - md->start_subject; |
|
4966 offsets[1] = md->end_match_ptr - md->start_subject; |
|
4967 } |
|
4968 |
|
4969 DPRINTF((">>>> returning %d\n", rc)); |
|
4970 return rc; |
|
4971 } |
|
4972 |
|
4973 /* Control gets here if there has been an error, or if the overall match |
|
4974 attempt has failed at all permitted starting positions. */ |
|
4975 |
|
4976 if (using_temporary_offsets) |
|
4977 { |
|
4978 DPRINTF(("Freeing temporary memory\n")); |
|
4979 (pcre_free)(md->offset_vector); |
|
4980 } |
|
4981 |
|
4982 if (rc != MATCH_NOMATCH) |
|
4983 { |
|
4984 DPRINTF((">>>> error: returning %d\n", rc)); |
|
4985 return rc; |
|
4986 } |
|
4987 else if (md->partial && md->hitend) |
|
4988 { |
|
4989 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); |
|
4990 return PCRE_ERROR_PARTIAL; |
|
4991 } |
|
4992 else |
|
4993 { |
|
4994 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); |
|
4995 return PCRE_ERROR_NOMATCH; |
|
4996 } |
|
4997 } |
|
4998 |
|
4999 /* End of pcre_exec.c */ |