|
1 /************************************************* |
|
2 * Perl-Compatible Regular Expressions * |
|
3 *************************************************/ |
|
4 |
|
5 /* PCRE is a library of functions to support regular expressions whose syntax |
|
6 and semantics are as close as possible to those of the Perl 5 language. |
|
7 |
|
8 Written by Philip Hazel |
|
9 Copyright (c) 1997-2008 University of Cambridge |
|
10 |
|
11 ----------------------------------------------------------------------------- |
|
12 Redistribution and use in source and binary forms, with or without |
|
13 modification, are permitted provided that the following conditions are met: |
|
14 |
|
15 * Redistributions of source code must retain the above copyright notice, |
|
16 this list of conditions and the following disclaimer. |
|
17 |
|
18 * Redistributions in binary form must reproduce the above copyright |
|
19 notice, this list of conditions and the following disclaimer in the |
|
20 documentation and/or other materials provided with the distribution. |
|
21 |
|
22 * Neither the name of the University of Cambridge nor the names of its |
|
23 contributors may be used to endorse or promote products derived from |
|
24 this software without specific prior written permission. |
|
25 |
|
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
|
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|
36 POSSIBILITY OF SUCH DAMAGE. |
|
37 ----------------------------------------------------------------------------- |
|
38 */ |
|
39 |
|
40 |
|
/* This module contains the external function pcre_dfa_exec(), which is an
alternative matching function that uses a sort of DFA algorithm (not a true
FSM). This is NOT Perl-compatible, but it has advantages in certain
applications. */
|
45 |
|
46 |
|
47 #ifdef HAVE_CONFIG_H |
|
48 #include "config.h" |
|
49 #endif |
|
50 |
|
51 #define NLBLOCK md /* Block containing newline information */ |
|
52 #define PSSTART start_subject /* Field containing processed string start */ |
|
53 #define PSEND end_subject /* Field containing processed string end */ |
|
54 |
|
55 #include "pcre_internal.h" |
|
56 |
|
57 |
|
58 /* For use to indent debugging output */ |
|
59 |
|
60 #define SP " " |
|
61 |
|
62 |
|
63 |
|
64 /************************************************* |
|
65 * Code parameters and static tables * |
|
66 *************************************************/ |
|
67 |
|
/* Offsets that are added to OP_TYPESTAR and its sibling opcodes to turn them
into different opcode values under special conditions. The bases are spaced 20
apart, which should be enough room for each group. The resulting values are
never stored, so they do not have to be less than 256; they are deliberately
placed well clear of the normal opcodes. */

enum {
  OP_PROP_EXTRA   = 300,
  OP_EXTUNI_EXTRA = 320,
  OP_ANYNL_EXTRA  = 340,
  OP_HSPACE_EXTRA = 360,
  OP_VSPACE_EXTRA = 380
};
|
78 |
|
79 |
|
80 /* This table identifies those opcodes that are followed immediately by a |
|
81 character that is to be tested in some way. This makes is possible to |
|
82 centralize the loading of these characters. In the case of Type * etc, the |
|
83 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a |
|
84 small value. ***NOTE*** If the start of this table is modified, the two tables |
|
85 that follow must also be modified. */ |
|
86 |
|
87 static const uschar coptable[] = { |
|
88 0, /* End */ |
|
89 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */ |
|
90 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */ |
|
91 0, 0, 0, /* Any, AllAny, Anybyte */ |
|
92 0, 0, 0, /* NOTPROP, PROP, EXTUNI */ |
|
93 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */ |
|
94 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */ |
|
95 1, /* Char */ |
|
96 1, /* Charnc */ |
|
97 1, /* not */ |
|
98 /* Positive single-char repeats */ |
|
99 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ |
|
100 3, 3, 3, /* upto, minupto, exact */ |
|
101 1, 1, 1, 3, /* *+, ++, ?+, upto+ */ |
|
102 /* Negative single-char repeats - only for chars < 256 */ |
|
103 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */ |
|
104 3, 3, 3, /* NOT upto, minupto, exact */ |
|
105 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */ |
|
106 /* Positive type repeats */ |
|
107 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */ |
|
108 3, 3, 3, /* Type upto, minupto, exact */ |
|
109 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */ |
|
110 /* Character class & ref repeats */ |
|
111 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */ |
|
112 0, 0, /* CRRANGE, CRMINRANGE */ |
|
113 0, /* CLASS */ |
|
114 0, /* NCLASS */ |
|
115 0, /* XCLASS - variable length */ |
|
116 0, /* REF */ |
|
117 0, /* RECURSE */ |
|
118 0, /* CALLOUT */ |
|
119 0, /* Alt */ |
|
120 0, /* Ket */ |
|
121 0, /* KetRmax */ |
|
122 0, /* KetRmin */ |
|
123 0, /* Assert */ |
|
124 0, /* Assert not */ |
|
125 0, /* Assert behind */ |
|
126 0, /* Assert behind not */ |
|
127 0, /* Reverse */ |
|
128 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */ |
|
129 0, 0, 0, /* SBRA, SCBRA, SCOND */ |
|
130 0, /* CREF */ |
|
131 0, /* RREF */ |
|
132 0, /* DEF */ |
|
133 0, 0, /* BRAZERO, BRAMINZERO */ |
|
134 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */ |
|
135 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */ |
|
136 }; |
|
137 |
|
138 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, |
|
139 and \w */ |
|
140 |
|
141 static const uschar toptable1[] = { |
|
142 0, 0, 0, 0, 0, 0, |
|
143 ctype_digit, ctype_digit, |
|
144 ctype_space, ctype_space, |
|
145 ctype_word, ctype_word, |
|
146 0, 0 /* OP_ANY, OP_ALLANY */ |
|
147 }; |
|
148 |
|
149 static const uschar toptable2[] = { |
|
150 0, 0, 0, 0, 0, 0, |
|
151 ctype_digit, 0, |
|
152 ctype_space, 0, |
|
153 ctype_word, 0, |
|
154 1, 1 /* OP_ANY, OP_ALLANY */ |
|
155 }; |
|
156 |
|
157 |
|
/* One entry in a state list: in effect the current data for one active path
through the match tree. Every member must be an int, because these structures
are placed directly in the working vector that is passed in, and that vector is
a vector of ints. */

struct stateblock {
  int offset;                     /* Offset to opcode; a negative value marks */
                                  /*   a state that is being held off */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Extra data, used by some states only */
};

typedef struct stateblock stateblock;

/* Number of ints taken up by one stateblock. */

#define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
|
171 |
|
172 |
|
173 #ifdef DEBUG |
|
/*************************************************
*        Print character string                  *
*************************************************/

/* Debugging aid that writes a counted byte string to a stream. Bytes for which
isprint() is true are written as they are; every other byte is written as a
backslash, an "x", and two lower-case hexadecimal digits.

Arguments:
  p          points to the first byte of the string
  length     number of bytes to write; nothing is written if it is not positive
  f          the stream to write to

Returns:     nothing
*/

static void
pchars(unsigned char *p, int length, FILE *f)
{
unsigned char *stop;

if (length <= 0) return;

for (stop = p + length; p < stop; p++)
  {
  int ch = *p;
  if (!isprint(ch))
    fprintf(f, "\\x%02x", ch);
  else
    fprintf(f, "%c", ch);
  }
}
|
200 #endif |
|
201 |
|
202 |
|
203 |
|
204 /************************************************* |
|
205 * Execute a Regular Expression - DFA engine * |
|
206 *************************************************/ |
|
207 |
|
208 /* This internal function applies a compiled pattern to a subject string, |
|
209 starting at a given point, using a DFA engine. This function is called from the |
|
210 external one, possibly multiple times if the pattern is not anchored. The |
|
211 function calls itself recursively for some kinds of subpattern. |
|
212 |
|
213 Arguments: |
|
214 md the match_data block with fixed information |
|
215 this_start_code the opening bracket of this subexpression's code |
|
216 current_subject where we currently are in the subject string |
|
217 start_offset start offset in the subject string |
|
218 offsets vector to contain the matching string offsets |
|
219 offsetcount size of same |
|
220 workspace vector of workspace |
|
221 wscount size of same |
|
222 ims the current ims flags |
|
223 rlevel function call recursion level |
|
224 recursing regex recursive call level |
|
225 |
|
226 Returns: > 0 => number of match offset pairs placed in offsets |
|
227 = 0 => offsets overflowed; longest matches are present |
|
228 -1 => failed to match |
|
229 < -1 => some kind of unexpected problem |
|
230 |
|
231 The following macros are used for adding states to the two state vectors (one |
|
232 for the current character, one for the following character). */ |
|
233 |
|
/* Append a state with code offset x and repeat count y to the active list,
giving it the current value of ims. The data field of the new state is not set.
If the list already holds wscount states, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. The macro relies on active_count, wscount,
next_active_state, ims, and rlevel being in scope where it is used. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
|
244 |
|
/* Append a state with code offset x, repeat count y, and extra data z to the
active list, giving it the current value of ims. If the list already holds
wscount states, the enclosing function returns PCRE_ERROR_DFA_WSSIZE. The macro
relies on active_count, wscount, next_active_state, ims, and rlevel being in
scope where it is used. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
|
256 |
|
/* Append a state with code offset x and repeat count y to the new list (the
states for the following character), giving it the current value of ims. The
data field of the new state is not set. If the list already holds wscount
states, the enclosing function returns PCRE_ERROR_DFA_WSSIZE. The macro relies
on new_count, wscount, next_new_state, ims, and rlevel being in scope where it
is used. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
|
267 |
|
/* Append a state with code offset x, repeat count y, and extra data z to the
new list (the states for the following character), giving it the current value
of ims. If the list already holds wscount states, the enclosing function
returns PCRE_ERROR_DFA_WSSIZE. The macro relies on new_count, wscount,
next_new_state, ims, and rlevel being in scope where it is used. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
|
279 |
|
280 /* And now, here is the code */ |
|
281 |
|
282 static int |
|
283 internal_dfa_exec( |
|
284 dfa_match_data *md, |
|
285 const uschar *this_start_code, |
|
286 const uschar *current_subject, |
|
287 int start_offset, |
|
288 int *offsets, |
|
289 int offsetcount, |
|
290 int *workspace, |
|
291 int wscount, |
|
292 int ims, |
|
293 int rlevel, |
|
294 int recursing) |
|
295 { |
|
296 stateblock *active_states, *new_states, *temp_states; |
|
297 stateblock *next_active_state, *next_new_state; |
|
298 |
|
299 const uschar *ctypes, *lcc, *fcc; |
|
300 const uschar *ptr; |
|
301 const uschar *end_code, *first_op; |
|
302 |
|
303 int active_count, new_count, match_count; |
|
304 |
|
305 /* Some fields in the md block are frequently referenced, so we load them into |
|
306 independent variables in the hope that this will perform better. */ |
|
307 |
|
308 const uschar *start_subject = md->start_subject; |
|
309 const uschar *end_subject = md->end_subject; |
|
310 const uschar *start_code = md->start_code; |
|
311 |
|
312 #ifdef SUPPORT_UTF8 |
|
313 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0; |
|
314 #else |
|
315 BOOL utf8 = FALSE; |
|
316 #endif |
|
317 |
|
318 rlevel++; |
|
319 offsetcount &= (-2); |
|
320 |
|
321 wscount -= 2; |
|
322 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) / |
|
323 (2 * INTS_PER_STATEBLOCK); |
|
324 |
|
325 DPRINTF(("\n%.*s---------------------\n" |
|
326 "%.*sCall to internal_dfa_exec f=%d r=%d\n", |
|
327 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing)); |
|
328 |
|
329 ctypes = md->tables + ctypes_offset; |
|
330 lcc = md->tables + lcc_offset; |
|
331 fcc = md->tables + fcc_offset; |
|
332 |
|
333 match_count = PCRE_ERROR_NOMATCH; /* A negative number */ |
|
334 |
|
335 active_states = (stateblock *)(workspace + 2); |
|
336 next_new_state = new_states = active_states + wscount; |
|
337 new_count = 0; |
|
338 |
|
339 first_op = this_start_code + 1 + LINK_SIZE + |
|
340 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0); |
|
341 |
|
/* The first thing in any (sub) pattern is a bracket of some sort. Push all
the alternative states onto the list, and find out where the end is. This
makes it possible to use this function recursively, when we want to stop at a
matching internal ket rather than at the end.
|
346 |
|
347 If the first opcode in the first alternative is OP_REVERSE, we are dealing with |
|
348 a backward assertion. In that case, we have to find out the maximum amount to |
|
349 move back, and set up each alternative appropriately. */ |
|
350 |
|
351 if (*first_op == OP_REVERSE) |
|
352 { |
|
353 int max_back = 0; |
|
354 int gone_back; |
|
355 |
|
356 end_code = this_start_code; |
|
357 do |
|
358 { |
|
359 int back = GET(end_code, 2+LINK_SIZE); |
|
360 if (back > max_back) max_back = back; |
|
361 end_code += GET(end_code, 1); |
|
362 } |
|
363 while (*end_code == OP_ALT); |
|
364 |
|
365 /* If we can't go back the amount required for the longest lookbehind |
|
366 pattern, go back as far as we can; some alternatives may still be viable. */ |
|
367 |
|
368 #ifdef SUPPORT_UTF8 |
|
369 /* In character mode we have to step back character by character */ |
|
370 |
|
371 if (utf8) |
|
372 { |
|
373 for (gone_back = 0; gone_back < max_back; gone_back++) |
|
374 { |
|
375 if (current_subject <= start_subject) break; |
|
376 current_subject--; |
|
377 while (current_subject > start_subject && |
|
378 (*current_subject & 0xc0) == 0x80) |
|
379 current_subject--; |
|
380 } |
|
381 } |
|
382 else |
|
383 #endif |
|
384 |
|
385 /* In byte-mode we can do this quickly. */ |
|
386 |
|
387 { |
|
388 gone_back = (current_subject - max_back < start_subject)? |
|
389 current_subject - start_subject : max_back; |
|
390 current_subject -= gone_back; |
|
391 } |
|
392 |
|
393 /* Now we can process the individual branches. */ |
|
394 |
|
395 end_code = this_start_code; |
|
396 do |
|
397 { |
|
398 int back = GET(end_code, 2+LINK_SIZE); |
|
399 if (back <= gone_back) |
|
400 { |
|
401 int bstate = end_code - start_code + 2 + 2*LINK_SIZE; |
|
402 ADD_NEW_DATA(-bstate, 0, gone_back - back); |
|
403 } |
|
404 end_code += GET(end_code, 1); |
|
405 } |
|
406 while (*end_code == OP_ALT); |
|
407 } |
|
408 |
|
409 /* This is the code for a "normal" subpattern (not a backward assertion). The |
|
410 start of a whole pattern is always one of these. If we are at the top level, |
|
411 we may be asked to restart matching from the same point that we reached for a |
|
412 previous partial match. We still have to scan through the top-level branches to |
|
413 find the end state. */ |
|
414 |
|
415 else |
|
416 { |
|
417 end_code = this_start_code; |
|
418 |
|
419 /* Restarting */ |
|
420 |
|
421 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0) |
|
422 { |
|
423 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT); |
|
424 new_count = workspace[1]; |
|
425 if (!workspace[0]) |
|
426 memcpy(new_states, active_states, new_count * sizeof(stateblock)); |
|
427 } |
|
428 |
|
429 /* Not restarting */ |
|
430 |
|
431 else |
|
432 { |
|
433 int length = 1 + LINK_SIZE + |
|
434 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0); |
|
435 do |
|
436 { |
|
437 ADD_NEW(end_code - start_code + length, 0); |
|
438 end_code += GET(end_code, 1); |
|
439 length = 1 + LINK_SIZE; |
|
440 } |
|
441 while (*end_code == OP_ALT); |
|
442 } |
|
443 } |
|
444 |
|
445 workspace[0] = 0; /* Bit indicating which vector is current */ |
|
446 |
|
447 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code)); |
|
448 |
|
449 /* Loop for scanning the subject */ |
|
450 |
|
451 ptr = current_subject; |
|
452 for (;;) |
|
453 { |
|
454 int i, j; |
|
455 int clen, dlen; |
|
456 unsigned int c, d; |
|
457 |
|
458 /* Make the new state list into the active state list and empty the |
|
459 new state list. */ |
|
460 |
|
461 temp_states = active_states; |
|
462 active_states = new_states; |
|
463 new_states = temp_states; |
|
464 active_count = new_count; |
|
465 new_count = 0; |
|
466 |
|
467 workspace[0] ^= 1; /* Remember for the restarting feature */ |
|
468 workspace[1] = active_count; |
|
469 |
|
470 #ifdef DEBUG |
|
471 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP); |
|
472 pchars((uschar *)ptr, strlen((char *)ptr), stdout); |
|
473 printf("\"\n"); |
|
474 |
|
475 printf("%.*sActive states: ", rlevel*2-2, SP); |
|
476 for (i = 0; i < active_count; i++) |
|
477 printf("%d/%d ", active_states[i].offset, active_states[i].count); |
|
478 printf("\n"); |
|
479 #endif |
|
480 |
|
481 /* Set the pointers for adding new states */ |
|
482 |
|
483 next_active_state = active_states + active_count; |
|
484 next_new_state = new_states; |
|
485 |
|
486 /* Load the current character from the subject outside the loop, as many |
|
487 different states may want to look at it, and we assume that at least one |
|
488 will. */ |
|
489 |
|
490 if (ptr < end_subject) |
|
491 { |
|
492 clen = 1; /* Number of bytes in the character */ |
|
493 #ifdef SUPPORT_UTF8 |
|
494 if (utf8) { GETCHARLEN(c, ptr, clen); } else |
|
495 #endif /* SUPPORT_UTF8 */ |
|
496 c = *ptr; |
|
497 } |
|
498 else |
|
499 { |
|
500 clen = 0; /* This indicates the end of the subject */ |
|
501 c = NOTACHAR; /* This value should never actually be used */ |
|
502 } |
|
503 |
|
504 /* Scan up the active states and act on each one. The result of an action |
|
505 may be to add more states to the currently active list (e.g. on hitting a |
|
506 parenthesis) or it may be to put states on the new list, for considering |
|
507 when we move the character pointer on. */ |
|
508 |
|
509 for (i = 0; i < active_count; i++) |
|
510 { |
|
511 stateblock *current_state = active_states + i; |
|
512 const uschar *code; |
|
513 int state_offset = current_state->offset; |
|
514 int count, codevalue; |
|
515 |
|
516 #ifdef DEBUG |
|
517 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); |
|
518 if (clen == 0) printf("EOL\n"); |
|
519 else if (c > 32 && c < 127) printf("'%c'\n", c); |
|
520 else printf("0x%02x\n", c); |
|
521 #endif |
|
522 |
|
/* This variable is referred to implicitly in the ADD_xxx macros. */
|
524 |
|
525 ims = current_state->ims; |
|
526 |
|
527 /* A negative offset is a special case meaning "hold off going to this |
|
528 (negated) state until the number of characters in the data field have |
|
529 been skipped". */ |
|
530 |
|
531 if (state_offset < 0) |
|
532 { |
|
533 if (current_state->data > 0) |
|
534 { |
|
535 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP)); |
|
536 ADD_NEW_DATA(state_offset, current_state->count, |
|
537 current_state->data - 1); |
|
538 continue; |
|
539 } |
|
540 else |
|
541 { |
|
542 current_state->offset = state_offset = -state_offset; |
|
543 } |
|
544 } |
|
545 |
|
546 /* Check for a duplicate state with the same count, and skip if found. */ |
|
547 |
|
548 for (j = 0; j < i; j++) |
|
549 { |
|
550 if (active_states[j].offset == state_offset && |
|
551 active_states[j].count == current_state->count) |
|
552 { |
|
553 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP)); |
|
554 goto NEXT_ACTIVE_STATE; |
|
555 } |
|
556 } |
|
557 |
|
558 /* The state offset is the offset to the opcode */ |
|
559 |
|
560 code = start_code + state_offset; |
|
561 codevalue = *code; |
|
562 |
|
563 /* If this opcode is followed by an inline character, load it. It is |
|
564 tempting to test for the presence of a subject character here, but that |
|
565 is wrong, because sometimes zero repetitions of the subject are |
|
566 permitted. |
|
567 |
|
568 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an |
|
569 argument that is not a data character - but is always one byte long. We |
|
570 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in |
|
571 this case. To keep the other cases fast, convert these ones to new opcodes. |
|
572 */ |
|
573 |
|
574 if (coptable[codevalue] > 0) |
|
575 { |
|
576 dlen = 1; |
|
577 #ifdef SUPPORT_UTF8 |
|
578 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else |
|
579 #endif /* SUPPORT_UTF8 */ |
|
580 d = code[coptable[codevalue]]; |
|
581 if (codevalue >= OP_TYPESTAR) |
|
582 { |
|
583 switch(d) |
|
584 { |
|
585 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM; |
|
586 case OP_NOTPROP: |
|
587 case OP_PROP: codevalue += OP_PROP_EXTRA; break; |
|
588 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break; |
|
589 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break; |
|
590 case OP_NOT_HSPACE: |
|
591 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break; |
|
592 case OP_NOT_VSPACE: |
|
593 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break; |
|
594 default: break; |
|
595 } |
|
596 } |
|
597 } |
|
598 else |
|
599 { |
|
600 dlen = 0; /* Not strictly necessary, but compilers moan */ |
|
601 d = NOTACHAR; /* if these variables are not set. */ |
|
602 } |
|
603 |
|
604 |
|
605 /* Now process the individual opcodes */ |
|
606 |
|
607 switch (codevalue) |
|
608 { |
|
609 |
|
610 /* ========================================================================== */ |
|
611 /* Reached a closing bracket. If not at the end of the pattern, carry |
|
612 on with the next opcode. Otherwise, unless we have an empty string and |
|
613 PCRE_NOTEMPTY is set, save the match data, shifting up all previous |
|
614 matches so we always have the longest first. */ |
|
615 |
|
616 case OP_KET: |
|
617 case OP_KETRMIN: |
|
618 case OP_KETRMAX: |
|
619 if (code != end_code) |
|
620 { |
|
621 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0); |
|
622 if (codevalue != OP_KET) |
|
623 { |
|
624 ADD_ACTIVE(state_offset - GET(code, 1), 0); |
|
625 } |
|
626 } |
|
627 else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0) |
|
628 { |
|
629 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0; |
|
630 else if (match_count > 0 && ++match_count * 2 >= offsetcount) |
|
631 match_count = 0; |
|
632 count = ((match_count == 0)? offsetcount : match_count * 2) - 2; |
|
633 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int)); |
|
634 if (offsetcount >= 2) |
|
635 { |
|
636 offsets[0] = current_subject - start_subject; |
|
637 offsets[1] = ptr - start_subject; |
|
638 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP, |
|
639 offsets[1] - offsets[0], current_subject)); |
|
640 } |
|
641 if ((md->moptions & PCRE_DFA_SHORTEST) != 0) |
|
642 { |
|
643 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" |
|
644 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, |
|
645 match_count, rlevel*2-2, SP)); |
|
646 return match_count; |
|
647 } |
|
648 } |
|
649 break; |
|
650 |
|
651 /* ========================================================================== */ |
|
652 /* These opcodes add to the current list of states without looking |
|
653 at the current character. */ |
|
654 |
|
655 /*-----------------------------------------------------------------*/ |
|
656 case OP_ALT: |
|
657 do { code += GET(code, 1); } while (*code == OP_ALT); |
|
658 ADD_ACTIVE(code - start_code, 0); |
|
659 break; |
|
660 |
|
661 /*-----------------------------------------------------------------*/ |
|
662 case OP_BRA: |
|
663 case OP_SBRA: |
|
664 do |
|
665 { |
|
666 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); |
|
667 code += GET(code, 1); |
|
668 } |
|
669 while (*code == OP_ALT); |
|
670 break; |
|
671 |
|
672 /*-----------------------------------------------------------------*/ |
|
673 case OP_CBRA: |
|
674 case OP_SCBRA: |
|
675 ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0); |
|
676 code += GET(code, 1); |
|
677 while (*code == OP_ALT) |
|
678 { |
|
679 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); |
|
680 code += GET(code, 1); |
|
681 } |
|
682 break; |
|
683 |
|
684 /*-----------------------------------------------------------------*/ |
|
685 case OP_BRAZERO: |
|
686 case OP_BRAMINZERO: |
|
687 ADD_ACTIVE(state_offset + 1, 0); |
|
688 code += 1 + GET(code, 2); |
|
689 while (*code == OP_ALT) code += GET(code, 1); |
|
690 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); |
|
691 break; |
|
692 |
|
693 /*-----------------------------------------------------------------*/ |
|
694 case OP_SKIPZERO: |
|
695 code += 1 + GET(code, 2); |
|
696 while (*code == OP_ALT) code += GET(code, 1); |
|
697 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); |
|
698 break; |
|
699 |
|
700 /*-----------------------------------------------------------------*/ |
|
701 case OP_CIRC: |
|
702 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) || |
|
703 ((ims & PCRE_MULTILINE) != 0 && |
|
704 ptr != end_subject && |
|
705 WAS_NEWLINE(ptr))) |
|
706 { ADD_ACTIVE(state_offset + 1, 0); } |
|
707 break; |
|
708 |
|
709 /*-----------------------------------------------------------------*/ |
|
710 case OP_EOD: |
|
711 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); } |
|
712 break; |
|
713 |
|
714 /*-----------------------------------------------------------------*/ |
|
715 case OP_OPT: |
|
716 ims = code[1]; |
|
717 ADD_ACTIVE(state_offset + 2, 0); |
|
718 break; |
|
719 |
|
720 /*-----------------------------------------------------------------*/ |
|
721 case OP_SOD: |
|
722 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); } |
|
723 break; |
|
724 |
|
725 /*-----------------------------------------------------------------*/ |
|
726 case OP_SOM: |
|
727 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); } |
|
728 break; |
|
729 |
|
730 |
|
731 /* ========================================================================== */ |
|
732 /* These opcodes inspect the next subject character, and sometimes |
|
733 the previous one as well, but do not have an argument. The variable |
|
734 clen contains the length of the current character and is zero if we are |
|
735 at the end of the subject. */ |
|
736 |
|
737 /*-----------------------------------------------------------------*/ |
|
738 case OP_ANY: |
|
739 if (clen > 0 && !IS_NEWLINE(ptr)) |
|
740 { ADD_NEW(state_offset + 1, 0); } |
|
741 break; |
|
742 |
|
743 /*-----------------------------------------------------------------*/ |
|
744 case OP_ALLANY: |
|
745 if (clen > 0) |
|
746 { ADD_NEW(state_offset + 1, 0); } |
|
747 break; |
|
748 |
|
749 /*-----------------------------------------------------------------*/ |
|
750 case OP_EODN: |
|
751 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen)) |
|
752 { ADD_ACTIVE(state_offset + 1, 0); } |
|
753 break; |
|
754 |
|
755 /*-----------------------------------------------------------------*/ |
|
756 case OP_DOLL: |
|
757 if ((md->moptions & PCRE_NOTEOL) == 0) |
|
758 { |
|
759 if (clen == 0 || |
|
760 (IS_NEWLINE(ptr) && |
|
761 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen) |
|
762 )) |
|
763 { ADD_ACTIVE(state_offset + 1, 0); } |
|
764 } |
|
765 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr)) |
|
766 { ADD_ACTIVE(state_offset + 1, 0); } |
|
767 break; |
|
768 |
|
769 /*-----------------------------------------------------------------*/ |
|
770 |
|
771 case OP_DIGIT: |
|
772 case OP_WHITESPACE: |
|
773 case OP_WORDCHAR: |
|
774 if (clen > 0 && c < 256 && |
|
775 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0) |
|
776 { ADD_NEW(state_offset + 1, 0); } |
|
777 break; |
|
778 |
|
779 /*-----------------------------------------------------------------*/ |
|
780 case OP_NOT_DIGIT: |
|
781 case OP_NOT_WHITESPACE: |
|
782 case OP_NOT_WORDCHAR: |
|
783 if (clen > 0 && (c >= 256 || |
|
784 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)) |
|
785 { ADD_NEW(state_offset + 1, 0); } |
|
786 break; |
|
787 |
|
788 /*-----------------------------------------------------------------*/ |
|
789 case OP_WORD_BOUNDARY: |
|
790 case OP_NOT_WORD_BOUNDARY: |
|
791 { |
|
792 int left_word, right_word; |
|
793 |
|
794 if (ptr > start_subject) |
|
795 { |
|
796 const uschar *temp = ptr - 1; |
|
797 #ifdef SUPPORT_UTF8 |
|
798 if (utf8) BACKCHAR(temp); |
|
799 #endif |
|
800 GETCHARTEST(d, temp); |
|
801 left_word = d < 256 && (ctypes[d] & ctype_word) != 0; |
|
802 } |
|
803 else left_word = 0; |
|
804 |
|
805 if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0; |
|
806 else right_word = 0; |
|
807 |
|
808 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY)) |
|
809 { ADD_ACTIVE(state_offset + 1, 0); } |
|
810 } |
|
811 break; |
|
812 |
|
813 |
|
814 /*-----------------------------------------------------------------*/ |
|
815 /* Check the next character by Unicode property. We will get here only |
|
816 if the support is in the binary; otherwise a compile-time error occurs. |
|
817 */ |
|
818 |
|
819 #ifdef SUPPORT_UCP |
|
820 case OP_PROP: |
|
821 case OP_NOTPROP: |
|
822 if (clen > 0) |
|
823 { |
|
824 BOOL OK; |
|
825 const ucd_record * prop = GET_UCD(c); |
|
826 switch(code[1]) |
|
827 { |
|
828 case PT_ANY: |
|
829 OK = TRUE; |
|
830 break; |
|
831 |
|
832 case PT_LAMP: |
|
833 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt; |
|
834 break; |
|
835 |
|
836 case PT_GC: |
|
837 OK = _pcre_ucp_gentype[prop->chartype] == code[2]; |
|
838 break; |
|
839 |
|
840 case PT_PC: |
|
841 OK = prop->chartype == code[2]; |
|
842 break; |
|
843 |
|
844 case PT_SC: |
|
845 OK = prop->script == code[2]; |
|
846 break; |
|
847 |
|
848 /* Should never occur, but keep compilers from grumbling. */ |
|
849 |
|
850 default: |
|
851 OK = codevalue != OP_PROP; |
|
852 break; |
|
853 } |
|
854 |
|
855 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); } |
|
856 } |
|
857 break; |
|
858 #endif |
|
859 |
|
860 |
|
861 |
|
862 /* ========================================================================== */ |
|
863 /* These opcodes likewise inspect the subject character, but have an |
|
864 argument that is not a data character. It is one of these opcodes: |
|
865 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, |
|
866 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */ |
|
867 |
|
868 case OP_TYPEPLUS: |
|
869 case OP_TYPEMINPLUS: |
|
870 case OP_TYPEPOSPLUS: |
|
871 count = current_state->count; /* Already matched */ |
|
872 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } |
|
873 if (clen > 0) |
|
874 { |
|
875 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
|
876 (c < 256 && |
|
877 (d != OP_ANY || !IS_NEWLINE(ptr)) && |
|
878 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
|
879 { |
|
880 if (count > 0 && codevalue == OP_TYPEPOSPLUS) |
|
881 { |
|
882 active_count--; /* Remove non-match possibility */ |
|
883 next_active_state--; |
|
884 } |
|
885 count++; |
|
886 ADD_NEW(state_offset, count); |
|
887 } |
|
888 } |
|
889 break; |
|
890 |
|
891 /*-----------------------------------------------------------------*/ |
|
892 case OP_TYPEQUERY: |
|
893 case OP_TYPEMINQUERY: |
|
894 case OP_TYPEPOSQUERY: |
|
895 ADD_ACTIVE(state_offset + 2, 0); |
|
896 if (clen > 0) |
|
897 { |
|
898 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
|
899 (c < 256 && |
|
900 (d != OP_ANY || !IS_NEWLINE(ptr)) && |
|
901 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
|
902 { |
|
903 if (codevalue == OP_TYPEPOSQUERY) |
|
904 { |
|
905 active_count--; /* Remove non-match possibility */ |
|
906 next_active_state--; |
|
907 } |
|
908 ADD_NEW(state_offset + 2, 0); |
|
909 } |
|
910 } |
|
911 break; |
|
912 |
|
913 /*-----------------------------------------------------------------*/ |
|
914 case OP_TYPESTAR: |
|
915 case OP_TYPEMINSTAR: |
|
916 case OP_TYPEPOSSTAR: |
|
917 ADD_ACTIVE(state_offset + 2, 0); |
|
918 if (clen > 0) |
|
919 { |
|
920 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
|
921 (c < 256 && |
|
922 (d != OP_ANY || !IS_NEWLINE(ptr)) && |
|
923 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
|
924 { |
|
925 if (codevalue == OP_TYPEPOSSTAR) |
|
926 { |
|
927 active_count--; /* Remove non-match possibility */ |
|
928 next_active_state--; |
|
929 } |
|
930 ADD_NEW(state_offset, 0); |
|
931 } |
|
932 } |
|
933 break; |
|
934 |
|
935 /*-----------------------------------------------------------------*/ |
|
936 case OP_TYPEEXACT: |
|
937 count = current_state->count; /* Number already matched */ |
|
938 if (clen > 0) |
|
939 { |
|
940 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
|
941 (c < 256 && |
|
942 (d != OP_ANY || !IS_NEWLINE(ptr)) && |
|
943 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
|
944 { |
|
945 if (++count >= GET2(code, 1)) |
|
946 { ADD_NEW(state_offset + 4, 0); } |
|
947 else |
|
948 { ADD_NEW(state_offset, count); } |
|
949 } |
|
950 } |
|
951 break; |
|
952 |
|
953 /*-----------------------------------------------------------------*/ |
|
954 case OP_TYPEUPTO: |
|
955 case OP_TYPEMINUPTO: |
|
956 case OP_TYPEPOSUPTO: |
|
957 ADD_ACTIVE(state_offset + 4, 0); |
|
958 count = current_state->count; /* Number already matched */ |
|
959 if (clen > 0) |
|
960 { |
|
961 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
|
962 (c < 256 && |
|
963 (d != OP_ANY || !IS_NEWLINE(ptr)) && |
|
964 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
|
965 { |
|
966 if (codevalue == OP_TYPEPOSUPTO) |
|
967 { |
|
968 active_count--; /* Remove non-match possibility */ |
|
969 next_active_state--; |
|
970 } |
|
971 if (++count >= GET2(code, 1)) |
|
972 { ADD_NEW(state_offset + 4, 0); } |
|
973 else |
|
974 { ADD_NEW(state_offset, count); } |
|
975 } |
|
976 } |
|
977 break; |
|
978 |
|
979 /* ========================================================================== */ |
|
980 /* These are virtual opcodes that are used when something like |
|
981 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_VSPACE, OP_NOT_VSPACE, OP_HSPACE, or OP_NOT_HSPACE as its |
|
982 argument. It keeps the code above fast for the other cases. The argument |
|
983 is in the d variable. */ |
|
984 |
|
985 #ifdef SUPPORT_UCP |
|
986 case OP_PROP_EXTRA + OP_TYPEPLUS: |
|
987 case OP_PROP_EXTRA + OP_TYPEMINPLUS: |
|
988 case OP_PROP_EXTRA + OP_TYPEPOSPLUS: |
|
989 count = current_state->count; /* Already matched */ |
|
990 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); } |
|
991 if (clen > 0) |
|
992 { |
|
993 BOOL OK; |
|
994 const ucd_record * prop = GET_UCD(c); |
|
995 switch(code[2]) |
|
996 { |
|
997 case PT_ANY: |
|
998 OK = TRUE; |
|
999 break; |
|
1000 |
|
1001 case PT_LAMP: |
|
1002 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt; |
|
1003 break; |
|
1004 |
|
1005 case PT_GC: |
|
1006 OK = _pcre_ucp_gentype[prop->chartype] == code[3]; |
|
1007 break; |
|
1008 |
|
1009 case PT_PC: |
|
1010 OK = prop->chartype == code[3]; |
|
1011 break; |
|
1012 |
|
1013 case PT_SC: |
|
1014 OK = prop->script == code[3]; |
|
1015 break; |
|
1016 |
|
1017 /* Should never occur, but keep compilers from grumbling. */ |
|
1018 |
|
1019 default: |
|
1020 OK = codevalue != OP_PROP; |
|
1021 break; |
|
1022 } |
|
1023 |
|
1024 if (OK == (d == OP_PROP)) |
|
1025 { |
|
1026 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS) |
|
1027 { |
|
1028 active_count--; /* Remove non-match possibility */ |
|
1029 next_active_state--; |
|
1030 } |
|
1031 count++; |
|
1032 ADD_NEW(state_offset, count); |
|
1033 } |
|
1034 } |
|
1035 break; |
|
1036 |
|
1037 /*-----------------------------------------------------------------*/ |
|
1038 case OP_EXTUNI_EXTRA + OP_TYPEPLUS: |
|
1039 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS: |
|
1040 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS: |
|
1041 count = current_state->count; /* Already matched */ |
|
1042 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } |
|
1043 if (clen > 0 && UCD_CATEGORY(c) != ucp_M) |
|
1044 { |
|
1045 const uschar *nptr = ptr + clen; |
|
1046 int ncount = 0; |
|
1047 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS) |
|
1048 { |
|
1049 active_count--; /* Remove non-match possibility */ |
|
1050 next_active_state--; |
|
1051 } |
|
1052 while (nptr < end_subject) |
|
1053 { |
|
1054 int nd; |
|
1055 int ndlen = 1; |
|
1056 GETCHARLEN(nd, nptr, ndlen); |
|
1057 if (UCD_CATEGORY(nd) != ucp_M) break; |
|
1058 ncount++; |
|
1059 nptr += ndlen; |
|
1060 } |
|
1061 count++; |
|
1062 ADD_NEW_DATA(-state_offset, count, ncount); |
|
1063 } |
|
1064 break; |
|
1065 #endif |
|
1066 |
|
1067 /*-----------------------------------------------------------------*/ |
|
1068 case OP_ANYNL_EXTRA + OP_TYPEPLUS: |
|
1069 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS: |
|
1070 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS: |
|
1071 count = current_state->count; /* Already matched */ |
|
1072 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } |
|
1073 if (clen > 0) |
|
1074 { |
|
1075 int ncount = 0; |
|
1076 switch (c) |
|
1077 { |
|
1078 case 0x000b: |
|
1079 case 0x000c: |
|
1080 case 0x0085: |
|
1081 case 0x2028: |
|
1082 case 0x2029: |
|
1083 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; |
|
1084 goto ANYNL01; |
|
1085 |
|
1086 case 0x000d: |
|
1087 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; |
|
1088 /* Fall through */ |
|
1089 |
|
1090 ANYNL01: |
|
1091 case 0x000a: |
|
1092 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS) |
|
1093 { |
|
1094 active_count--; /* Remove non-match possibility */ |
|
1095 next_active_state--; |
|
1096 } |
|
1097 count++; |
|
1098 ADD_NEW_DATA(-state_offset, count, ncount); |
|
1099 break; |
|
1100 |
|
1101 default: |
|
1102 break; |
|
1103 } |
|
1104 } |
|
1105 break; |
|
1106 |
|
1107 /*-----------------------------------------------------------------*/ |
|
1108 case OP_VSPACE_EXTRA + OP_TYPEPLUS: |
|
1109 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS: |
|
1110 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS: |
|
1111 count = current_state->count; /* Already matched */ |
|
1112 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } |
|
1113 if (clen > 0) |
|
1114 { |
|
1115 BOOL OK; |
|
1116 switch (c) |
|
1117 { |
|
1118 case 0x000a: |
|
1119 case 0x000b: |
|
1120 case 0x000c: |
|
1121 case 0x000d: |
|
1122 case 0x0085: |
|
1123 case 0x2028: |
|
1124 case 0x2029: |
|
1125 OK = TRUE; |
|
1126 break; |
|
1127 |
|
1128 default: |
|
1129 OK = FALSE; |
|
1130 break; |
|
1131 } |
|
1132 |
|
1133 if (OK == (d == OP_VSPACE)) |
|
1134 { |
|
1135 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS) |
|
1136 { |
|
1137 active_count--; /* Remove non-match possibility */ |
|
1138 next_active_state--; |
|
1139 } |
|
1140 count++; |
|
1141 ADD_NEW_DATA(-state_offset, count, 0); |
|
1142 } |
|
1143 } |
|
1144 break; |
|
1145 |
|
1146 /*-----------------------------------------------------------------*/ |
|
1147 case OP_HSPACE_EXTRA + OP_TYPEPLUS: |
|
1148 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS: |
|
1149 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS: |
|
1150 count = current_state->count; /* Already matched */ |
|
1151 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } |
|
1152 if (clen > 0) |
|
1153 { |
|
1154 BOOL OK; |
|
1155 switch (c) |
|
1156 { |
|
1157 case 0x09: /* HT */ |
|
1158 case 0x20: /* SPACE */ |
|
1159 case 0xa0: /* NBSP */ |
|
1160 case 0x1680: /* OGHAM SPACE MARK */ |
|
1161 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ |
|
1162 case 0x2000: /* EN QUAD */ |
|
1163 case 0x2001: /* EM QUAD */ |
|
1164 case 0x2002: /* EN SPACE */ |
|
1165 case 0x2003: /* EM SPACE */ |
|
1166 case 0x2004: /* THREE-PER-EM SPACE */ |
|
1167 case 0x2005: /* FOUR-PER-EM SPACE */ |
|
1168 case 0x2006: /* SIX-PER-EM SPACE */ |
|
1169 case 0x2007: /* FIGURE SPACE */ |
|
1170 case 0x2008: /* PUNCTUATION SPACE */ |
|
1171 case 0x2009: /* THIN SPACE */ |
|
1172 case 0x200A: /* HAIR SPACE */ |
|
1173 case 0x202f: /* NARROW NO-BREAK SPACE */ |
|
1174 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ |
|
1175 case 0x3000: /* IDEOGRAPHIC SPACE */ |
|
1176 OK = TRUE; |
|
1177 break; |
|
1178 |
|
1179 default: |
|
1180 OK = FALSE; |
|
1181 break; |
|
1182 } |
|
1183 |
|
1184 if (OK == (d == OP_HSPACE)) |
|
1185 { |
|
1186 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS) |
|
1187 { |
|
1188 active_count--; /* Remove non-match possibility */ |
|
1189 next_active_state--; |
|
1190 } |
|
1191 count++; |
|
1192 ADD_NEW_DATA(-state_offset, count, 0); |
|
1193 } |
|
1194 } |
|
1195 break; |
|
1196 |
|
1197 /*-----------------------------------------------------------------*/ |
|
1198 #ifdef SUPPORT_UCP |
|
1199 case OP_PROP_EXTRA + OP_TYPEQUERY: |
|
1200 case OP_PROP_EXTRA + OP_TYPEMINQUERY: |
|
1201 case OP_PROP_EXTRA + OP_TYPEPOSQUERY: |
|
1202 count = 4; |
|
1203 goto QS1; |
|
1204 |
|
1205 case OP_PROP_EXTRA + OP_TYPESTAR: |
|
1206 case OP_PROP_EXTRA + OP_TYPEMINSTAR: |
|
1207 case OP_PROP_EXTRA + OP_TYPEPOSSTAR: |
|
1208 count = 0; |
|
1209 |
|
1210 QS1: |
|
1211 |
|
1212 ADD_ACTIVE(state_offset + 4, 0); |
|
1213 if (clen > 0) |
|
1214 { |
|
1215 BOOL OK; |
|
1216 const ucd_record * prop = GET_UCD(c); |
|
1217 switch(code[2]) |
|
1218 { |
|
1219 case PT_ANY: |
|
1220 OK = TRUE; |
|
1221 break; |
|
1222 |
|
1223 case PT_LAMP: |
|
1224 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt; |
|
1225 break; |
|
1226 |
|
1227 case PT_GC: |
|
1228 OK = _pcre_ucp_gentype[prop->chartype] == code[3]; |
|
1229 break; |
|
1230 |
|
1231 case PT_PC: |
|
1232 OK = prop->chartype == code[3]; |
|
1233 break; |
|
1234 |
|
1235 case PT_SC: |
|
1236 OK = prop->script == code[3]; |
|
1237 break; |
|
1238 |
|
1239 /* Should never occur, but keep compilers from grumbling. */ |
|
1240 |
|
1241 default: |
|
1242 OK = codevalue != OP_PROP; |
|
1243 break; |
|
1244 } |
|
1245 |
|
1246 if (OK == (d == OP_PROP)) |
|
1247 { |
|
1248 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR || |
|
1249 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY) |
|
1250 { |
|
1251 active_count--; /* Remove non-match possibility */ |
|
1252 next_active_state--; |
|
1253 } |
|
1254 ADD_NEW(state_offset + count, 0); |
|
1255 } |
|
1256 } |
|
1257 break; |
|
1258 |
|
1259 /*-----------------------------------------------------------------*/ |
|
1260 case OP_EXTUNI_EXTRA + OP_TYPEQUERY: |
|
1261 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY: |
|
1262 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY: |
|
1263 count = 2; |
|
1264 goto QS2; |
|
1265 |
|
1266 case OP_EXTUNI_EXTRA + OP_TYPESTAR: |
|
1267 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR: |
|
1268 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR: |
|
1269 count = 0; |
|
1270 |
|
1271 QS2: |
|
1272 |
|
1273 ADD_ACTIVE(state_offset + 2, 0); |
|
1274 if (clen > 0 && UCD_CATEGORY(c) != ucp_M) |
|
1275 { |
|
1276 const uschar *nptr = ptr + clen; |
|
1277 int ncount = 0; |
|
1278 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR || |
|
1279 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY) |
|
1280 { |
|
1281 active_count--; /* Remove non-match possibility */ |
|
1282 next_active_state--; |
|
1283 } |
|
1284 while (nptr < end_subject) |
|
1285 { |
|
1286 int nd; |
|
1287 int ndlen = 1; |
|
1288 GETCHARLEN(nd, nptr, ndlen); |
|
1289 if (UCD_CATEGORY(nd) != ucp_M) break; |
|
1290 ncount++; |
|
1291 nptr += ndlen; |
|
1292 } |
|
1293 ADD_NEW_DATA(-(state_offset + count), 0, ncount); |
|
1294 } |
|
1295 break; |
|
1296 #endif |
|
1297 |
|
1298 /*-----------------------------------------------------------------*/ |
|
1299 case OP_ANYNL_EXTRA + OP_TYPEQUERY: |
|
1300 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY: |
|
1301 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY: |
|
1302 count = 2; |
|
1303 goto QS3; |
|
1304 |
|
1305 case OP_ANYNL_EXTRA + OP_TYPESTAR: |
|
1306 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR: |
|
1307 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR: |
|
1308 count = 0; |
|
1309 |
|
1310 QS3: |
|
1311 ADD_ACTIVE(state_offset + 2, 0); |
|
1312 if (clen > 0) |
|
1313 { |
|
1314 int ncount = 0; |
|
1315 switch (c) |
|
1316 { |
|
1317 case 0x000b: |
|
1318 case 0x000c: |
|
1319 case 0x0085: |
|
1320 case 0x2028: |
|
1321 case 0x2029: |
|
1322 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; |
|
1323 goto ANYNL02; |
|
1324 |
|
1325 case 0x000d: |
|
1326 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; |
|
1327 /* Fall through */ |
|
1328 |
|
1329 ANYNL02: |
|
1330 case 0x000a: |
|
1331 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR || |
|
1332 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY) |
|
1333 { |
|
1334 active_count--; /* Remove non-match possibility */ |
|
1335 next_active_state--; |
|
1336 } |
|
1337 ADD_NEW_DATA(-(state_offset + count), 0, ncount); |
|
1338 break; |
|
1339 |
|
1340 default: |
|
1341 break; |
|
1342 } |
|
1343 } |
|
1344 break; |
|
1345 |
|
1346 /*-----------------------------------------------------------------*/ |
|
1347 case OP_VSPACE_EXTRA + OP_TYPEQUERY: |
|
1348 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY: |
|
1349 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY: |
|
1350 count = 2; |
|
1351 goto QS4; |
|
1352 |
|
1353 case OP_VSPACE_EXTRA + OP_TYPESTAR: |
|
1354 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR: |
|
1355 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR: |
|
1356 count = 0; |
|
1357 |
|
1358 QS4: |
|
1359 ADD_ACTIVE(state_offset + 2, 0); |
|
1360 if (clen > 0) |
|
1361 { |
|
1362 BOOL OK; |
|
1363 switch (c) |
|
1364 { |
|
1365 case 0x000a: |
|
1366 case 0x000b: |
|
1367 case 0x000c: |
|
1368 case 0x000d: |
|
1369 case 0x0085: |
|
1370 case 0x2028: |
|
1371 case 0x2029: |
|
1372 OK = TRUE; |
|
1373 break; |
|
1374 |
|
1375 default: |
|
1376 OK = FALSE; |
|
1377 break; |
|
1378 } |
|
1379 if (OK == (d == OP_VSPACE)) |
|
1380 { |
|
1381 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR || |
|
1382 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY) |
|
1383 { |
|
1384 active_count--; /* Remove non-match possibility */ |
|
1385 next_active_state--; |
|
1386 } |
|
1387 ADD_NEW_DATA(-(state_offset + count), 0, 0); |
|
1388 } |
|
1389 } |
|
1390 break; |
|
1391 |
|
1392 /*-----------------------------------------------------------------*/ |
|
1393 case OP_HSPACE_EXTRA + OP_TYPEQUERY: |
|
1394 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY: |
|
1395 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY: |
|
1396 count = 2; |
|
1397 goto QS5; |
|
1398 |
|
1399 case OP_HSPACE_EXTRA + OP_TYPESTAR: |
|
1400 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR: |
|
1401 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR: |
|
1402 count = 0; |
|
1403 |
|
1404 QS5: |
|
1405 ADD_ACTIVE(state_offset + 2, 0); |
|
1406 if (clen > 0) |
|
1407 { |
|
1408 BOOL OK; |
|
1409 switch (c) |
|
1410 { |
|
1411 case 0x09: /* HT */ |
|
1412 case 0x20: /* SPACE */ |
|
1413 case 0xa0: /* NBSP */ |
|
1414 case 0x1680: /* OGHAM SPACE MARK */ |
|
1415 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ |
|
1416 case 0x2000: /* EN QUAD */ |
|
1417 case 0x2001: /* EM QUAD */ |
|
1418 case 0x2002: /* EN SPACE */ |
|
1419 case 0x2003: /* EM SPACE */ |
|
1420 case 0x2004: /* THREE-PER-EM SPACE */ |
|
1421 case 0x2005: /* FOUR-PER-EM SPACE */ |
|
1422 case 0x2006: /* SIX-PER-EM SPACE */ |
|
1423 case 0x2007: /* FIGURE SPACE */ |
|
1424 case 0x2008: /* PUNCTUATION SPACE */ |
|
1425 case 0x2009: /* THIN SPACE */ |
|
1426 case 0x200A: /* HAIR SPACE */ |
|
1427 case 0x202f: /* NARROW NO-BREAK SPACE */ |
|
1428 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ |
|
1429 case 0x3000: /* IDEOGRAPHIC SPACE */ |
|
1430 OK = TRUE; |
|
1431 break; |
|
1432 |
|
1433 default: |
|
1434 OK = FALSE; |
|
1435 break; |
|
1436 } |
|
1437 |
|
1438 if (OK == (d == OP_HSPACE)) |
|
1439 { |
|
1440 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR || |
|
1441 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY) |
|
1442 { |
|
1443 active_count--; /* Remove non-match possibility */ |
|
1444 next_active_state--; |
|
1445 } |
|
1446 ADD_NEW_DATA(-(state_offset + count), 0, 0); |
|
1447 } |
|
1448 } |
|
1449 break; |
|
1450 |
|
1451 /*-----------------------------------------------------------------*/ |
|
1452 #ifdef SUPPORT_UCP |
|
1453 case OP_PROP_EXTRA + OP_TYPEEXACT: |
|
1454 case OP_PROP_EXTRA + OP_TYPEUPTO: |
|
1455 case OP_PROP_EXTRA + OP_TYPEMINUPTO: |
|
1456 case OP_PROP_EXTRA + OP_TYPEPOSUPTO: |
|
1457 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) |
|
1458 { ADD_ACTIVE(state_offset + 6, 0); } |
|
1459 count = current_state->count; /* Number already matched */ |
|
1460 if (clen > 0) |
|
1461 { |
|
1462 BOOL OK; |
|
1463 const ucd_record * prop = GET_UCD(c); |
|
1464 switch(code[4]) |
|
1465 { |
|
1466 case PT_ANY: |
|
1467 OK = TRUE; |
|
1468 break; |
|
1469 |
|
1470 case PT_LAMP: |
|
1471 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt; |
|
1472 break; |
|
1473 |
|
1474 case PT_GC: |
|
1475 OK = _pcre_ucp_gentype[prop->chartype] == code[5]; |
|
1476 break; |
|
1477 |
|
1478 case PT_PC: |
|
1479 OK = prop->chartype == code[5]; |
|
1480 break; |
|
1481 |
|
1482 case PT_SC: |
|
1483 OK = prop->script == code[5]; |
|
1484 break; |
|
1485 |
|
1486 /* Should never occur, but keep compilers from grumbling. */ |
|
1487 |
|
1488 default: |
|
1489 OK = codevalue != OP_PROP; |
|
1490 break; |
|
1491 } |
|
1492 |
|
1493 if (OK == (d == OP_PROP)) |
|
1494 { |
|
1495 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO) |
|
1496 { |
|
1497 active_count--; /* Remove non-match possibility */ |
|
1498 next_active_state--; |
|
1499 } |
|
1500 if (++count >= GET2(code, 1)) |
|
1501 { ADD_NEW(state_offset + 6, 0); } |
|
1502 else |
|
1503 { ADD_NEW(state_offset, count); } |
|
1504 } |
|
1505 } |
|
1506 break; |
|
1507 |
|
1508 /*-----------------------------------------------------------------*/ |
|
1509 case OP_EXTUNI_EXTRA + OP_TYPEEXACT: |
|
1510 case OP_EXTUNI_EXTRA + OP_TYPEUPTO: |
|
1511 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO: |
|
1512 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO: |
|
1513 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) |
|
1514 { ADD_ACTIVE(state_offset + 4, 0); } |
|
1515 count = current_state->count; /* Number already matched */ |
|
1516 if (clen > 0 && UCD_CATEGORY(c) != ucp_M) |
|
1517 { |
|
1518 const uschar *nptr = ptr + clen; |
|
1519 int ncount = 0; |
|
1520 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO) |
|
1521 { |
|
1522 active_count--; /* Remove non-match possibility */ |
|
1523 next_active_state--; |
|
1524 } |
|
1525 while (nptr < end_subject) |
|
1526 { |
|
1527 int nd; |
|
1528 int ndlen = 1; |
|
1529 GETCHARLEN(nd, nptr, ndlen); |
|
1530 if (UCD_CATEGORY(nd) != ucp_M) break; |
|
1531 ncount++; |
|
1532 nptr += ndlen; |
|
1533 } |
|
1534 if (++count >= GET2(code, 1)) |
|
1535 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); } |
|
1536 else |
|
1537 { ADD_NEW_DATA(-state_offset, count, ncount); } |
|
1538 } |
|
1539 break; |
|
1540 #endif |
|
1541 |
|
1542 /*-----------------------------------------------------------------*/ |
|
1543 case OP_ANYNL_EXTRA + OP_TYPEEXACT: |
|
1544 case OP_ANYNL_EXTRA + OP_TYPEUPTO: |
|
1545 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO: |
|
1546 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO: |
|
1547 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT) |
|
1548 { ADD_ACTIVE(state_offset + 4, 0); } |
|
1549 count = current_state->count; /* Number already matched */ |
|
1550 if (clen > 0) |
|
1551 { |
|
1552 int ncount = 0; |
|
1553 switch (c) |
|
1554 { |
|
1555 case 0x000b: |
|
1556 case 0x000c: |
|
1557 case 0x0085: |
|
1558 case 0x2028: |
|
1559 case 0x2029: |
|
1560 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; |
|
1561 goto ANYNL03; |
|
1562 |
|
1563 case 0x000d: |
|
1564 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; |
|
1565 /* Fall through */ |
|
1566 |
|
1567 ANYNL03: |
|
1568 case 0x000a: |
|
1569 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO) |
|
1570 { |
|
1571 active_count--; /* Remove non-match possibility */ |
|
1572 next_active_state--; |
|
1573 } |
|
1574 if (++count >= GET2(code, 1)) |
|
1575 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); } |
|
1576 else |
|
1577 { ADD_NEW_DATA(-state_offset, count, ncount); } |
|
1578 break; |
|
1579 |
|
1580 default: |
|
1581 break; |
|
1582 } |
|
1583 } |
|
1584 break; |
|
1585 |
|
1586 /*-----------------------------------------------------------------*/ |
|
1587 case OP_VSPACE_EXTRA + OP_TYPEEXACT: |
|
1588 case OP_VSPACE_EXTRA + OP_TYPEUPTO: |
|
1589 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO: |
|
1590 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO: |
|
1591 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT) |
|
1592 { ADD_ACTIVE(state_offset + 4, 0); } |
|
1593 count = current_state->count; /* Number already matched */ |
|
1594 if (clen > 0) |
|
1595 { |
|
1596 BOOL OK; |
|
1597 switch (c) |
|
1598 { |
|
1599 case 0x000a: |
|
1600 case 0x000b: |
|
1601 case 0x000c: |
|
1602 case 0x000d: |
|
1603 case 0x0085: |
|
1604 case 0x2028: |
|
1605 case 0x2029: |
|
1606 OK = TRUE; |
|
1607 break; |
|
1608 |
|
1609 default: |
|
1610 OK = FALSE; |
|
1611 } |
|
1612 |
|
1613 if (OK == (d == OP_VSPACE)) |
|
1614 { |
|
1615 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO) |
|
1616 { |
|
1617 active_count--; /* Remove non-match possibility */ |
|
1618 next_active_state--; |
|
1619 } |
|
1620 if (++count >= GET2(code, 1)) |
|
1621 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); } |
|
1622 else |
|
1623 { ADD_NEW_DATA(-state_offset, count, 0); } |
|
1624 } |
|
1625 } |
|
1626 break; |
|
1627 |
|
1628 /*-----------------------------------------------------------------*/ |
|
1629 case OP_HSPACE_EXTRA + OP_TYPEEXACT: |
|
1630 case OP_HSPACE_EXTRA + OP_TYPEUPTO: |
|
1631 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO: |
|
1632 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO: |
|
1633 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT) |
|
1634 { ADD_ACTIVE(state_offset + 4, 0); } |
|
1635 count = current_state->count; /* Number already matched */ |
|
1636 if (clen > 0) |
|
1637 { |
|
1638 BOOL OK; |
|
1639 switch (c) |
|
1640 { |
|
1641 case 0x09: /* HT */ |
|
1642 case 0x20: /* SPACE */ |
|
1643 case 0xa0: /* NBSP */ |
|
1644 case 0x1680: /* OGHAM SPACE MARK */ |
|
1645 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ |
|
1646 case 0x2000: /* EN QUAD */ |
|
1647 case 0x2001: /* EM QUAD */ |
|
1648 case 0x2002: /* EN SPACE */ |
|
1649 case 0x2003: /* EM SPACE */ |
|
1650 case 0x2004: /* THREE-PER-EM SPACE */ |
|
1651 case 0x2005: /* FOUR-PER-EM SPACE */ |
|
1652 case 0x2006: /* SIX-PER-EM SPACE */ |
|
1653 case 0x2007: /* FIGURE SPACE */ |
|
1654 case 0x2008: /* PUNCTUATION SPACE */ |
|
1655 case 0x2009: /* THIN SPACE */ |
|
1656 case 0x200A: /* HAIR SPACE */ |
|
1657 case 0x202f: /* NARROW NO-BREAK SPACE */ |
|
1658 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ |
|
1659 case 0x3000: /* IDEOGRAPHIC SPACE */ |
|
1660 OK = TRUE; |
|
1661 break; |
|
1662 |
|
1663 default: |
|
1664 OK = FALSE; |
|
1665 break; |
|
1666 } |
|
1667 |
|
1668 if (OK == (d == OP_HSPACE)) |
|
1669 { |
|
1670 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO) |
|
1671 { |
|
1672 active_count--; /* Remove non-match possibility */ |
|
1673 next_active_state--; |
|
1674 } |
|
1675 if (++count >= GET2(code, 1)) |
|
1676 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); } |
|
1677 else |
|
1678 { ADD_NEW_DATA(-state_offset, count, 0); } |
|
1679 } |
|
1680 } |
|
1681 break; |
|
1682 |
|
1683 /* ========================================================================== */ |
|
1684 /* These opcodes are followed by a character that is usually compared |
|
1685 to the current subject character; it is loaded into d. We still get |
|
1686 here even if there is no subject character, because in some cases zero |
|
1687 repetitions are permitted. */ |
|
1688 |
|
1689 /*-----------------------------------------------------------------*/ |
|
1690 case OP_CHAR: |
|
1691 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); } |
|
1692 break; |
|
1693 |
|
1694 /*-----------------------------------------------------------------*/ |
|
1695 case OP_CHARNC: |
|
1696 if (clen == 0) break; |
|
1697 |
|
1698 #ifdef SUPPORT_UTF8 |
|
1699 if (utf8) |
|
1700 { |
|
1701 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else |
|
1702 { |
|
1703 unsigned int othercase; |
|
1704 if (c < 128) othercase = fcc[c]; else |
|
1705 |
|
1706 /* If we have Unicode property support, we can use it to test the |
|
1707 other case of the character. */ |
|
1708 |
|
1709 #ifdef SUPPORT_UCP |
|
1710 othercase = UCD_OTHERCASE(c); |
|
1711 #else |
|
1712 othercase = NOTACHAR; |
|
1713 #endif |
|
1714 |
|
1715 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } |
|
1716 } |
|
1717 } |
|
1718 else |
|
1719 #endif /* SUPPORT_UTF8 */ |
|
1720 |
|
1721 /* Non-UTF-8 mode */ |
|
1722 { |
|
1723 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); } |
|
1724 } |
|
1725 break; |
|
1726 |
|
1727 |
|
1728 #ifdef SUPPORT_UCP |
|
1729 /*-----------------------------------------------------------------*/ |
|
1730 /* This is a tricky one because it can match more than one character. |
|
1731 Find out how many characters to skip, and then set up a negative state |
|
1732 to wait for them to pass before continuing. */ |
|
1733 |
|
1734 case OP_EXTUNI: |
|
1735 if (clen > 0 && UCD_CATEGORY(c) != ucp_M) |
|
1736 { |
|
1737 const uschar *nptr = ptr + clen; |
|
1738 int ncount = 0; |
|
1739 while (nptr < end_subject) |
|
1740 { |
|
1741 int nclen = 1; |
|
1742 GETCHARLEN(c, nptr, nclen); |
|
1743 if (UCD_CATEGORY(c) != ucp_M) break; |
|
1744 ncount++; |
|
1745 nptr += nclen; |
|
1746 } |
|
1747 ADD_NEW_DATA(-(state_offset + 1), 0, ncount); |
|
1748 } |
|
1749 break; |
|
1750 #endif |
|
1751 |
|
1752 /*-----------------------------------------------------------------*/ |
|
1753 /* This is tricky like EXTUNI because it too can match more than one |
|
1754 character (when CR is followed by LF). In this case, set up a negative |
|
1755 state to wait for one character to pass before continuing. */ |
|
1756 |
|
1757 case OP_ANYNL: |
|
1758 if (clen > 0) switch(c) |
|
1759 { |
|
1760 case 0x000b: |
|
1761 case 0x000c: |
|
1762 case 0x0085: |
|
1763 case 0x2028: |
|
1764 case 0x2029: |
|
1765 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; |
|
1766 |
|
1767 case 0x000a: |
|
1768 ADD_NEW(state_offset + 1, 0); |
|
1769 break; |
|
1770 |
|
1771 case 0x000d: |
|
1772 if (ptr + 1 < end_subject && ptr[1] == 0x0a) |
|
1773 { |
|
1774 ADD_NEW_DATA(-(state_offset + 1), 0, 1); |
|
1775 } |
|
1776 else |
|
1777 { |
|
1778 ADD_NEW(state_offset + 1, 0); |
|
1779 } |
|
1780 break; |
|
1781 } |
|
1782 break; |
|
1783 |
|
1784 /*-----------------------------------------------------------------*/ |
|
1785 case OP_NOT_VSPACE: |
|
1786 if (clen > 0) switch(c) |
|
1787 { |
|
1788 case 0x000a: |
|
1789 case 0x000b: |
|
1790 case 0x000c: |
|
1791 case 0x000d: |
|
1792 case 0x0085: |
|
1793 case 0x2028: |
|
1794 case 0x2029: |
|
1795 break; |
|
1796 |
|
1797 default: |
|
1798 ADD_NEW(state_offset + 1, 0); |
|
1799 break; |
|
1800 } |
|
1801 break; |
|
1802 |
|
1803 /*-----------------------------------------------------------------*/ |
|
1804 case OP_VSPACE: |
|
1805 if (clen > 0) switch(c) |
|
1806 { |
|
1807 case 0x000a: |
|
1808 case 0x000b: |
|
1809 case 0x000c: |
|
1810 case 0x000d: |
|
1811 case 0x0085: |
|
1812 case 0x2028: |
|
1813 case 0x2029: |
|
1814 ADD_NEW(state_offset + 1, 0); |
|
1815 break; |
|
1816 |
|
1817 default: break; |
|
1818 } |
|
1819 break; |
|
1820 |
|
1821 /*-----------------------------------------------------------------*/ |
|
1822 case OP_NOT_HSPACE: |
|
1823 if (clen > 0) switch(c) |
|
1824 { |
|
1825 case 0x09: /* HT */ |
|
1826 case 0x20: /* SPACE */ |
|
1827 case 0xa0: /* NBSP */ |
|
1828 case 0x1680: /* OGHAM SPACE MARK */ |
|
1829 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ |
|
1830 case 0x2000: /* EN QUAD */ |
|
1831 case 0x2001: /* EM QUAD */ |
|
1832 case 0x2002: /* EN SPACE */ |
|
1833 case 0x2003: /* EM SPACE */ |
|
1834 case 0x2004: /* THREE-PER-EM SPACE */ |
|
1835 case 0x2005: /* FOUR-PER-EM SPACE */ |
|
1836 case 0x2006: /* SIX-PER-EM SPACE */ |
|
1837 case 0x2007: /* FIGURE SPACE */ |
|
1838 case 0x2008: /* PUNCTUATION SPACE */ |
|
1839 case 0x2009: /* THIN SPACE */ |
|
1840 case 0x200A: /* HAIR SPACE */ |
|
1841 case 0x202f: /* NARROW NO-BREAK SPACE */ |
|
1842 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ |
|
1843 case 0x3000: /* IDEOGRAPHIC SPACE */ |
|
1844 break; |
|
1845 |
|
1846 default: |
|
1847 ADD_NEW(state_offset + 1, 0); |
|
1848 break; |
|
1849 } |
|
1850 break; |
|
1851 |
|
1852 /*-----------------------------------------------------------------*/ |
|
1853 case OP_HSPACE: |
|
1854 if (clen > 0) switch(c) |
|
1855 { |
|
1856 case 0x09: /* HT */ |
|
1857 case 0x20: /* SPACE */ |
|
1858 case 0xa0: /* NBSP */ |
|
1859 case 0x1680: /* OGHAM SPACE MARK */ |
|
1860 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ |
|
1861 case 0x2000: /* EN QUAD */ |
|
1862 case 0x2001: /* EM QUAD */ |
|
1863 case 0x2002: /* EN SPACE */ |
|
1864 case 0x2003: /* EM SPACE */ |
|
1865 case 0x2004: /* THREE-PER-EM SPACE */ |
|
1866 case 0x2005: /* FOUR-PER-EM SPACE */ |
|
1867 case 0x2006: /* SIX-PER-EM SPACE */ |
|
1868 case 0x2007: /* FIGURE SPACE */ |
|
1869 case 0x2008: /* PUNCTUATION SPACE */ |
|
1870 case 0x2009: /* THIN SPACE */ |
|
1871 case 0x200A: /* HAIR SPACE */ |
|
1872 case 0x202f: /* NARROW NO-BREAK SPACE */ |
|
1873 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ |
|
1874 case 0x3000: /* IDEOGRAPHIC SPACE */ |
|
1875 ADD_NEW(state_offset + 1, 0); |
|
1876 break; |
|
1877 } |
|
1878 break; |
|
1879 |
|
1880 /*-----------------------------------------------------------------*/ |
|
1881 /* Match a negated single character. This is only used for one-byte |
|
1882 characters, that is, we know that d < 256. The character we are |
|
1883 checking (c) can be multibyte. */ |
|
1884 |
|
1885 case OP_NOT: |
|
1886 if (clen > 0) |
|
1887 { |
|
1888 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d; |
|
1889 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); } |
|
1890 } |
|
1891 break; |
|
1892 |
|
1893 /*-----------------------------------------------------------------*/ |
|
1894 case OP_PLUS: |
|
1895 case OP_MINPLUS: |
|
1896 case OP_POSPLUS: |
|
1897 case OP_NOTPLUS: |
|
1898 case OP_NOTMINPLUS: |
|
1899 case OP_NOTPOSPLUS: |
|
1900 count = current_state->count; /* Already matched */ |
|
1901 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); } |
|
1902 if (clen > 0) |
|
1903 { |
|
1904 unsigned int otherd = NOTACHAR; |
|
1905 if ((ims & PCRE_CASELESS) != 0) |
|
1906 { |
|
1907 #ifdef SUPPORT_UTF8 |
|
1908 if (utf8 && d >= 128) |
|
1909 { |
|
1910 #ifdef SUPPORT_UCP |
|
1911 otherd = UCD_OTHERCASE(d); |
|
1912 #endif /* SUPPORT_UCP */ |
|
1913 } |
|
1914 else |
|
1915 #endif /* SUPPORT_UTF8 */ |
|
1916 otherd = fcc[d]; |
|
1917 } |
|
1918 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) |
|
1919 { |
|
1920 if (count > 0 && |
|
1921 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS)) |
|
1922 { |
|
1923 active_count--; /* Remove non-match possibility */ |
|
1924 next_active_state--; |
|
1925 } |
|
1926 count++; |
|
1927 ADD_NEW(state_offset, count); |
|
1928 } |
|
1929 } |
|
1930 break; |
|
1931 |
|
1932 /*-----------------------------------------------------------------*/ |
|
1933 case OP_QUERY: |
|
1934 case OP_MINQUERY: |
|
1935 case OP_POSQUERY: |
|
1936 case OP_NOTQUERY: |
|
1937 case OP_NOTMINQUERY: |
|
1938 case OP_NOTPOSQUERY: |
|
1939 ADD_ACTIVE(state_offset + dlen + 1, 0); |
|
1940 if (clen > 0) |
|
1941 { |
|
1942 unsigned int otherd = NOTACHAR; |
|
1943 if ((ims & PCRE_CASELESS) != 0) |
|
1944 { |
|
1945 #ifdef SUPPORT_UTF8 |
|
1946 if (utf8 && d >= 128) |
|
1947 { |
|
1948 #ifdef SUPPORT_UCP |
|
1949 otherd = UCD_OTHERCASE(d); |
|
1950 #endif /* SUPPORT_UCP */ |
|
1951 } |
|
1952 else |
|
1953 #endif /* SUPPORT_UTF8 */ |
|
1954 otherd = fcc[d]; |
|
1955 } |
|
1956 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) |
|
1957 { |
|
1958 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY) |
|
1959 { |
|
1960 active_count--; /* Remove non-match possibility */ |
|
1961 next_active_state--; |
|
1962 } |
|
1963 ADD_NEW(state_offset + dlen + 1, 0); |
|
1964 } |
|
1965 } |
|
1966 break; |
|
1967 |
|
1968 /*-----------------------------------------------------------------*/ |
|
1969 case OP_STAR: |
|
1970 case OP_MINSTAR: |
|
1971 case OP_POSSTAR: |
|
1972 case OP_NOTSTAR: |
|
1973 case OP_NOTMINSTAR: |
|
1974 case OP_NOTPOSSTAR: |
|
1975 ADD_ACTIVE(state_offset + dlen + 1, 0); |
|
1976 if (clen > 0) |
|
1977 { |
|
1978 unsigned int otherd = NOTACHAR; |
|
1979 if ((ims & PCRE_CASELESS) != 0) |
|
1980 { |
|
1981 #ifdef SUPPORT_UTF8 |
|
1982 if (utf8 && d >= 128) |
|
1983 { |
|
1984 #ifdef SUPPORT_UCP |
|
1985 otherd = UCD_OTHERCASE(d); |
|
1986 #endif /* SUPPORT_UCP */ |
|
1987 } |
|
1988 else |
|
1989 #endif /* SUPPORT_UTF8 */ |
|
1990 otherd = fcc[d]; |
|
1991 } |
|
1992 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) |
|
1993 { |
|
1994 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR) |
|
1995 { |
|
1996 active_count--; /* Remove non-match possibility */ |
|
1997 next_active_state--; |
|
1998 } |
|
1999 ADD_NEW(state_offset, 0); |
|
2000 } |
|
2001 } |
|
2002 break; |
|
2003 |
|
2004 /*-----------------------------------------------------------------*/ |
|
2005 case OP_EXACT: |
|
2006 case OP_NOTEXACT: |
|
2007 count = current_state->count; /* Number already matched */ |
|
2008 if (clen > 0) |
|
2009 { |
|
2010 unsigned int otherd = NOTACHAR; |
|
2011 if ((ims & PCRE_CASELESS) != 0) |
|
2012 { |
|
2013 #ifdef SUPPORT_UTF8 |
|
2014 if (utf8 && d >= 128) |
|
2015 { |
|
2016 #ifdef SUPPORT_UCP |
|
2017 otherd = UCD_OTHERCASE(d); |
|
2018 #endif /* SUPPORT_UCP */ |
|
2019 } |
|
2020 else |
|
2021 #endif /* SUPPORT_UTF8 */ |
|
2022 otherd = fcc[d]; |
|
2023 } |
|
2024 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) |
|
2025 { |
|
2026 if (++count >= GET2(code, 1)) |
|
2027 { ADD_NEW(state_offset + dlen + 3, 0); } |
|
2028 else |
|
2029 { ADD_NEW(state_offset, count); } |
|
2030 } |
|
2031 } |
|
2032 break; |
|
2033 |
|
2034 /*-----------------------------------------------------------------*/ |
|
2035 case OP_UPTO: |
|
2036 case OP_MINUPTO: |
|
2037 case OP_POSUPTO: |
|
2038 case OP_NOTUPTO: |
|
2039 case OP_NOTMINUPTO: |
|
2040 case OP_NOTPOSUPTO: |
|
2041 ADD_ACTIVE(state_offset + dlen + 3, 0); |
|
2042 count = current_state->count; /* Number already matched */ |
|
2043 if (clen > 0) |
|
2044 { |
|
2045 unsigned int otherd = NOTACHAR; |
|
2046 if ((ims & PCRE_CASELESS) != 0) |
|
2047 { |
|
2048 #ifdef SUPPORT_UTF8 |
|
2049 if (utf8 && d >= 128) |
|
2050 { |
|
2051 #ifdef SUPPORT_UCP |
|
2052 otherd = UCD_OTHERCASE(d); |
|
2053 #endif /* SUPPORT_UCP */ |
|
2054 } |
|
2055 else |
|
2056 #endif /* SUPPORT_UTF8 */ |
|
2057 otherd = fcc[d]; |
|
2058 } |
|
2059 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) |
|
2060 { |
|
2061 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO) |
|
2062 { |
|
2063 active_count--; /* Remove non-match possibility */ |
|
2064 next_active_state--; |
|
2065 } |
|
2066 if (++count >= GET2(code, 1)) |
|
2067 { ADD_NEW(state_offset + dlen + 3, 0); } |
|
2068 else |
|
2069 { ADD_NEW(state_offset, count); } |
|
2070 } |
|
2071 } |
|
2072 break; |
|
2073 |
|
2074 |
|
2075 /* ========================================================================== */ |
|
2076 /* These are the class-handling opcodes */ |
|
2077 |
|
2078 case OP_CLASS: |
|
2079 case OP_NCLASS: |
|
2080 case OP_XCLASS: |
|
2081 { |
|
2082 BOOL isinclass = FALSE; |
|
2083 int next_state_offset; |
|
2084 const uschar *ecode; |
|
2085 |
|
2086 /* For a simple class, there is always just a 32-byte table, and we |
|
2087 can set isinclass from it. */ |
|
2088 |
|
2089 if (codevalue != OP_XCLASS) |
|
2090 { |
|
2091 ecode = code + 33; |
|
2092 if (clen > 0) |
|
2093 { |
|
2094 isinclass = (c > 255)? (codevalue == OP_NCLASS) : |
|
2095 ((code[1 + c/8] & (1 << (c&7))) != 0); |
|
2096 } |
|
2097 } |
|
2098 |
|
2099 /* An extended class may have a table or a list of single characters, |
|
2100 ranges, or both, and it may be positive or negative. There's a |
|
2101 function that sorts all this out. */ |
|
2102 |
|
2103 else |
|
2104 { |
|
2105 ecode = code + GET(code, 1); |
|
2106 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE); |
|
2107 } |
|
2108 |
|
2109 /* At this point, isinclass is set for all kinds of class, and ecode |
|
2110 points to the byte after the end of the class. If there is a |
|
2111 quantifier, this is where it will be. */ |
|
2112 |
|
2113 next_state_offset = ecode - start_code; |
|
2114 |
|
2115 switch (*ecode) |
|
2116 { |
|
2117 case OP_CRSTAR: |
|
2118 case OP_CRMINSTAR: |
|
2119 ADD_ACTIVE(next_state_offset + 1, 0); |
|
2120 if (isinclass) { ADD_NEW(state_offset, 0); } |
|
2121 break; |
|
2122 |
|
2123 case OP_CRPLUS: |
|
2124 case OP_CRMINPLUS: |
|
2125 count = current_state->count; /* Already matched */ |
|
2126 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); } |
|
2127 if (isinclass) { count++; ADD_NEW(state_offset, count); } |
|
2128 break; |
|
2129 |
|
2130 case OP_CRQUERY: |
|
2131 case OP_CRMINQUERY: |
|
2132 ADD_ACTIVE(next_state_offset + 1, 0); |
|
2133 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); } |
|
2134 break; |
|
2135 |
|
2136 case OP_CRRANGE: |
|
2137 case OP_CRMINRANGE: |
|
2138 count = current_state->count; /* Already matched */ |
|
2139 if (count >= GET2(ecode, 1)) |
|
2140 { ADD_ACTIVE(next_state_offset + 5, 0); } |
|
2141 if (isinclass) |
|
2142 { |
|
2143 int max = GET2(ecode, 3); |
|
2144 if (++count >= max && max != 0) /* Max 0 => no limit */ |
|
2145 { ADD_NEW(next_state_offset + 5, 0); } |
|
2146 else |
|
2147 { ADD_NEW(state_offset, count); } |
|
2148 } |
|
2149 break; |
|
2150 |
|
2151 default: |
|
2152 if (isinclass) { ADD_NEW(next_state_offset, 0); } |
|
2153 break; |
|
2154 } |
|
2155 } |
|
2156 break; |
|
2157 |
|
2158 /* ========================================================================== */ |
|
2159 /* These are the opcodes for fancy brackets of various kinds. We have |
|
2160 to use recursion in order to handle them. The "always failing" assertion |
|
2161 (?!) is optimised when compiling to OP_FAIL, so we have to support that, |
|
2162 though the other "backtracking verbs" are not supported. */ |
|
2163 |
|
2164 case OP_FAIL: |
|
2165 break; |
|
2166 |
|
2167 case OP_ASSERT: |
|
2168 case OP_ASSERT_NOT: |
|
2169 case OP_ASSERTBACK: |
|
2170 case OP_ASSERTBACK_NOT: |
|
2171 { |
|
2172 int rc; |
|
2173 int local_offsets[2]; |
|
2174 int local_workspace[1000]; |
|
2175 const uschar *endasscode = code + GET(code, 1); |
|
2176 |
|
2177 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); |
|
2178 |
|
2179 rc = internal_dfa_exec( |
|
2180 md, /* static match data */ |
|
2181 code, /* this subexpression's code */ |
|
2182 ptr, /* where we currently are */ |
|
2183 ptr - start_subject, /* start offset */ |
|
2184 local_offsets, /* offset vector */ |
|
2185 sizeof(local_offsets)/sizeof(int), /* size of same */ |
|
2186 local_workspace, /* workspace vector */ |
|
2187 sizeof(local_workspace)/sizeof(int), /* size of same */ |
|
2188 ims, /* the current ims flags */ |
|
2189 rlevel, /* function recursion level */ |
|
2190 recursing); /* pass on regex recursion */ |
|
2191 |
|
2192 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK)) |
|
2193 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); } |
|
2194 } |
|
2195 break; |
|
2196 |
|
2197 /*-----------------------------------------------------------------*/ |
|
2198 case OP_COND: |
|
2199 case OP_SCOND: |
|
2200 { |
|
2201 int local_offsets[1000]; |
|
2202 int local_workspace[1000]; |
|
2203 int condcode = code[LINK_SIZE+1]; |
|
2204 |
|
2205 /* Back reference conditions are not supported */ |
|
2206 |
|
2207 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND; |
|
2208 |
|
2209 /* The DEFINE condition is always false */ |
|
2210 |
|
2211 if (condcode == OP_DEF) |
|
2212 { |
|
2213 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); |
|
2214 } |
|
2215 |
|
2216 /* The only supported version of OP_RREF is for the value RREF_ANY, |
|
2217 which means "test if in any recursion". We can't test for specifically |
|
2218 recursed groups. */ |
|
2219 |
|
2220 else if (condcode == OP_RREF) |
|
2221 { |
|
2222 int value = GET2(code, LINK_SIZE+2); |
|
2223 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND; |
|
2224 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); } |
|
2225 else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); } |
|
2226 } |
|
2227 |
|
2228 /* Otherwise, the condition is an assertion */ |
|
2229 |
|
2230 else |
|
2231 { |
|
2232 int rc; |
|
2233 const uschar *asscode = code + LINK_SIZE + 1; |
|
2234 const uschar *endasscode = asscode + GET(asscode, 1); |
|
2235 |
|
2236 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); |
|
2237 |
|
2238 rc = internal_dfa_exec( |
|
2239 md, /* fixed match data */ |
|
2240 asscode, /* this subexpression's code */ |
|
2241 ptr, /* where we currently are */ |
|
2242 ptr - start_subject, /* start offset */ |
|
2243 local_offsets, /* offset vector */ |
|
2244 sizeof(local_offsets)/sizeof(int), /* size of same */ |
|
2245 local_workspace, /* workspace vector */ |
|
2246 sizeof(local_workspace)/sizeof(int), /* size of same */ |
|
2247 ims, /* the current ims flags */ |
|
2248 rlevel, /* function recursion level */ |
|
2249 recursing); /* pass on regex recursion */ |
|
2250 |
|
2251 if ((rc >= 0) == |
|
2252 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK)) |
|
2253 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); } |
|
2254 else |
|
2255 { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); } |
|
2256 } |
|
2257 } |
|
2258 break; |
|
2259 |
|
2260 /*-----------------------------------------------------------------*/ |
|
2261 case OP_RECURSE: |
|
2262 { |
|
2263 int local_offsets[1000]; |
|
2264 int local_workspace[1000]; |
|
2265 int rc; |
|
2266 |
|
2267 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP, |
|
2268 recursing + 1)); |
|
2269 |
|
2270 rc = internal_dfa_exec( |
|
2271 md, /* fixed match data */ |
|
2272 start_code + GET(code, 1), /* this subexpression's code */ |
|
2273 ptr, /* where we currently are */ |
|
2274 ptr - start_subject, /* start offset */ |
|
2275 local_offsets, /* offset vector */ |
|
2276 sizeof(local_offsets)/sizeof(int), /* size of same */ |
|
2277 local_workspace, /* workspace vector */ |
|
2278 sizeof(local_workspace)/sizeof(int), /* size of same */ |
|
2279 ims, /* the current ims flags */ |
|
2280 rlevel, /* function recursion level */ |
|
2281 recursing + 1); /* regex recurse level */ |
|
2282 |
|
2283 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP, |
|
2284 recursing + 1, rc)); |
|
2285 |
|
2286 /* Ran out of internal offsets */ |
|
2287 |
|
2288 if (rc == 0) return PCRE_ERROR_DFA_RECURSE; |
|
2289 |
|
2290 /* For each successful matched substring, set up the next state with a |
|
2291 count of characters to skip before trying it. Note that the count is in |
|
2292 characters, not bytes. */ |
|
2293 |
|
2294 if (rc > 0) |
|
2295 { |
|
2296 for (rc = rc*2 - 2; rc >= 0; rc -= 2) |
|
2297 { |
|
2298 const uschar *p = start_subject + local_offsets[rc]; |
|
2299 const uschar *pp = start_subject + local_offsets[rc+1]; |
|
2300 int charcount = local_offsets[rc+1] - local_offsets[rc]; |
|
2301 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--; |
|
2302 if (charcount > 0) |
|
2303 { |
|
2304 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1)); |
|
2305 } |
|
2306 else |
|
2307 { |
|
2308 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0); |
|
2309 } |
|
2310 } |
|
2311 } |
|
2312 else if (rc != PCRE_ERROR_NOMATCH) return rc; |
|
2313 } |
|
2314 break; |
|
2315 |
|
2316 /*-----------------------------------------------------------------*/ |
|
2317 case OP_ONCE: |
|
2318 { |
|
2319 int local_offsets[2]; |
|
2320 int local_workspace[1000]; |
|
2321 |
|
2322 int rc = internal_dfa_exec( |
|
2323 md, /* fixed match data */ |
|
2324 code, /* this subexpression's code */ |
|
2325 ptr, /* where we currently are */ |
|
2326 ptr - start_subject, /* start offset */ |
|
2327 local_offsets, /* offset vector */ |
|
2328 sizeof(local_offsets)/sizeof(int), /* size of same */ |
|
2329 local_workspace, /* workspace vector */ |
|
2330 sizeof(local_workspace)/sizeof(int), /* size of same */ |
|
2331 ims, /* the current ims flags */ |
|
2332 rlevel, /* function recursion level */ |
|
2333 recursing); /* pass on regex recursion */ |
|
2334 |
|
2335 if (rc >= 0) |
|
2336 { |
|
2337 const uschar *end_subpattern = code; |
|
2338 int charcount = local_offsets[1] - local_offsets[0]; |
|
2339 int next_state_offset, repeat_state_offset; |
|
2340 |
|
2341 do { end_subpattern += GET(end_subpattern, 1); } |
|
2342 while (*end_subpattern == OP_ALT); |
|
2343 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1; |
|
2344 |
|
2345 /* If the end of this subpattern is KETRMAX or KETRMIN, we must |
|
2346 arrange for the repeat state also to be added to the relevant list. |
|
2347 Calculate the offset, or set -1 for no repeat. */ |
|
2348 |
|
2349 repeat_state_offset = (*end_subpattern == OP_KETRMAX || |
|
2350 *end_subpattern == OP_KETRMIN)? |
|
2351 end_subpattern - start_code - GET(end_subpattern, 1) : -1; |
|
2352 |
|
2353 /* If we have matched an empty string, add the next state at the |
|
2354 current character pointer. This is important so that the duplicate |
|
2355 checking kicks in, which is what breaks infinite loops that match an |
|
2356 empty string. */ |
|
2357 |
|
2358 if (charcount == 0) |
|
2359 { |
|
2360 ADD_ACTIVE(next_state_offset, 0); |
|
2361 } |
|
2362 |
|
2363 /* Optimization: if there are no more active states, and there |
|
2364 are no new states yet set up, then skip over the subject string |
|
2365 right here, to save looping. Otherwise, set up the new state to swing |
|
2366 into action when the end of the substring is reached. */ |
|
2367 |
|
2368 else if (i + 1 >= active_count && new_count == 0) |
|
2369 { |
|
2370 ptr += charcount; |
|
2371 clen = 0; |
|
2372 ADD_NEW(next_state_offset, 0); |
|
2373 |
|
2374 /* If we are adding a repeat state at the new character position, |
|
2375 we must fudge things so that it is the only current state. |
|
2376 Otherwise, it might be a duplicate of one we processed before, and |
|
2377 that would cause it to be skipped. */ |
|
2378 |
|
2379 if (repeat_state_offset >= 0) |
|
2380 { |
|
2381 next_active_state = active_states; |
|
2382 active_count = 0; |
|
2383 i = -1; |
|
2384 ADD_ACTIVE(repeat_state_offset, 0); |
|
2385 } |
|
2386 } |
|
2387 else |
|
2388 { |
|
2389 const uschar *p = start_subject + local_offsets[0]; |
|
2390 const uschar *pp = start_subject + local_offsets[1]; |
|
2391 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--; |
|
2392 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); |
|
2393 if (repeat_state_offset >= 0) |
|
2394 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); } |
|
2395 } |
|
2396 |
|
2397 } |
|
2398 else if (rc != PCRE_ERROR_NOMATCH) return rc; |
|
2399 } |
|
2400 break; |
|
2401 |
|
2402 |
|
2403 /* ========================================================================== */ |
|
2404 /* Handle callouts */ |
|
2405 |
|
2406 case OP_CALLOUT: |
|
2407 if (pcre_callout != NULL) |
|
2408 { |
|
2409 int rrc; |
|
2410 pcre_callout_block cb; |
|
2411 cb.version = 1; /* Version 1 of the callout block */ |
|
2412 cb.callout_number = code[1]; |
|
2413 cb.offset_vector = offsets; |
|
2414 cb.subject = (PCRE_SPTR)start_subject; |
|
2415 cb.subject_length = end_subject - start_subject; |
|
2416 cb.start_match = current_subject - start_subject; |
|
2417 cb.current_position = ptr - start_subject; |
|
2418 cb.pattern_position = GET(code, 2); |
|
2419 cb.next_item_length = GET(code, 2 + LINK_SIZE); |
|
2420 cb.capture_top = 1; |
|
2421 cb.capture_last = -1; |
|
2422 cb.callout_data = md->callout_data; |
|
2423 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */ |
|
2424 if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); } |
|
2425 } |
|
2426 break; |
|
2427 |
|
2428 |
|
2429 /* ========================================================================== */ |
|
2430 default: /* Unsupported opcode */ |
|
2431 return PCRE_ERROR_DFA_UITEM; |
|
2432 } |
|
2433 |
|
2434 NEXT_ACTIVE_STATE: continue; |
|
2435 |
|
2436 } /* End of loop scanning active states */ |
|
2437 |
|
2438 /* We have finished the processing at the current subject character. If no |
|
2439 new states have been set for the next character, we have found all the |
|
2440 matches that we are going to find. If we are at the top level and partial |
|
2441 matching has been requested, check for appropriate conditions. */ |
|
2442 |
|
2443 if (new_count <= 0) |
|
2444 { |
|
2445 if (match_count < 0 && /* No matches found */ |
|
2446 rlevel == 1 && /* Top level match function */ |
|
2447 (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */ |
|
2448 ptr >= end_subject && /* Reached end of subject */ |
|
2449 ptr > current_subject) /* Matched non-empty string */ |
|
2450 { |
|
2451 if (offsetcount >= 2) |
|
2452 { |
|
2453 offsets[0] = current_subject - start_subject; |
|
2454 offsets[1] = end_subject - start_subject; |
|
2455 } |
|
2456 match_count = PCRE_ERROR_PARTIAL; |
|
2457 } |
|
2458 |
|
2459 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" |
|
2460 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, |
|
2461 rlevel*2-2, SP)); |
|
2462 break; /* In effect, "return", but see the comment below */ |
|
2463 } |
|
2464 |
|
2465 /* One or more states are active for the next character. */ |
|
2466 |
|
2467 ptr += clen; /* Advance to next subject character */ |
|
2468 } /* Loop to move along the subject string */ |
|
2469 |
|
2470 /* Control gets here from "break" a few lines above. We do it this way because |
|
2471 if we use "return" above, we have compiler trouble. Some compilers warn if |
|
2472 there's nothing here because they think the function doesn't return a value. On |
|
2473 the other hand, if we put a dummy statement here, some more clever compilers |
|
2474 complain that it can't be reached. Sigh. */ |
|
2475 |
|
2476 return match_count; |
|
2477 } |
|
2478 |
|
2479 |
|
2480 |
|
2481 |
|
2482 /************************************************* |
|
2483 * Execute a Regular Expression - DFA engine * |
|
2484 *************************************************/ |
|
2485 |
|
2486 /* This external function applies a compiled re to a subject string using a DFA |
|
2487 engine. This function calls the internal function multiple times if the pattern |
|
2488 is not anchored. |
|
2489 |
|
2490 Arguments: |
|
2491 argument_re points to the compiled expression |
|
2492 extra_data points to extra data or is NULL |
|
2493 subject points to the subject string |
|
2494 length length of subject string (may contain binary zeros) |
|
2495 start_offset where to start in the subject string |
|
2496 options option bits |
|
2497 offsets vector of match offsets |
|
2498 offsetcount size of same |
|
2499 workspace workspace vector |
|
2500 wscount size of same |
|
2501 |
|
2502 Returns: > 0 => number of match offset pairs placed in offsets |
|
2503 = 0 => offsets overflowed; longest matches are present |
|
2504 -1 => failed to match |
|
2505 < -1 => some kind of unexpected problem |
|
2506 */ |
|
2507 |
|
2508 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION |
|
2509 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, |
|
2510 const char *subject, int length, int start_offset, int options, int *offsets, |
|
2511 int offsetcount, int *workspace, int wscount) |
|
2512 { |
|
2513 real_pcre *re = (real_pcre *)argument_re; |
|
2514 dfa_match_data match_block; |
|
2515 dfa_match_data *md = &match_block; |
|
2516 BOOL utf8, anchored, startline, firstline; |
|
2517 const uschar *current_subject, *end_subject, *lcc; |
|
2518 |
|
2519 pcre_study_data internal_study; |
|
2520 const pcre_study_data *study = NULL; |
|
2521 real_pcre internal_re; |
|
2522 |
|
2523 const uschar *req_byte_ptr; |
|
2524 const uschar *start_bits = NULL; |
|
2525 BOOL first_byte_caseless = FALSE; |
|
2526 BOOL req_byte_caseless = FALSE; |
|
2527 int first_byte = -1; |
|
2528 int req_byte = -1; |
|
2529 int req_byte2 = -1; |
|
2530 int newline; |
|
2531 |
|
2532 /* Plausibility checks */ |
|
2533 |
|
2534 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; |
|
2535 if (re == NULL || subject == NULL || workspace == NULL || |
|
2536 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; |
|
2537 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; |
|
2538 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE; |
|
2539 |
|
2540 /* We need to find the pointer to any study data before we test for byte |
|
2541 flipping, so we scan the extra_data block first. This may set two fields in the |
|
2542 match block, so we must initialize them beforehand. However, the other fields |
|
2543 in the match block must not be set until after the byte flipping. */ |
|
2544 |
|
2545 md->tables = re->tables; |
|
2546 md->callout_data = NULL; |
|
2547 |
|
2548 if (extra_data != NULL) |
|
2549 { |
|
2550 unsigned int flags = extra_data->flags; |
|
2551 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) |
|
2552 study = (const pcre_study_data *)extra_data->study_data; |
|
2553 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT; |
|
2554 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) |
|
2555 return PCRE_ERROR_DFA_UMLIMIT; |
|
2556 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) |
|
2557 md->callout_data = extra_data->callout_data; |
|
2558 if ((flags & PCRE_EXTRA_TABLES) != 0) |
|
2559 md->tables = extra_data->tables; |
|
2560 } |
|
2561 |
|
2562 /* Check that the first field in the block is the magic number. If it is not, |
|
2563 test for a regex that was compiled on a host of opposite endianness. If this is |
|
2564 the case, flipped values are put in internal_re and internal_study if there was |
|
2565 study data too. */ |
|
2566 |
|
2567 if (re->magic_number != MAGIC_NUMBER) |
|
2568 { |
|
2569 re = _pcre_try_flipped(re, &internal_re, study, &internal_study); |
|
2570 if (re == NULL) return PCRE_ERROR_BADMAGIC; |
|
2571 if (study != NULL) study = &internal_study; |
|
2572 } |
|
2573 |
|
2574 /* Set some local values */ |
|
2575 |
|
2576 current_subject = (const unsigned char *)subject + start_offset; |
|
2577 end_subject = (const unsigned char *)subject + length; |
|
2578 req_byte_ptr = current_subject - 1; |
|
2579 |
|
2580 #ifdef SUPPORT_UTF8 |
|
2581 utf8 = (re->options & PCRE_UTF8) != 0; |
|
2582 #else |
|
2583 utf8 = FALSE; |
|
2584 #endif |
|
2585 |
|
2586 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 || |
|
2587 (re->options & PCRE_ANCHORED) != 0; |
|
2588 |
|
2589 /* The remaining fixed data for passing around. */ |
|
2590 |
|
2591 md->start_code = (const uschar *)argument_re + |
|
2592 re->name_table_offset + re->name_count * re->name_entry_size; |
|
2593 md->start_subject = (const unsigned char *)subject; |
|
2594 md->end_subject = end_subject; |
|
2595 md->moptions = options; |
|
2596 md->poptions = re->options; |
|
2597 |
|
2598 /* If the BSR option is not set at match time, copy what was set |
|
2599 at compile time. */ |
|
2600 |
|
2601 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0) |
|
2602 { |
|
2603 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0) |
|
2604 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE); |
|
2605 #ifdef BSR_ANYCRLF |
|
2606 else md->moptions |= PCRE_BSR_ANYCRLF; |
|
2607 #endif |
|
2608 } |
|
2609 |
|
2610 /* Handle different types of newline. The three bits give eight cases. If |
|
2611 nothing is set at run time, whatever was used at compile time applies. */ |
|
2612 |
|
2613 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) & |
|
2614 PCRE_NEWLINE_BITS) |
|
2615 { |
|
2616 case 0: newline = NEWLINE; break; /* Compile-time default */ |
|
2617 case PCRE_NEWLINE_CR: newline = '\r'; break; |
|
2618 case PCRE_NEWLINE_LF: newline = '\n'; break; |
|
2619 case PCRE_NEWLINE_CR+ |
|
2620 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; |
|
2621 case PCRE_NEWLINE_ANY: newline = -1; break; |
|
2622 case PCRE_NEWLINE_ANYCRLF: newline = -2; break; |
|
2623 default: return PCRE_ERROR_BADNEWLINE; |
|
2624 } |
|
2625 |
|
2626 if (newline == -2) |
|
2627 { |
|
2628 md->nltype = NLTYPE_ANYCRLF; |
|
2629 } |
|
2630 else if (newline < 0) |
|
2631 { |
|
2632 md->nltype = NLTYPE_ANY; |
|
2633 } |
|
2634 else |
|
2635 { |
|
2636 md->nltype = NLTYPE_FIXED; |
|
2637 if (newline > 255) |
|
2638 { |
|
2639 md->nllen = 2; |
|
2640 md->nl[0] = (newline >> 8) & 255; |
|
2641 md->nl[1] = newline & 255; |
|
2642 } |
|
2643 else |
|
2644 { |
|
2645 md->nllen = 1; |
|
2646 md->nl[0] = newline; |
|
2647 } |
|
2648 } |
|
2649 |
|
2650 /* Check a UTF-8 string if required. Unfortunately there's no way of passing |
|
2651 back the character offset. */ |
|
2652 |
|
2653 #ifdef SUPPORT_UTF8 |
|
2654 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) |
|
2655 { |
|
2656 if (_pcre_valid_utf8((uschar *)subject, length) >= 0) |
|
2657 return PCRE_ERROR_BADUTF8; |
|
2658 if (start_offset > 0 && start_offset < length) |
|
2659 { |
|
2660 int tb = ((uschar *)subject)[start_offset]; |
|
2661 if (tb > 127) |
|
2662 { |
|
2663 tb &= 0xc0; |
|
2664 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET; |
|
2665 } |
|
2666 } |
|
2667 } |
|
2668 #endif |
|
2669 |
|
2670 /* If the exec call supplied NULL for tables, use the inbuilt ones. This |
|
2671 is a feature that makes it possible to save compiled regexes and re-use them |
|
2672 in other programs later. */ |
|
2673 |
|
2674 if (md->tables == NULL) md->tables = _pcre_default_tables; |
|
2675 |
|
2676 /* The lower casing table and the "must be at the start of a line" flag are |
|
2677 used in a loop when finding where to start. */ |
|
2678 |
|
2679 lcc = md->tables + lcc_offset; |
|
2680 startline = (re->flags & PCRE_STARTLINE) != 0; |
|
2681 firstline = (re->options & PCRE_FIRSTLINE) != 0; |
|
2682 |
|
2683 /* Set up the first character to match, if available. The first_byte value is |
|
2684 never set for an anchored regular expression, but the anchoring may be forced |
|
2685 at run time, so we have to test for anchoring. The first char may be unset for |
|
2686 an unanchored pattern, of course. If there's no first char and the pattern was |
|
2687 studied, there may be a bitmap of possible first characters. */ |
|
2688 |
|
2689 if (!anchored) |
|
2690 { |
|
2691 if ((re->flags & PCRE_FIRSTSET) != 0) |
|
2692 { |
|
2693 first_byte = re->first_byte & 255; |
|
2694 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE) |
|
2695 first_byte = lcc[first_byte]; |
|
2696 } |
|
2697 else |
|
2698 { |
|
2699 if (startline && study != NULL && |
|
2700 (study->options & PCRE_STUDY_MAPPED) != 0) |
|
2701 start_bits = study->start_bits; |
|
2702 } |
|
2703 } |
|
2704 |
|
2705 /* For anchored or unanchored matches, there may be a "last known required |
|
2706 character" set. */ |
|
2707 |
|
2708 if ((re->flags & PCRE_REQCHSET) != 0) |
|
2709 { |
|
2710 req_byte = re->req_byte & 255; |
|
2711 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; |
|
2712 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */ |
|
2713 } |
|
2714 |
|
2715 /* Call the main matching function, looping for a non-anchored regex after a |
|
2716 failed match. Unless restarting, optimize by moving to the first match |
|
2717 character if possible, when not anchored. Then unless wanting a partial match, |
|
2718 check for a required later character. */ |
|
2719 |
|
2720 for (;;) |
|
2721 { |
|
2722 int rc; |
|
2723 |
|
2724 if ((options & PCRE_DFA_RESTART) == 0) |
|
2725 { |
|
2726 const uschar *save_end_subject = end_subject; |
|
2727 |
|
2728 /* Advance to a unique first char if possible. If firstline is TRUE, the |
|
2729 start of the match is constrained to the first line of a multiline string. |
|
2730 Implement this by temporarily adjusting end_subject so that we stop |
|
2731 scanning at a newline. If the match fails at the newline, later code breaks |
|
2732 this loop. */ |
|
2733 |
|
2734 if (firstline) |
|
2735 { |
|
2736 USPTR t = current_subject; |
|
2737 #ifdef SUPPORT_UTF8 |
|
2738 if (utf8) |
|
2739 { |
|
2740 while (t < md->end_subject && !IS_NEWLINE(t)) |
|
2741 { |
|
2742 t++; |
|
2743 while (t < end_subject && (*t & 0xc0) == 0x80) t++; |
|
2744 } |
|
2745 } |
|
2746 else |
|
2747 #endif |
|
2748 while (t < md->end_subject && !IS_NEWLINE(t)) t++; |
|
2749 end_subject = t; |
|
2750 } |
|
2751 |
|
2752 if (first_byte >= 0) |
|
2753 { |
|
2754 if (first_byte_caseless) |
|
2755 while (current_subject < end_subject && |
|
2756 lcc[*current_subject] != first_byte) |
|
2757 current_subject++; |
|
2758 else |
|
2759 while (current_subject < end_subject && *current_subject != first_byte) |
|
2760 current_subject++; |
|
2761 } |
|
2762 |
|
2763 /* Or to just after a linebreak for a multiline match if possible */ |
|
2764 |
|
2765 else if (startline) |
|
2766 { |
|
2767 if (current_subject > md->start_subject + start_offset) |
|
2768 { |
|
2769 #ifdef SUPPORT_UTF8 |
|
2770 if (utf8) |
|
2771 { |
|
2772 while (current_subject < end_subject && !WAS_NEWLINE(current_subject)) |
|
2773 { |
|
2774 current_subject++; |
|
2775 while(current_subject < end_subject && |
|
2776 (*current_subject & 0xc0) == 0x80) |
|
2777 current_subject++; |
|
2778 } |
|
2779 } |
|
2780 else |
|
2781 #endif |
|
2782 while (current_subject < end_subject && !WAS_NEWLINE(current_subject)) |
|
2783 current_subject++; |
|
2784 |
|
2785 /* If we have just passed a CR and the newline option is ANY or |
|
2786 ANYCRLF, and we are now at a LF, advance the match position by one more |
|
2787 character. */ |
|
2788 |
|
2789 if (current_subject[-1] == '\r' && |
|
2790 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && |
|
2791 current_subject < end_subject && |
|
2792 *current_subject == '\n') |
|
2793 current_subject++; |
|
2794 } |
|
2795 } |
|
2796 |
|
2797 /* Or to a non-unique first char after study */ |
|
2798 |
|
2799 else if (start_bits != NULL) |
|
2800 { |
|
2801 while (current_subject < end_subject) |
|
2802 { |
|
2803 register unsigned int c = *current_subject; |
|
2804 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++; |
|
2805 else break; |
|
2806 } |
|
2807 } |
|
2808 |
|
2809 /* Restore fudged end_subject */ |
|
2810 |
|
2811 end_subject = save_end_subject; |
|
2812 } |
|
2813 |
|
2814 /* If req_byte is set, we know that that character must appear in the subject |
|
2815 for the match to succeed. If the first character is set, req_byte must be |
|
2816 later in the subject; otherwise the test starts at the match point. This |
|
2817 optimization can save a huge amount of work in patterns with nested unlimited |
|
2818 repeats that aren't going to match. Writing separate code for cased/caseless |
|
2819 versions makes it go faster, as does using an autoincrement and backing off |
|
2820 on a match. |
|
2821 |
|
2822 HOWEVER: when the subject string is very, very long, searching to its end can |
|
2823 take a long time, and give bad performance on quite ordinary patterns. This |
|
2824 showed up when somebody was matching /^C/ on a 32-megabyte string... so we |
|
2825 don't do this when the string is sufficiently long. |
|
2826 |
|
2827 ALSO: this processing is disabled when partial matching is requested. |
|
2828 */ |
|
2829 |
|
2830 if (req_byte >= 0 && |
|
2831 end_subject - current_subject < REQ_BYTE_MAX && |
|
2832 (options & PCRE_PARTIAL) == 0) |
|
2833 { |
|
2834 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0); |
|
2835 |
|
2836 /* We don't need to repeat the search if we haven't yet reached the |
|
2837 place we found it at last time. */ |
|
2838 |
|
2839 if (p > req_byte_ptr) |
|
2840 { |
|
2841 if (req_byte_caseless) |
|
2842 { |
|
2843 while (p < end_subject) |
|
2844 { |
|
2845 register int pp = *p++; |
|
2846 if (pp == req_byte || pp == req_byte2) { p--; break; } |
|
2847 } |
|
2848 } |
|
2849 else |
|
2850 { |
|
2851 while (p < end_subject) |
|
2852 { |
|
2853 if (*p++ == req_byte) { p--; break; } |
|
2854 } |
|
2855 } |
|
2856 |
|
2857 /* If we can't find the required character, break the matching loop, |
|
2858 which will cause a return of PCRE_ERROR_NOMATCH. */
|
2859 |
|
2860 if (p >= end_subject) break; |
|
2861 |
|
2862 /* If we have found the required character, save the point where we |
|
2863 found it, so that we don't search again next time round the loop if |
|
2864 the start hasn't passed this character yet. */ |
|
2865 |
|
2866 req_byte_ptr = p; |
|
2867 } |
|
2868 } |
|
2869 |
|
2870 /* OK, now we can do the business */ |
|
2871 |
|
2872 rc = internal_dfa_exec( |
|
2873 md, /* fixed match data */ |
|
2874 md->start_code, /* this subexpression's code */ |
|
2875 current_subject, /* where we currently are */ |
|
2876 start_offset, /* start offset in subject */ |
|
2877 offsets, /* offset vector */ |
|
2878 offsetcount, /* size of same */ |
|
2879 workspace, /* workspace vector */ |
|
2880 wscount, /* size of same */ |
|
2881 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */ |
|
2882 0, /* function recurse level */ |
|
2883 0); /* regex recurse level */ |
|
2884 |
|
2885 /* Anything other than "no match" means we are done, always; otherwise, carry |
|
2886 on only if not anchored. */ |
|
2887 |
|
2888 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc; |
|
2889 |
|
2890 /* Advance to the next subject character unless we are at the end of a line |
|
2891 and firstline is set. */ |
|
2892 |
|
2893 if (firstline && IS_NEWLINE(current_subject)) break; |
|
2894 current_subject++; |
|
2895 if (utf8) |
|
2896 { |
|
2897 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80) |
|
2898 current_subject++; |
|
2899 } |
|
2900 if (current_subject > end_subject) break; |
|
2901 |
|
2902 /* If we have just passed a CR and we are now at a LF, and the pattern does |
|
2903 not contain any explicit matches for \r or \n, and the newline option is CRLF |
|
2904 or ANY or ANYCRLF, advance the match position by one more character. */ |
|
2905 |
|
2906 if (current_subject[-1] == '\r' && |
|
2907 current_subject < end_subject && |
|
2908 *current_subject == '\n' && |
|
2909 (re->flags & PCRE_HASCRORLF) == 0 && |
|
2910 (md->nltype == NLTYPE_ANY || |
|
2911 md->nltype == NLTYPE_ANYCRLF || |
|
2912 md->nllen == 2)) |
|
2913 current_subject++; |
|
2914 |
|
2915 } /* "Bumpalong" loop */ |
|
2916 |
|
2917 return PCRE_ERROR_NOMATCH; |
|
2918 } |
|
2919 |
|
2920 /* End of pcre_dfa_exec.c */ |