|
1 /************************************************* |
|
2 * Perl-Compatible Regular Expressions * |
|
3 *************************************************/ |
|
4 |
|
5 /* PCRE is a library of functions to support regular expressions whose syntax |
|
6 and semantics are as close as possible to those of the Perl 5 language. |
|
7 |
|
8 Written by Philip Hazel |
|
9 Copyright (c) 1997-2005 University of Cambridge |
|
10 |
|
11 ----------------------------------------------------------------------------- |
|
12 Redistribution and use in source and binary forms, with or without |
|
13 modification, are permitted provided that the following conditions are met: |
|
14 |
|
15 * Redistributions of source code must retain the above copyright notice, |
|
16 this list of conditions and the following disclaimer. |
|
17 |
|
18 * Redistributions in binary form must reproduce the above copyright |
|
19 notice, this list of conditions and the following disclaimer in the |
|
20 documentation and/or other materials provided with the distribution. |
|
21 |
|
22 * Neither the name of the University of Cambridge nor the names of its |
|
23 contributors may be used to endorse or promote products derived from |
|
24 this software without specific prior written permission. |
|
25 |
|
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
|
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|
36 POSSIBILITY OF SUCH DAMAGE. |
|
37 ----------------------------------------------------------------------------- |
|
38 */ |
|
39 |
|
40 |
|
41 /* This module contains the external function pcre_study(), along with local |
|
42 supporting functions. */ |
|
43 |
|
44 |
|
45 #include "pcre_internal.h" |
|
46 |
|
47 |
|
48 /************************************************* |
|
49 * Set a bit and maybe its alternate case * |
|
50 *************************************************/ |
|
51 |
|
52 /* Given a character, set its bit in the table, and also the bit for the other |
|
53 version of a letter if we are caseless. |
|
54 |
|
55 Arguments: |
|
56 start_bits points to the bit map |
|
57 c is the character |
|
58 caseless the caseless flag |
|
59 cd the block with char table pointers |
|
60 |
|
61 Returns: nothing |
|
62 */ |
|
63 |
|
64 static void |
|
65 set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd) |
|
66 { |
|
67 start_bits[c/8] |= (1 << (c&7)); |
|
68 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) |
|
69 start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7)); |
|
70 } |
|
71 |
|
72 |
|
73 |
|
74 /************************************************* |
|
75 * Create bitmap of starting chars * |
|
76 *************************************************/ |
|
77 |
|
78 /* This function scans a compiled unanchored expression and attempts to build a |
|
79 bitmap of the set of initial characters. If it can't, it returns FALSE. As time |
|
80 goes by, we may be able to get more clever at doing this. |
|
81 |
|
82 Arguments: |
|
83 code points to an expression |
|
84 start_bits points to a 32-byte table, initialized to 0 |
|
85 caseless the current state of the caseless flag |
|
86 utf8 TRUE if in UTF-8 mode |
|
87 cd the block with char table pointers |
|
88 |
|
89 Returns: TRUE if table built, FALSE otherwise |
|
90 */ |
|
91 |
|
92 static BOOL |
|
93 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless, |
|
94 BOOL utf8, compile_data *cd) |
|
95 { |
|
96 register int c; |
|
97 |
|
98 /* This next statement and the later reference to dummy are here in order to |
|
99 trick the optimizer of the IBM C compiler for OS/2 into generating correct |
|
100 code. Apparently IBM isn't going to fix the problem, and we would rather not |
|
101 disable optimization (in this module it actually makes a big difference, and |
|
102 the pcre module can use all the optimization it can get). */ |
|
103 |
|
104 volatile int dummy; |
|
105 |
|
106 do |
|
107 { |
|
108 const uschar *tcode = code + 1 + LINK_SIZE; |
|
109 BOOL try_next = TRUE; |
|
110 |
|
111 while (try_next) |
|
112 { |
|
113 /* If a branch starts with a bracket or a positive lookahead assertion, |
|
114 recurse to set bits from within them. That's all for this branch. */ |
|
115 |
|
116 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT) |
|
117 { |
|
118 if (!set_start_bits(tcode, start_bits, caseless, utf8, cd)) |
|
119 return FALSE; |
|
120 try_next = FALSE; |
|
121 } |
|
122 |
|
123 else switch(*tcode) |
|
124 { |
|
125 default: |
|
126 return FALSE; |
|
127 |
|
128 /* Skip over callout */ |
|
129 |
|
130 case OP_CALLOUT: |
|
131 tcode += 2 + 2*LINK_SIZE; |
|
132 break; |
|
133 |
|
134 /* Skip over extended extraction bracket number */ |
|
135 |
|
136 case OP_BRANUMBER: |
|
137 tcode += 3; |
|
138 break; |
|
139 |
|
140 /* Skip over lookbehind and negative lookahead assertions */ |
|
141 |
|
142 case OP_ASSERT_NOT: |
|
143 case OP_ASSERTBACK: |
|
144 case OP_ASSERTBACK_NOT: |
|
145 do tcode += GET(tcode, 1); while (*tcode == OP_ALT); |
|
146 tcode += 1+LINK_SIZE; |
|
147 break; |
|
148 |
|
149 /* Skip over an option setting, changing the caseless flag */ |
|
150 |
|
151 case OP_OPT: |
|
152 caseless = (tcode[1] & PCRE_CASELESS) != 0; |
|
153 tcode += 2; |
|
154 break; |
|
155 |
|
156 /* BRAZERO does the bracket, but carries on. */ |
|
157 |
|
158 case OP_BRAZERO: |
|
159 case OP_BRAMINZERO: |
|
160 if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd)) |
|
161 return FALSE; |
|
162 dummy = 1; |
|
163 do tcode += GET(tcode,1); while (*tcode == OP_ALT); |
|
164 tcode += 1+LINK_SIZE; |
|
165 break; |
|
166 |
|
167 /* Single-char * or ? sets the bit and tries the next item */ |
|
168 |
|
169 case OP_STAR: |
|
170 case OP_MINSTAR: |
|
171 case OP_QUERY: |
|
172 case OP_MINQUERY: |
|
173 set_bit(start_bits, tcode[1], caseless, cd); |
|
174 tcode += 2; |
|
175 #ifdef SUPPORT_UTF8 |
|
176 if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++; |
|
177 #endif |
|
178 break; |
|
179 |
|
180 /* Single-char upto sets the bit and tries the next */ |
|
181 |
|
182 case OP_UPTO: |
|
183 case OP_MINUPTO: |
|
184 set_bit(start_bits, tcode[3], caseless, cd); |
|
185 tcode += 4; |
|
186 #ifdef SUPPORT_UTF8 |
|
187 if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++; |
|
188 #endif |
|
189 break; |
|
190 |
|
191 /* At least one single char sets the bit and stops */ |
|
192 |
|
193 case OP_EXACT: /* Fall through */ |
|
194 tcode += 2; |
|
195 |
|
196 case OP_CHAR: |
|
197 case OP_CHARNC: |
|
198 case OP_PLUS: |
|
199 case OP_MINPLUS: |
|
200 set_bit(start_bits, tcode[1], caseless, cd); |
|
201 try_next = FALSE; |
|
202 break; |
|
203 |
|
204 /* Single character type sets the bits and stops */ |
|
205 |
|
206 case OP_NOT_DIGIT: |
|
207 for (c = 0; c < 32; c++) |
|
208 start_bits[c] |= ~cd->cbits[c+cbit_digit]; |
|
209 try_next = FALSE; |
|
210 break; |
|
211 |
|
212 case OP_DIGIT: |
|
213 for (c = 0; c < 32; c++) |
|
214 start_bits[c] |= cd->cbits[c+cbit_digit]; |
|
215 try_next = FALSE; |
|
216 break; |
|
217 |
|
218 case OP_NOT_WHITESPACE: |
|
219 for (c = 0; c < 32; c++) |
|
220 start_bits[c] |= ~cd->cbits[c+cbit_space]; |
|
221 try_next = FALSE; |
|
222 break; |
|
223 |
|
224 case OP_WHITESPACE: |
|
225 for (c = 0; c < 32; c++) |
|
226 start_bits[c] |= cd->cbits[c+cbit_space]; |
|
227 try_next = FALSE; |
|
228 break; |
|
229 |
|
230 case OP_NOT_WORDCHAR: |
|
231 for (c = 0; c < 32; c++) |
|
232 start_bits[c] |= ~cd->cbits[c+cbit_word]; |
|
233 try_next = FALSE; |
|
234 break; |
|
235 |
|
236 case OP_WORDCHAR: |
|
237 for (c = 0; c < 32; c++) |
|
238 start_bits[c] |= cd->cbits[c+cbit_word]; |
|
239 try_next = FALSE; |
|
240 break; |
|
241 |
|
242 /* One or more character type fudges the pointer and restarts, knowing |
|
243 it will hit a single character type and stop there. */ |
|
244 |
|
245 case OP_TYPEPLUS: |
|
246 case OP_TYPEMINPLUS: |
|
247 tcode++; |
|
248 break; |
|
249 |
|
250 case OP_TYPEEXACT: |
|
251 tcode += 3; |
|
252 break; |
|
253 |
|
254 /* Zero or more repeats of character types set the bits and then |
|
255 try again. */ |
|
256 |
|
257 case OP_TYPEUPTO: |
|
258 case OP_TYPEMINUPTO: |
|
259 tcode += 2; /* Fall through */ |
|
260 |
|
261 case OP_TYPESTAR: |
|
262 case OP_TYPEMINSTAR: |
|
263 case OP_TYPEQUERY: |
|
264 case OP_TYPEMINQUERY: |
|
265 switch(tcode[1]) |
|
266 { |
|
267 case OP_ANY: |
|
268 return FALSE; |
|
269 |
|
270 case OP_NOT_DIGIT: |
|
271 for (c = 0; c < 32; c++) |
|
272 start_bits[c] |= ~cd->cbits[c+cbit_digit]; |
|
273 break; |
|
274 |
|
275 case OP_DIGIT: |
|
276 for (c = 0; c < 32; c++) |
|
277 start_bits[c] |= cd->cbits[c+cbit_digit]; |
|
278 break; |
|
279 |
|
280 case OP_NOT_WHITESPACE: |
|
281 for (c = 0; c < 32; c++) |
|
282 start_bits[c] |= ~cd->cbits[c+cbit_space]; |
|
283 break; |
|
284 |
|
285 case OP_WHITESPACE: |
|
286 for (c = 0; c < 32; c++) |
|
287 start_bits[c] |= cd->cbits[c+cbit_space]; |
|
288 break; |
|
289 |
|
290 case OP_NOT_WORDCHAR: |
|
291 for (c = 0; c < 32; c++) |
|
292 start_bits[c] |= ~cd->cbits[c+cbit_word]; |
|
293 break; |
|
294 |
|
295 case OP_WORDCHAR: |
|
296 for (c = 0; c < 32; c++) |
|
297 start_bits[c] |= cd->cbits[c+cbit_word]; |
|
298 break; |
|
299 } |
|
300 |
|
301 tcode += 2; |
|
302 break; |
|
303 |
|
304 /* Character class where all the information is in a bit map: set the |
|
305 bits and either carry on or not, according to the repeat count. If it was |
|
306 a negative class, and we are operating with UTF-8 characters, any byte |
|
307 with a value >= 0xc4 is a potentially valid starter because it starts a |
|
308 character with a value > 255. */ |
|
309 |
|
310 case OP_NCLASS: |
|
311 if (utf8) |
|
312 { |
|
313 start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */ |
|
314 memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */ |
|
315 } |
|
316 /* Fall through */ |
|
317 |
|
318 case OP_CLASS: |
|
319 { |
|
320 tcode++; |
|
321 |
|
322 /* In UTF-8 mode, the bits in a bit map correspond to character |
|
323 values, not to byte values. However, the bit map we are constructing is |
|
324 for byte values. So we have to do a conversion for characters whose |
|
325 value is > 127. In fact, there are only two possible starting bytes for |
|
326 characters in the range 128 - 255. */ |
|
327 |
|
328 if (utf8) |
|
329 { |
|
330 for (c = 0; c < 16; c++) start_bits[c] |= tcode[c]; |
|
331 for (c = 128; c < 256; c++) |
|
332 { |
|
333 if ((tcode[c/8] && (1 << (c&7))) != 0) |
|
334 { |
|
335 int d = (c >> 6) | 0xc0; /* Set bit for this starter */ |
|
336 start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */ |
|
337 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */ |
|
338 } |
|
339 } |
|
340 } |
|
341 |
|
342 /* In non-UTF-8 mode, the two bit maps are completely compatible. */ |
|
343 |
|
344 else |
|
345 { |
|
346 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c]; |
|
347 } |
|
348 |
|
349 /* Advance past the bit map, and act on what follows */ |
|
350 |
|
351 tcode += 32; |
|
352 switch (*tcode) |
|
353 { |
|
354 case OP_CRSTAR: |
|
355 case OP_CRMINSTAR: |
|
356 case OP_CRQUERY: |
|
357 case OP_CRMINQUERY: |
|
358 tcode++; |
|
359 break; |
|
360 |
|
361 case OP_CRRANGE: |
|
362 case OP_CRMINRANGE: |
|
363 if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5; |
|
364 else try_next = FALSE; |
|
365 break; |
|
366 |
|
367 default: |
|
368 try_next = FALSE; |
|
369 break; |
|
370 } |
|
371 } |
|
372 break; /* End of bitmap class handling */ |
|
373 |
|
374 } /* End of switch */ |
|
375 } /* End of try_next loop */ |
|
376 |
|
377 code += GET(code, 1); /* Advance to next branch */ |
|
378 } |
|
379 while (*code == OP_ALT); |
|
380 return TRUE; |
|
381 } |
|
382 |
|
383 |
|
384 |
|
385 /************************************************* |
|
386 * Study a compiled expression * |
|
387 *************************************************/ |
|
388 |
|
389 /* This function is handed a compiled expression that it must study to produce |
|
390 information that will speed up the matching. It returns a pcre_extra block |
|
391 which then gets handed back to pcre_exec(). |
|
392 |
|
393 Arguments: |
|
394 re points to the compiled expression |
|
395 options contains option bits |
|
396 errorptr points to where to place error messages; |
|
397 set NULL unless error |
|
398 |
|
399 Returns: pointer to a pcre_extra block, with study_data filled in and the |
|
400 appropriate flag set; |
|
401 NULL on error or if no optimization possible |
|
402 */ |
|
403 |
|
404 PCRE_EXPORT pcre_extra * |
|
405 pcre_study(const pcre *external_re, int options, const char **errorptr) |
|
406 { |
|
407 uschar start_bits[32]; |
|
408 pcre_extra *extra; |
|
409 pcre_study_data *study; |
|
410 const uschar *tables; |
|
411 const real_pcre *re = (const real_pcre *)external_re; |
|
412 uschar *code; |
|
413 compile_data compile_block; |
|
414 |
|
415 *errorptr = NULL; |
|
416 |
|
417 if (re == NULL || re->magic_number != MAGIC_NUMBER) |
|
418 { |
|
419 *errorptr = "argument is not a compiled regular expression"; |
|
420 return NULL; |
|
421 } |
|
422 |
|
423 code = (uschar *)re + re->name_table_offset + |
|
424 (re->name_count * re->name_entry_size); |
|
425 |
|
426 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0) |
|
427 { |
|
428 *errorptr = "unknown or incorrect option bit(s) set"; |
|
429 return NULL; |
|
430 } |
|
431 |
|
432 /* For an anchored pattern, or an unanchored pattern that has a first char, or |
|
433 a multiline pattern that matches only at "line starts", no further processing |
|
434 at present. */ |
|
435 |
|
436 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0) |
|
437 return NULL; |
|
438 |
|
439 /* Set the character tables in the block that is passed around */ |
|
440 |
|
441 tables = re->tables; |
|
442 if (tables == NULL) |
|
443 (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, |
|
444 (void *)(&tables)); |
|
445 |
|
446 compile_block.lcc = tables + lcc_offset; |
|
447 compile_block.fcc = tables + fcc_offset; |
|
448 compile_block.cbits = tables + cbits_offset; |
|
449 compile_block.ctypes = tables + ctypes_offset; |
|
450 |
|
451 /* See if we can find a fixed set of initial characters for the pattern. */ |
|
452 |
|
453 memset(start_bits, 0, 32 * sizeof(uschar)); |
|
454 if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0, |
|
455 (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL; |
|
456 |
|
457 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in |
|
458 the latter, which is pointed to by the former, which may also get additional |
|
459 data set later by the calling program. At the moment, the size of |
|
460 pcre_study_data is fixed. We nevertheless save it in a field for returning via |
|
461 the pcre_fullinfo() function so that if it becomes variable in the future, we |
|
462 don't have to change that code. */ |
|
463 |
|
464 extra = (pcre_extra *)(pcre_malloc) |
|
465 (sizeof(pcre_extra) + sizeof(pcre_study_data)); |
|
466 |
|
467 if (extra == NULL) |
|
468 { |
|
469 *errorptr = "failed to get memory"; |
|
470 return NULL; |
|
471 } |
|
472 |
|
473 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra)); |
|
474 extra->flags = PCRE_EXTRA_STUDY_DATA; |
|
475 extra->study_data = study; |
|
476 |
|
477 study->size = sizeof(pcre_study_data); |
|
478 study->options = PCRE_STUDY_MAPPED; |
|
479 memcpy(study->start_bits, start_bits, sizeof(start_bits)); |
|
480 |
|
481 return extra; |
|
482 } |
|
483 |
|
484 /* End of pcre_study.c */ |