/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
           Copyright (c) 1997-2008 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/


/* This module contains a PCRE private debugging function for printing out the
internal form of a compiled regular expression, along with some supporting
local functions. This source file is used in two places:

(1) It is #included by pcre_compile.c when it is compiled in debugging mode
(DEBUG defined in pcre_internal.h). It is not included in production compiles.

(2) It is always #included by pcretest.c, which can be asked to print out a
compiled regex for debugging purposes. */


/* Macro that decides whether a character should be output as a literal or in
hexadecimal. We don't use isprint() because that can vary from system to system
(even without the use of locales) and we want the output always to be the same,
for testing purposes. This macro is used in pcretest as well as in this file.

The result is non-zero for values 32..126 inclusive. The argument is evaluated
twice, so it must not have side effects. */

#define PRINTABLE(c) ((c) >= 32 && (c) < 127)

59 /* The table of operator names. */ |
|
60 |
|
61 static const char *OP_names[] = { OP_NAME_LIST }; |
|
62 |
|
63 |
|
64 |
|
65 /************************************************* |
|
66 * Print single- or multi-byte character * |
|
67 *************************************************/ |
|
68 |
|
69 static int |
|
70 print_char(FILE *f, uschar *ptr, BOOL utf8) |
|
71 { |
|
72 int c = *ptr; |
|
73 |
|
74 #ifndef SUPPORT_UTF8 |
|
75 utf8 = utf8; /* Avoid compiler warning */ |
|
76 if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c); |
|
77 return 0; |
|
78 |
|
79 #else |
|
80 if (!utf8 || (c & 0xc0) != 0xc0) |
|
81 { |
|
82 if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c); |
|
83 return 0; |
|
84 } |
|
85 else |
|
86 { |
|
87 int i; |
|
88 int a = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ |
|
89 int s = 6*a; |
|
90 c = (c & _pcre_utf8_table3[a]) << s; |
|
91 for (i = 1; i <= a; i++) |
|
92 { |
|
93 /* This is a check for malformed UTF-8; it should only occur if the sanity |
|
94 check has been turned off. Rather than swallow random bytes, just stop if |
|
95 we hit a bad one. Print it with \X instead of \x as an indication. */ |
|
96 |
|
97 if ((ptr[i] & 0xc0) != 0x80) |
|
98 { |
|
99 fprintf(f, "\\X{%x}", c); |
|
100 return i - 1; |
|
101 } |
|
102 |
|
103 /* The byte is OK */ |
|
104 |
|
105 s -= 6; |
|
106 c |= (ptr[i] & 0x3f) << s; |
|
107 } |
|
108 if (c < 128) fprintf(f, "\\x%02x", c); else fprintf(f, "\\x{%x}", c); |
|
109 return a; |
|
110 } |
|
111 #endif |
|
112 } |
|
113 |
|
114 |
|
115 |
|
/*************************************************
*          Find Unicode property name            *
*************************************************/

/* Look up the name of a Unicode property.

Arguments:
  ptype    the property type
  pvalue   the property value

Returns:   a pointer to the name, or "??" if no entry of _pcre_utt matches or
           if SUPPORT_UCP is not defined

The table is searched from its last entry backwards, so when several entries
share the same type and value it is the last of them whose name is returned. */

static const char *
get_ucpname(int ptype, int pvalue)
{
#ifdef SUPPORT_UCP
  int i;
  for (i = _pcre_utt_size - 1; i >= 0; i--)
  {
    if (ptype == _pcre_utt[i].type && pvalue == _pcre_utt[i].value)
      return _pcre_utt_names + _pcre_utt[i].name_offset;
  }
  return "??";
#else
  /* The arguments are unused here; say so without doing arithmetic on them. */
  (void)ptype;
  (void)pvalue;
  return "??";
#endif
}



139 /************************************************* |
|
140 * Print compiled regex * |
|
141 *************************************************/ |
|
142 |
|
143 /* Make this function work for a regex with integers either byte order. |
|
144 However, we assume that what we are passed is a compiled regex. The |
|
145 print_lengths flag controls whether offsets and lengths of items are printed. |
|
146 They can be turned off from pcretest so that automatic tests on bytecode can be |
|
147 written that do not depend on the value of LINK_SIZE. */ |
|
148 |
|
149 static void |
|
150 pcre_printint(pcre *external_re, FILE *f, BOOL print_lengths) |
|
151 { |
|
152 real_pcre *re = (real_pcre *)external_re; |
|
153 uschar *codestart, *code; |
|
154 BOOL utf8; |
|
155 |
|
156 unsigned int options = re->options; |
|
157 int offset = re->name_table_offset; |
|
158 int count = re->name_count; |
|
159 int size = re->name_entry_size; |
|
160 |
|
161 if (re->magic_number != MAGIC_NUMBER) |
|
162 { |
|
163 offset = ((offset << 8) & 0xff00) | ((offset >> 8) & 0xff); |
|
164 count = ((count << 8) & 0xff00) | ((count >> 8) & 0xff); |
|
165 size = ((size << 8) & 0xff00) | ((size >> 8) & 0xff); |
|
166 options = ((options << 24) & 0xff000000) | |
|
167 ((options << 8) & 0x00ff0000) | |
|
168 ((options >> 8) & 0x0000ff00) | |
|
169 ((options >> 24) & 0x000000ff); |
|
170 } |
|
171 |
|
172 code = codestart = (uschar *)re + offset + count * size; |
|
173 utf8 = (options & PCRE_UTF8) != 0; |
|
174 |
|
175 for(;;) |
|
176 { |
|
177 uschar *ccode; |
|
178 int c; |
|
179 int extra = 0; |
|
180 |
|
181 if (print_lengths) |
|
182 fprintf(f, "%3d ", (int)(code - codestart)); |
|
183 else |
|
184 fprintf(f, " "); |
|
185 |
|
186 switch(*code) |
|
187 { |
|
188 case OP_END: |
|
189 fprintf(f, " %s\n", OP_names[*code]); |
|
190 fprintf(f, "------------------------------------------------------------------\n"); |
|
191 return; |
|
192 |
|
193 case OP_OPT: |
|
194 fprintf(f, " %.2x %s", code[1], OP_names[*code]); |
|
195 break; |
|
196 |
|
197 case OP_CHAR: |
|
198 fprintf(f, " "); |
|
199 do |
|
200 { |
|
201 code++; |
|
202 code += 1 + print_char(f, code, utf8); |
|
203 } |
|
204 while (*code == OP_CHAR); |
|
205 fprintf(f, "\n"); |
|
206 continue; |
|
207 |
|
208 case OP_CHARNC: |
|
209 fprintf(f, " NC "); |
|
210 do |
|
211 { |
|
212 code++; |
|
213 code += 1 + print_char(f, code, utf8); |
|
214 } |
|
215 while (*code == OP_CHARNC); |
|
216 fprintf(f, "\n"); |
|
217 continue; |
|
218 |
|
219 case OP_CBRA: |
|
220 case OP_SCBRA: |
|
221 if (print_lengths) fprintf(f, "%3d ", GET(code, 1)); |
|
222 else fprintf(f, " "); |
|
223 fprintf(f, "%s %d", OP_names[*code], GET2(code, 1+LINK_SIZE)); |
|
224 break; |
|
225 |
|
226 case OP_BRA: |
|
227 case OP_SBRA: |
|
228 case OP_KETRMAX: |
|
229 case OP_KETRMIN: |
|
230 case OP_ALT: |
|
231 case OP_KET: |
|
232 case OP_ASSERT: |
|
233 case OP_ASSERT_NOT: |
|
234 case OP_ASSERTBACK: |
|
235 case OP_ASSERTBACK_NOT: |
|
236 case OP_ONCE: |
|
237 case OP_COND: |
|
238 case OP_SCOND: |
|
239 case OP_REVERSE: |
|
240 if (print_lengths) fprintf(f, "%3d ", GET(code, 1)); |
|
241 else fprintf(f, " "); |
|
242 fprintf(f, "%s", OP_names[*code]); |
|
243 break; |
|
244 |
|
245 case OP_CREF: |
|
246 fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]); |
|
247 break; |
|
248 |
|
249 case OP_RREF: |
|
250 c = GET2(code, 1); |
|
251 if (c == RREF_ANY) |
|
252 fprintf(f, " Cond recurse any"); |
|
253 else |
|
254 fprintf(f, " Cond recurse %d", c); |
|
255 break; |
|
256 |
|
257 case OP_DEF: |
|
258 fprintf(f, " Cond def"); |
|
259 break; |
|
260 |
|
261 case OP_STAR: |
|
262 case OP_MINSTAR: |
|
263 case OP_POSSTAR: |
|
264 case OP_PLUS: |
|
265 case OP_MINPLUS: |
|
266 case OP_POSPLUS: |
|
267 case OP_QUERY: |
|
268 case OP_MINQUERY: |
|
269 case OP_POSQUERY: |
|
270 case OP_TYPESTAR: |
|
271 case OP_TYPEMINSTAR: |
|
272 case OP_TYPEPOSSTAR: |
|
273 case OP_TYPEPLUS: |
|
274 case OP_TYPEMINPLUS: |
|
275 case OP_TYPEPOSPLUS: |
|
276 case OP_TYPEQUERY: |
|
277 case OP_TYPEMINQUERY: |
|
278 case OP_TYPEPOSQUERY: |
|
279 fprintf(f, " "); |
|
280 if (*code >= OP_TYPESTAR) |
|
281 { |
|
282 fprintf(f, "%s", OP_names[code[1]]); |
|
283 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) |
|
284 { |
|
285 fprintf(f, " %s ", get_ucpname(code[2], code[3])); |
|
286 extra = 2; |
|
287 } |
|
288 } |
|
289 else extra = print_char(f, code+1, utf8); |
|
290 fprintf(f, "%s", OP_names[*code]); |
|
291 break; |
|
292 |
|
293 case OP_EXACT: |
|
294 case OP_UPTO: |
|
295 case OP_MINUPTO: |
|
296 case OP_POSUPTO: |
|
297 fprintf(f, " "); |
|
298 extra = print_char(f, code+3, utf8); |
|
299 fprintf(f, "{"); |
|
300 if (*code != OP_EXACT) fprintf(f, "0,"); |
|
301 fprintf(f, "%d}", GET2(code,1)); |
|
302 if (*code == OP_MINUPTO) fprintf(f, "?"); |
|
303 else if (*code == OP_POSUPTO) fprintf(f, "+"); |
|
304 break; |
|
305 |
|
306 case OP_TYPEEXACT: |
|
307 case OP_TYPEUPTO: |
|
308 case OP_TYPEMINUPTO: |
|
309 case OP_TYPEPOSUPTO: |
|
310 fprintf(f, " %s", OP_names[code[3]]); |
|
311 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) |
|
312 { |
|
313 fprintf(f, " %s ", get_ucpname(code[4], code[5])); |
|
314 extra = 2; |
|
315 } |
|
316 fprintf(f, "{"); |
|
317 if (*code != OP_TYPEEXACT) fprintf(f, "0,"); |
|
318 fprintf(f, "%d}", GET2(code,1)); |
|
319 if (*code == OP_TYPEMINUPTO) fprintf(f, "?"); |
|
320 else if (*code == OP_TYPEPOSUPTO) fprintf(f, "+"); |
|
321 break; |
|
322 |
|
323 case OP_NOT: |
|
324 c = code[1]; |
|
325 if (PRINTABLE(c)) fprintf(f, " [^%c]", c); |
|
326 else fprintf(f, " [^\\x%02x]", c); |
|
327 break; |
|
328 |
|
329 case OP_NOTSTAR: |
|
330 case OP_NOTMINSTAR: |
|
331 case OP_NOTPOSSTAR: |
|
332 case OP_NOTPLUS: |
|
333 case OP_NOTMINPLUS: |
|
334 case OP_NOTPOSPLUS: |
|
335 case OP_NOTQUERY: |
|
336 case OP_NOTMINQUERY: |
|
337 case OP_NOTPOSQUERY: |
|
338 c = code[1]; |
|
339 if (PRINTABLE(c)) fprintf(f, " [^%c]", c); |
|
340 else fprintf(f, " [^\\x%02x]", c); |
|
341 fprintf(f, "%s", OP_names[*code]); |
|
342 break; |
|
343 |
|
344 case OP_NOTEXACT: |
|
345 case OP_NOTUPTO: |
|
346 case OP_NOTMINUPTO: |
|
347 case OP_NOTPOSUPTO: |
|
348 c = code[3]; |
|
349 if (PRINTABLE(c)) fprintf(f, " [^%c]{", c); |
|
350 else fprintf(f, " [^\\x%02x]{", c); |
|
351 if (*code != OP_NOTEXACT) fprintf(f, "0,"); |
|
352 fprintf(f, "%d}", GET2(code,1)); |
|
353 if (*code == OP_NOTMINUPTO) fprintf(f, "?"); |
|
354 else if (*code == OP_NOTPOSUPTO) fprintf(f, "+"); |
|
355 break; |
|
356 |
|
357 case OP_RECURSE: |
|
358 if (print_lengths) fprintf(f, "%3d ", GET(code, 1)); |
|
359 else fprintf(f, " "); |
|
360 fprintf(f, "%s", OP_names[*code]); |
|
361 break; |
|
362 |
|
363 case OP_REF: |
|
364 fprintf(f, " \\%d", GET2(code,1)); |
|
365 ccode = code + _pcre_OP_lengths[*code]; |
|
366 goto CLASS_REF_REPEAT; |
|
367 |
|
368 case OP_CALLOUT: |
|
369 fprintf(f, " %s %d %d %d", OP_names[*code], code[1], GET(code,2), |
|
370 GET(code, 2 + LINK_SIZE)); |
|
371 break; |
|
372 |
|
373 case OP_PROP: |
|
374 case OP_NOTPROP: |
|
375 fprintf(f, " %s %s", OP_names[*code], get_ucpname(code[1], code[2])); |
|
376 break; |
|
377 |
|
378 /* OP_XCLASS can only occur in UTF-8 mode. However, there's no harm in |
|
379 having this code always here, and it makes it less messy without all those |
|
380 #ifdefs. */ |
|
381 |
|
382 case OP_CLASS: |
|
383 case OP_NCLASS: |
|
384 case OP_XCLASS: |
|
385 { |
|
386 int i, min, max; |
|
387 BOOL printmap; |
|
388 |
|
389 fprintf(f, " ["); |
|
390 |
|
391 if (*code == OP_XCLASS) |
|
392 { |
|
393 extra = GET(code, 1); |
|
394 ccode = code + LINK_SIZE + 1; |
|
395 printmap = (*ccode & XCL_MAP) != 0; |
|
396 if ((*ccode++ & XCL_NOT) != 0) fprintf(f, "^"); |
|
397 } |
|
398 else |
|
399 { |
|
400 printmap = TRUE; |
|
401 ccode = code + 1; |
|
402 } |
|
403 |
|
404 /* Print a bit map */ |
|
405 |
|
406 if (printmap) |
|
407 { |
|
408 for (i = 0; i < 256; i++) |
|
409 { |
|
410 if ((ccode[i/8] & (1 << (i&7))) != 0) |
|
411 { |
|
412 int j; |
|
413 for (j = i+1; j < 256; j++) |
|
414 if ((ccode[j/8] & (1 << (j&7))) == 0) break; |
|
415 if (i == '-' || i == ']') fprintf(f, "\\"); |
|
416 if (PRINTABLE(i)) fprintf(f, "%c", i); |
|
417 else fprintf(f, "\\x%02x", i); |
|
418 if (--j > i) |
|
419 { |
|
420 if (j != i + 1) fprintf(f, "-"); |
|
421 if (j == '-' || j == ']') fprintf(f, "\\"); |
|
422 if (PRINTABLE(j)) fprintf(f, "%c", j); |
|
423 else fprintf(f, "\\x%02x", j); |
|
424 } |
|
425 i = j; |
|
426 } |
|
427 } |
|
428 ccode += 32; |
|
429 } |
|
430 |
|
431 /* For an XCLASS there is always some additional data */ |
|
432 |
|
433 if (*code == OP_XCLASS) |
|
434 { |
|
435 int ch; |
|
436 while ((ch = *ccode++) != XCL_END) |
|
437 { |
|
438 if (ch == XCL_PROP) |
|
439 { |
|
440 int ptype = *ccode++; |
|
441 int pvalue = *ccode++; |
|
442 fprintf(f, "\\p{%s}", get_ucpname(ptype, pvalue)); |
|
443 } |
|
444 else if (ch == XCL_NOTPROP) |
|
445 { |
|
446 int ptype = *ccode++; |
|
447 int pvalue = *ccode++; |
|
448 fprintf(f, "\\P{%s}", get_ucpname(ptype, pvalue)); |
|
449 } |
|
450 else |
|
451 { |
|
452 ccode += 1 + print_char(f, ccode, TRUE); |
|
453 if (ch == XCL_RANGE) |
|
454 { |
|
455 fprintf(f, "-"); |
|
456 ccode += 1 + print_char(f, ccode, TRUE); |
|
457 } |
|
458 } |
|
459 } |
|
460 } |
|
461 |
|
462 /* Indicate a non-UTF8 class which was created by negation */ |
|
463 |
|
464 fprintf(f, "]%s", (*code == OP_NCLASS)? " (neg)" : ""); |
|
465 |
|
466 /* Handle repeats after a class or a back reference */ |
|
467 |
|
468 CLASS_REF_REPEAT: |
|
469 switch(*ccode) |
|
470 { |
|
471 case OP_CRSTAR: |
|
472 case OP_CRMINSTAR: |
|
473 case OP_CRPLUS: |
|
474 case OP_CRMINPLUS: |
|
475 case OP_CRQUERY: |
|
476 case OP_CRMINQUERY: |
|
477 fprintf(f, "%s", OP_names[*ccode]); |
|
478 extra += _pcre_OP_lengths[*ccode]; |
|
479 break; |
|
480 |
|
481 case OP_CRRANGE: |
|
482 case OP_CRMINRANGE: |
|
483 min = GET2(ccode,1); |
|
484 max = GET2(ccode,3); |
|
485 if (max == 0) fprintf(f, "{%d,}", min); |
|
486 else fprintf(f, "{%d,%d}", min, max); |
|
487 if (*ccode == OP_CRMINRANGE) fprintf(f, "?"); |
|
488 extra += _pcre_OP_lengths[*ccode]; |
|
489 break; |
|
490 |
|
491 /* Do nothing if it's not a repeat; this code stops picky compilers |
|
492 warning about the lack of a default code path. */ |
|
493 |
|
494 default: |
|
495 break; |
|
496 } |
|
497 } |
|
498 break; |
|
499 |
|
500 /* Anything else is just an item with no data*/ |
|
501 |
|
502 default: |
|
503 fprintf(f, " %s", OP_names[*code]); |
|
504 break; |
|
505 } |
|
506 |
|
507 code += _pcre_OP_lengths[*code] + extra; |
|
508 fprintf(f, "\n"); |
|
509 } |
|
510 } |
|
511 |
|
/* End of pcre_printint.src */