|
1 /************************************************* |
|
2 * Perl-Compatible Regular Expressions * |
|
3 *************************************************/ |
|
4 |
|
5 |
|
6 /* PCRE is a library of functions to support regular expressions whose syntax |
|
7 and semantics are as close as possible to those of the Perl 5 language. |
|
8 |
|
9 Written by Philip Hazel |
|
10 Copyright (c) 1997-2008 University of Cambridge |
|
11 |
|
12 ----------------------------------------------------------------------------- |
|
13 Redistribution and use in source and binary forms, with or without |
|
14 modification, are permitted provided that the following conditions are met: |
|
15 |
|
16 * Redistributions of source code must retain the above copyright notice, |
|
17 this list of conditions and the following disclaimer. |
|
18 |
|
19 * Redistributions in binary form must reproduce the above copyright |
|
20 notice, this list of conditions and the following disclaimer in the |
|
21 documentation and/or other materials provided with the distribution. |
|
22 |
|
23 * Neither the name of the University of Cambridge nor the names of its |
|
24 contributors may be used to endorse or promote products derived from |
|
25 this software without specific prior written permission. |
|
26 |
|
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
|
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|
37 POSSIBILITY OF SUCH DAMAGE. |
|
38 ----------------------------------------------------------------------------- |
|
39 */ |
|
40 |
|
41 /* This header contains definitions that are shared between the different |
|
42 modules, but which are not relevant to the exported API. This includes some |
|
43 functions whose names all begin with "_pcre_". */ |
|
44 |
|
45 #ifndef PCRE_INTERNAL_H |
|
46 #define PCRE_INTERNAL_H |
|
47 |
|
48 /* Define DEBUG to get debugging output on stdout. */ |
|
49 |
|
#if 0
#define DEBUG
#endif

/* Debug printing is funnelled through one macro so that call sites need no
conditional compilation of their own. The single argument is a complete,
parenthesized printf() argument list, for example DPRINTF(("n=%d\n", n)).
When DEBUG is not defined the call expands to nothing, so its arguments are
never evaluated.

The Mac Debugging.h header also defines a macro called DPRINTF, so any
earlier definition is discarded before ours is installed. */

#undef DPRINTF
#if defined(DEBUG)
#define DPRINTF(p) printf p
#else
#define DPRINTF(p)
#endif
|
68 |
|
69 |
|
70 /* Standard C headers plus the external interface definition. The only time |
|
71 setjmp and stdarg are used is when NO_RECURSE is set. */ |
|
72 |
|
73 #include <ctype.h> |
|
74 #include <limits.h> |
|
75 #include <setjmp.h> |
|
76 #include <stdarg.h> |
|
77 #include <stddef.h> |
|
78 #include <stdio.h> |
|
79 #include <stdlib.h> |
|
80 #include <string.h> |
|
81 |
|
82 /* When compiling a DLL for Windows, the exported symbols have to be declared |
|
83 using some MS magic. I found some useful information on this web page: |
|
84 http://msdn2.microsoft.com/en-us/library/y4h7bcy6(VS.80).aspx. According to the |
|
85 information there, using __declspec(dllexport) without "extern" we have a |
|
86 definition; with "extern" we have a declaration. The settings here override the |
|
87 setting in pcre.h (which is included below); it defines only PCRE_EXP_DECL, |
|
88 which is all that is needed for applications (they just import the symbols). We |
|
89 use: |
|
90 |
|
91 PCRE_EXP_DECL for declarations |
|
92 PCRE_EXP_DEFN for definitions of exported functions |
|
93 PCRE_EXP_DATA_DEFN for definitions of exported variables |
|
94 |
|
95 The reason for the two DEFN macros is that in non-Windows environments, one |
|
96 does not want to have "extern" before variable definitions because it leads to |
|
97 compiler warnings. So we distinguish between functions and variables. In |
|
98 Windows, the two should always be the same. |
|
99 |
|
100 The reason for wrapping this in #ifndef PCRE_EXP_DECL is so that pcretest, |
|
101 which is an application, but needs to import this file in order to "peek" at |
|
102 internals, can #include pcre.h first to get an application's-eye view. |
|
103 |
|
104 In principle, people compiling for non-Windows, non-Unix-like (i.e. uncommon, |
|
105 special-purpose environments) might want to stick other stuff in front of |
|
106 exported symbols. That's why, in the non-Windows case, we set PCRE_EXP_DEFN and |
|
107 PCRE_EXP_DATA_DEFN only if they are not already set. */ |
|
108 |
|
/* Export decoration for library symbols. Nothing here is touched when
PCRE_EXP_DECL has already been defined by the includer.

  PCRE_EXP_DECL       prefix for declarations
  PCRE_EXP_DEFN       prefix for definitions of exported functions
  PCRE_EXP_DATA_DEFN  prefix for definitions of exported variables

On Windows all three are set together (dllexport unless PCRE_STATIC is
defined). Elsewhere the two DEFN macros are set only when not already defined,
and the data one defaults to empty. */

#if !defined(PCRE_EXP_DECL)
#  if defined(_WIN32)
#    if defined(PCRE_STATIC)
#      define PCRE_EXP_DECL       extern
#      define PCRE_EXP_DEFN
#      define PCRE_EXP_DATA_DEFN
#    else
#      define PCRE_EXP_DECL       extern __declspec(dllexport)
#      define PCRE_EXP_DEFN       __declspec(dllexport)
#      define PCRE_EXP_DATA_DEFN  __declspec(dllexport)
#    endif
#  else
#    if defined(__cplusplus)
#      define PCRE_EXP_DECL       extern "C"
#    else
#      define PCRE_EXP_DECL       extern
#    endif
#    if !defined(PCRE_EXP_DEFN)
#      define PCRE_EXP_DEFN       PCRE_EXP_DECL
#    endif
#    if !defined(PCRE_EXP_DATA_DEFN)
#      define PCRE_EXP_DATA_DEFN
#    endif
#  endif
#endif
|
134 |
|
135 /* When compiling with the MSVC compiler, it is sometimes necessary to include |
|
136 a "calling convention" before exported function names. (This is secondhand |
|
137 information; I know nothing about MSVC myself). For example, something like |
|
138 |
|
139 void __cdecl function(....) |
|
140 |
|
141 might be needed. In order to make this easy, all the exported functions have |
|
142 PCRE_CALL_CONVENTION just before their names. It is rarely needed; if not |
|
143 set, we ensure here that it has no effect. */ |
|
144 |
|
/* Default the calling-convention decoration to nothing unless the build has
already supplied one. */

#if !defined(PCRE_CALL_CONVENTION)
#define PCRE_CALL_CONVENTION
#endif
|
148 |
|
149 /* We need to have types that specify unsigned 16-bit and 32-bit integers. We |
|
150 cannot determine these outside the compilation (e.g. by running a program as |
|
151 part of "configure") because PCRE is often cross-compiled for use on other |
|
152 systems. Instead we make use of the maximum sizes that are available at |
|
153 preprocessor time in standard C environments. */ |
|
154 |
|
/* 16-bit integer types: the first of short/int whose unsigned maximum, as
reported by <limits.h>, is exactly 65535. */

#if USHRT_MAX == 65535
typedef unsigned short pcre_uint16;
typedef short pcre_int16;
#elif UINT_MAX == 65535
typedef unsigned int pcre_uint16;
typedef int pcre_int16;
#else
#error Cannot determine a type for 16-bit unsigned integers
#endif

/* 32-bit integer types: the first of int/long whose unsigned maximum is
exactly 4294967295. The constant carries a U suffix because, unsuffixed, it
does not fit in a 32-bit signed long; its type then depends on the language
version and compilers warn about it. With the suffix it is unambiguously
unsigned, like the *_MAX values it is compared against. */

#if UINT_MAX == 4294967295U
typedef unsigned int pcre_uint32;
typedef int pcre_int32;
#elif ULONG_MAX == 4294967295U
typedef unsigned long int pcre_uint32;
typedef long int pcre_int32;
#else
#error Cannot determine a type for 32-bit unsigned integers
#endif
|
174 |
|
175 /* All character handling must be done as unsigned characters. Otherwise there |
|
176 are problems with top-bit-set characters and functions such as isspace(). |
|
177 However, we leave the interface to the outside world as char *, because that |
|
178 should make things easier for callers. We define a short type for unsigned char |
|
179 to save lots of typing. I tried "uchar", but it causes problems on Digital |
|
180 Unix, where it is defined in sys/types, so use "uschar" instead. */ |
|
181 |
|
/* Short name for unsigned char, used for all internal character handling. */

typedef unsigned char uschar;

/* An unsigned int value that can never be a character: UTF-8 characters only
go up to 0x7fffffff (and Unicode stops at 0x0010ffff). */

#define NOTACHAR 0xFFFFFFFF
|
189 |
|
190 /* PCRE is able to support several different kinds of newline (CR, LF, CRLF, |
|
191 "any" and "anycrlf" at present). The following macros are used to package up |
|
192 testing for newlines. NLBLOCK, PSSTART, and PSEND are defined in the various |
|
193 modules to indicate in which datablock the parameters exist, and what the |
|
194 start/end of string field names are. */ |
|
195 |
|
/* Kinds of newline convention, as stored in the nltype field. */

#define NLTYPE_FIXED    0   /* a fixed-length string */
#define NLTYPE_ANY      1   /* any Unicode line ending */
#define NLTYPE_ANYCRLF  2   /* CR, LF, or CRLF */
|
199 |
|
200 /* This macro checks for a newline at the given position */ |
|
201 |
|
/* IS_NEWLINE(p): true when a newline starts at position p.

For a fixed newline the test is done inline: p must lie at least nllen bytes
before NLBLOCK->PSEND, and the first byte (plus the second, unless nllen is 1)
must equal NLBLOCK->nl[]. For the other newline types, p must lie before
PSEND and _pcre_is_newline() decides; it is handed the address of
NLBLOCK->nllen. The expansion also refers to a variable named utf8 that must
be in scope at the point of use. */

#define IS_NEWLINE(p) \
  ((NLBLOCK->nltype == NLTYPE_FIXED)? \
    ( \
    (p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \
    (p)[0] == NLBLOCK->nl[0] && \
    (NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1]) \
    ) \
  : \
    ( \
    (p) < NLBLOCK->PSEND && \
    _pcre_is_newline((p), NLBLOCK->nltype, NLBLOCK->PSEND, \
      &(NLBLOCK->nllen), utf8) \
    ) \
  )
|
213 |
|
214 /* This macro checks for a newline immediately preceding the given position */ |
|
215 |
|
/* WAS_NEWLINE(p): true when a newline ends immediately before position p.

For a fixed newline the test is done inline: p must lie at least nllen bytes
after NLBLOCK->PSSTART, and the nllen bytes preceding p must equal
NLBLOCK->nl[] (only one or two bytes are ever compared). For the other newline
types, p must lie after PSSTART and _pcre_was_newline() decides; it is handed
the address of NLBLOCK->nllen. The expansion also refers to a variable named
utf8 that must be in scope at the point of use. */

#define WAS_NEWLINE(p) \
  ((NLBLOCK->nltype == NLTYPE_FIXED)? \
    ( \
    (p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \
    (p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \
    (NLBLOCK->nllen == 1 || (p)[-NLBLOCK->nllen+1] == NLBLOCK->nl[1]) \
    ) \
  : \
    ( \
    (p) > NLBLOCK->PSSTART && \
    _pcre_was_newline((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \
      &(NLBLOCK->nllen), utf8) \
    ) \
  )
|
227 |
|
228 /* When PCRE is compiled as a C++ library, the subject pointer can be replaced |
|
229 with a custom type. This makes it possible, for example, to allow pcre_exec() |
|
230 to process subject strings that are discontinuous by using a smart pointer |
|
231 class. It must always be possible to inspect all of the subject string in |
|
232 pcre_exec() because of the way it backtracks. Two macros are required in the |
|
233 normal case, for sign-unspecified and unsigned char pointers. The former is |
|
234 used for the external interface and appears in pcre.h, which is why its name |
|
235 must begin with PCRE_. */ |
|
236 |
|
/* Subject pointer types: PCRE_SPTR is the sign-unspecified form and USPTR
the unsigned form. Both become CUSTOM_SUBJECT_PTR when that is defined. */

#if !defined(CUSTOM_SUBJECT_PTR)
#define PCRE_SPTR const char *
#define USPTR const unsigned char *
#else
#define PCRE_SPTR CUSTOM_SUBJECT_PTR
#define USPTR CUSTOM_SUBJECT_PTR
#endif
|
244 |
|
245 |
|
246 |
|
247 /* Include the public PCRE header and the definitions of UCP character property |
|
248 values. */ |
|
249 |
|
250 #include "pcre.h" |
|
251 #include "ucp.h" |
|
252 |
|
253 /* When compiling for use with the Virtual Pascal compiler, these functions |
|
254 need to have their names changed. PCRE must be compiled with the -DVPCOMPAT |
|
255 option on the command line. */ |
|
256 |
|
#if defined(VPCOMPAT)

/* Virtual Pascal build: redirect these C library names to their
underscore-prefixed counterparts. */

#define strlen(s)        _strlen(s)
#define strncmp(s1,s2,m) _strncmp(s1,s2,m)
#define memcmp(s,c,n)    _memcmp(s,c,n)
#define memcpy(d,s,n)    _memcpy(d,s,n)
#define memmove(d,s,n)   _memmove(d,s,n)
#define memset(s,c,n)    _memset(s,c,n)

#else   /* VPCOMPAT */

/* For systems such as SunOS4 that lack memmove(): when HAVE_MEMMOVE is not
defined, map memmove() onto bcopy() if HAVE_BCOPY is defined, and otherwise
onto the emulating function below (there are some non-Unix environments that
have neither). */

#if !defined(HAVE_MEMMOVE)
#undef memmove          /* some systems may have a macro */

#if defined(HAVE_BCOPY)
#define memmove(a, b, c) bcopy(b, a, c)

#else   /* HAVE_BCOPY */

/* Copy n bytes from s to d, coping with overlapping regions: when the
destination starts above the source the bytes are copied from the top down,
otherwise from the bottom up.

Arguments:
  d         destination
  s         source
  n         number of bytes to copy

Returns:    d
*/

static void *
pcre_memmove(void *d, const void *s, size_t n)
{
unsigned char *out = (unsigned char *)d;
const unsigned char *in = (const unsigned char *)s;
size_t remaining = n;

if (out > in)
  {
  out += n;
  in += n;
  while (remaining-- > 0) *(--out) = *(--in);
  }
else
  {
  while (remaining-- > 0) *out++ = *in++;
  }

return d;
}
#define memmove(a, b, c) pcre_memmove(a, b, c)

#endif  /* not HAVE_BCOPY */
#endif  /* not HAVE_MEMMOVE */
#endif  /* not VPCOMPAT */
|
299 |
|
300 |
|
301 /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored |
|
302 in big-endian order) by default. These are used, for example, to link from the |
|
303 start of a subpattern to its alternatives and its end. The use of 2 bytes per |
|
304 offset limits the size of the compiled regex to around 64K, which is big enough |
|
305 for almost everybody. However, I received a request for an even bigger limit. |
|
306 For this reason, and also to make the code easier to maintain, the storing and |
|
307 loading of offsets from the byte string is now handled by the macros that are |
|
308 defined here. |
|
309 |
|
310 The macros are controlled by the value of LINK_SIZE. This defaults to 2 in |
|
311 the config.h file, but can be overridden by using -D on the command line. This |
|
312 is automated on Unix systems via the "configure" command. */ |
|
313 |
|
/* Offset access macros, selected by LINK_SIZE (2, 3 or 4 bytes per offset).

  PUT(a,n,d)         store d at a[n] onwards, most significant byte first
  GET(a,n)           load the offset stored at a[n] onwards
  MAX_PATTERN_SIZE   limit on compiled pattern size for this link size
  PUTINC(a,n,d)      PUT, then advance a by LINK_SIZE

The pointer argument is parenthesized at each use, so that an expression such
as "code + 1" indexes correctly. PUT and PUTINC are each wrapped as a single
parenthesized comma expression, so they behave as one expression wherever
they are used. Arguments are evaluated more than once, so they must be free
of side effects. */

#if LINK_SIZE == 2

#define PUT(a,n,d)   \
  (((a)[n] = (d) >> 8), \
   ((a)[(n)+1] = (d) & 255))

#define GET(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

#define MAX_PATTERN_SIZE (1 << 16)


#elif LINK_SIZE == 3

#define PUT(a,n,d)       \
  (((a)[n] = (d) >> 16),    \
   ((a)[(n)+1] = (d) >> 8), \
   ((a)[(n)+2] = (d) & 255))

#define GET(a,n) \
  (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])

#define MAX_PATTERN_SIZE (1 << 24)


#elif LINK_SIZE == 4

#define PUT(a,n,d)        \
  (((a)[n] = (d) >> 24),     \
   ((a)[(n)+1] = (d) >> 16), \
   ((a)[(n)+2] = (d) >> 8),  \
   ((a)[(n)+3] = (d) & 255))

#define GET(a,n) \
  (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])

#define MAX_PATTERN_SIZE (1 << 30)   /* Keep it positive */


#else
#error LINK_SIZE must be either 2, 3, or 4
#endif


/* Convenience macro defined in terms of the others */

#define PUTINC(a,n,d)   (PUT(a,n,d), (a) += LINK_SIZE)
|
361 |
|
362 |
|
363 /* PCRE uses some other 2-byte quantities that do not change when the size of |
|
364 offsets changes. These are used for repeat counts and for other things such as |
|
365 capturing parenthesis numbers in back references. */ |
|
366 |
|
/* Fixed 2-byte quantities, stored most significant byte first.

  PUT2(a,n,d)      store the low 16 bits of d at a[n], a[n+1]
  GET2(a,n)        load the 16-bit value stored at a[n], a[n+1]
  PUT2INC(a,n,d)   PUT2, then advance a by 2

PUT2 is a single parenthesized comma expression rather than two statements,
so that it stays intact as the body of an unbraced if/else or loop, and so
that PUT2INC chains onto the whole store rather than onto its second half.
The pointer argument is parenthesized at each use. Arguments are evaluated
more than once, so they must be free of side effects. */

#define PUT2(a,n,d)   \
  (((a)[n] = (d) >> 8), \
   ((a)[(n)+1] = (d) & 255))

#define GET2(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

#define PUT2INC(a,n,d)  (PUT2(a,n,d), (a) += 2)
|
375 |
|
376 |
|
377 /* When UTF-8 encoding is being used, a character is no longer just a single |
|
378 byte. The macros for character handling generate simple sequences when used in |
|
379 byte-mode, and more complicated ones for UTF-8 characters. BACKCHAR should |
|
380 never be called in byte mode. To make sure it can never even appear when UTF-8 |
|
381 support is omitted, we don't even define it. */ |
|
382 |
|
#if !defined(SUPPORT_UTF8)

/* Byte-only build: every character is exactly one byte. BACKCHAR is
deliberately left undefined so that it cannot appear in such a build. */

#define GETCHAR(c, eptr) c = *eptr;
#define GETCHARTEST(c, eptr) c = *eptr;
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARINCTEST(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
/* #define BACKCHAR(eptr) */

#else   /* SUPPORT_UTF8 */

/* In all of the macros below, a first byte of 0xc0 or more starts a multibyte
character. _pcre_utf8_table4[], indexed by the low six bits of that byte,
gives the number of additional bytes (gcaa), and _pcre_utf8_table3[gcaa] masks
the data bits of the first byte. Each additional byte supplies six more bits,
so the k-th byte from the end of the sequence is shifted left by 6*k. The
macros expand to statements, not expressions. */

/* GETCHAR: fetch the character at eptr without moving eptr. For use when
UTF-8 mode is known to be in force. */

#define GETCHAR(c, eptr) \
  c = *eptr; \
  if (c >= 0xc0) \
    { \
    int gcii; \
    int gcaa = _pcre_utf8_table4[c & 0x3f]; \
    c = (c & _pcre_utf8_table3[gcaa]) << (6*gcaa); \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      c |= (eptr[gcii] & 0x3f) << (6*(gcaa - gcii)); \
    }

/* GETCHARTEST: as GETCHAR, but decodes a multibyte character only when the
variable utf8 (in scope at the point of use) is true. */

#define GETCHARTEST(c, eptr) \
  c = *eptr; \
  if (utf8 && c >= 0xc0) \
    { \
    int gcii; \
    int gcaa = _pcre_utf8_table4[c & 0x3f]; \
    c = (c & _pcre_utf8_table3[gcaa]) << (6*gcaa); \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      c |= (eptr[gcii] & 0x3f) << (6*(gcaa - gcii)); \
    }

/* GETCHARINC: fetch the character at eptr and advance eptr past it. For use
when UTF-8 mode is known to be in force. Inside the loop gcaa has already been
decremented, so it counts the bytes that still follow the one being added. */

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if (c >= 0xc0) \
    { \
    int gcaa = _pcre_utf8_table4[c & 0x3f]; \
    c = (c & _pcre_utf8_table3[gcaa]) << (6*gcaa); \
    while (gcaa-- > 0) \
      c |= (*eptr++ & 0x3f) << (6*gcaa); \
    }

/* GETCHARINCTEST: as GETCHARINC, but decodes a multibyte character only when
the variable utf8 (in scope at the point of use) is true. */

#define GETCHARINCTEST(c, eptr) \
  c = *eptr++; \
  if (utf8 && c >= 0xc0) \
    { \
    int gcaa = _pcre_utf8_table4[c & 0x3f]; \
    c = (c & _pcre_utf8_table3[gcaa]) << (6*gcaa); \
    while (gcaa-- > 0) \
      c |= (*eptr++ & 0x3f) << (6*gcaa); \
    }

/* GETCHARLEN: as GETCHAR, and additionally adds the number of extra bytes to
len. For use when UTF-8 mode is known to be in force. */

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  if (c >= 0xc0) \
    { \
    int gcii; \
    int gcaa = _pcre_utf8_table4[c & 0x3f]; \
    c = (c & _pcre_utf8_table3[gcaa]) << (6*gcaa); \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      c |= (eptr[gcii] & 0x3f) << (6*(gcaa - gcii)); \
    len += gcaa; \
    }

/* BACKCHAR: while eptr points at a continuation byte (10xxxxxx), step it
back. There is no test for UTF-8 mode inside the macro, so it must only be
used in UTF-8 code. */

#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--

#endif
|
488 |
|
489 |
|
490 /* In case there is no definition of offsetof() provided - though any proper |
|
491 Standard C system should have one. */ |
|
492 |
|
/* Fallback offsetof() for systems whose headers do not provide one: the
address of the field within a notional object located at address zero. */

#if !defined(offsetof)
#define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
#endif
|
496 |
|
497 |
|
498 /* These are the public options that can change during matching. */ |
|
499 |
|
/* The public options that can change during matching. */

#define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)

/* Private flags describing the compiled regex. They are held in a 16-bit
flags word, separate from the options word. */

#define PCRE_NOPARTIAL  0x0001   /* can't use partial with this regex */
#define PCRE_FIRSTSET   0x0002   /* first_byte is set */
#define PCRE_REQCHSET   0x0004   /* req_byte is set */
#define PCRE_STARTLINE  0x0008   /* start after \n for multiline */
#define PCRE_JCHANGED   0x0010   /* j option used in regex */
#define PCRE_HASCRORLF  0x0020   /* explicit \r or \n in pattern */

/* Options for the "extra" block produced by pcre_study(). */

#define PCRE_STUDY_MAPPED  0x01  /* a map of starting chars exists */
|
516 |
|
517 /* Masks for identifying the public options that are permitted at compile |
|
518 time, run time, or study time, respectively. */ |
|
519 |
|
/* All of the newline-selection option bits together. */

#define PCRE_NEWLINE_BITS \
  (PCRE_NEWLINE_CR | PCRE_NEWLINE_LF | PCRE_NEWLINE_ANY | \
   PCRE_NEWLINE_ANYCRLF)

/* Public options permitted at compile time. */

#define PUBLIC_OPTIONS \
  (PCRE_CASELESS | PCRE_EXTENDED | PCRE_ANCHORED | PCRE_MULTILINE | \
   PCRE_DOTALL | PCRE_DOLLAR_ENDONLY | PCRE_EXTRA | PCRE_UNGREEDY | \
   PCRE_UTF8 | PCRE_NO_AUTO_CAPTURE | PCRE_NO_UTF8_CHECK | \
   PCRE_AUTO_CALLOUT | PCRE_FIRSTLINE | PCRE_DUPNAMES | \
   PCRE_NEWLINE_BITS | PCRE_BSR_ANYCRLF | PCRE_BSR_UNICODE | \
   PCRE_JAVASCRIPT_COMPAT)

/* Public options permitted at run time. */

#define PUBLIC_EXEC_OPTIONS \
  (PCRE_ANCHORED | PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | \
   PCRE_NO_UTF8_CHECK | PCRE_PARTIAL | PCRE_NEWLINE_BITS | \
   PCRE_BSR_ANYCRLF | PCRE_BSR_UNICODE)

/* Public options permitted at run time for the DFA matcher. */

#define PUBLIC_DFA_EXEC_OPTIONS \
  (PCRE_ANCHORED | PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | \
   PCRE_NO_UTF8_CHECK | PCRE_PARTIAL | PCRE_DFA_SHORTEST | \
   PCRE_DFA_RESTART | PCRE_NEWLINE_BITS | PCRE_BSR_ANYCRLF | \
   PCRE_BSR_UNICODE)

/* Public options permitted at study time. */

#define PUBLIC_STUDY_OPTIONS 0   /* None defined */
|
540 |
|
541 /* Magic number to provide a small check against being handed junk. Also used |
|
542 to detect whether a pattern was compiled on a host of different endianness. */ |
|
543 |
|
/* Magic number: a small check against being handed junk, also used to detect
a pattern compiled on a host of different endianness. */

#define MAGIC_NUMBER  0x50435245UL   /* 'PCRE' */

/* Negative values for the firstchar and reqchar variables */

#define REQ_UNSET (-2)
#define REQ_NONE  (-1)

/* The maximum remaining length of subject we are prepared to search for a
req_byte match. */

#define REQ_BYTE_MAX 1000

/* Flags added to firstbyte or reqbyte; a "non-literal" item is either a
variable-length repeat, or anything other than literal characters. */

#define REQ_CASELESS 0x0100    /* indicates caselessness */
#define REQ_VARY     0x0200    /* reqbyte followed non-literal item */
|
561 |
|
562 /* Miscellaneous definitions. The #ifndef is to pacify compiler warnings in |
|
563 environments where these macros are defined elsewhere. */ |
|
564 |
|
/* Boolean type and constants, supplied only when FALSE is not already
defined by the environment. */

#if !defined(FALSE)
typedef int BOOL;

#define FALSE   0
#define TRUE    1
#endif
|
571 |
|
572 /* Escape items that are just an encoding of a particular data value. */ |
|
573 |
|
/* Escapes that simply encode one data value. Each may be pre-defined by the
build, in which case that definition is left alone. */

#if !defined(ESC_e)
#define ESC_e 27
#endif

#if !defined(ESC_f)
#define ESC_f '\f'
#endif

#if !defined(ESC_n)
#define ESC_n '\n'
#endif

#if !defined(ESC_r)
#define ESC_r '\r'
#endif

/* ESC_t cannot officially be used, because it is a POSIX reserved identifier
(presumably because of all the others like size_t), hence ESC_tee. */

#if !defined(ESC_tee)
#define ESC_tee '\t'
#endif
|
596 |
|
597 /* Codes for different types of Unicode property */ |
|
598 |
|
/* Kinds of Unicode property test. */

#define PT_ANY   0   /* Any property - matches all chars */
#define PT_LAMP  1   /* L& - the union of Lu, Ll, Lt */
#define PT_GC    2   /* General characteristic (e.g. L) */
#define PT_PC    3   /* Particular characteristic (e.g. Lu) */
#define PT_SC    4   /* Script (e.g. Han) */
|
604 |
|
605 /* Flag bits and data types for the extended class (OP_XCLASS) for classes that |
|
606 contain UTF-8 characters with values greater than 255. */ |
|
607 |
|
/* OP_XCLASS flag bits. */

#define XCL_NOT  0x01   /* Flag: this is a negative class */
#define XCL_MAP  0x02   /* Flag: a 32-byte map is present */

/* OP_XCLASS item types. */

#define XCL_END      0  /* Marks end of individual items */
#define XCL_SINGLE   1  /* Single item (one multibyte char) follows */
#define XCL_RANGE    2  /* A range (two multibyte chars) follows */
#define XCL_PROP     3  /* Unicode property (2-byte property code follows) */
#define XCL_NOTPROP  4  /* Unicode inverted property (ditto) */
|
616 |
|
617 /* These are escaped items that aren't just an encoding of a particular data |
|
618 value such as \n. They must have non-zero values, as check_escape() returns |
|
619 their negation. Also, they must appear in the same order as in the opcode |
|
620 definitions below, up to ESC_z. There's a dummy for OP_ANY because it |
|
621 corresponds to "." rather than an escape sequence, and another for OP_ALLANY |
|
622 (which is used for [^] in JavaScript compatibility mode). |
|
623 |
|
624 The final escape must be ESC_REF as subsequent values are used for |
|
625 backreferences (\1, \2, \3, etc). There are two tests in the code for an escape |
|
626 greater than ESC_b and less than ESC_Z to detect the types that may be |
|
627 repeated. These are the types that consume characters. If any new escapes are |
|
628 put in between that don't consume a character, that code will have to change. |
|
629 */ |
|
630 |
|
/* Escape codes, numbered from 1 so that none is zero. The explicit values
make the required ordering visible: ESC_dum1 and ESC_dum2 are the placeholders
for OP_ANY and OP_ALLANY, and ESC_REF is last. */

enum {
  ESC_A    = 1,
  ESC_G    = 2,
  ESC_K    = 3,
  ESC_B    = 4,
  ESC_b    = 5,
  ESC_D    = 6,
  ESC_d    = 7,
  ESC_S    = 8,
  ESC_s    = 9,
  ESC_W    = 10,
  ESC_w    = 11,
  ESC_dum1 = 12,
  ESC_dum2 = 13,
  ESC_C    = 14,
  ESC_P    = 15,
  ESC_p    = 16,
  ESC_R    = 17,
  ESC_H    = 18,
  ESC_h    = 19,
  ESC_V    = 20,
  ESC_v    = 21,
  ESC_X    = 22,
  ESC_Z    = 23,
  ESC_z    = 24,
  ESC_E    = 25,
  ESC_Q    = 26,
  ESC_g    = 27,
  ESC_k    = 28,
  ESC_REF  = 29
};
|
635 |
|
636 |
|
637 /* Opcode table: Starting from 1 (i.e. after OP_END), the values up to |
|
638 OP_EOD must correspond in order to the list of escapes immediately above. |
|
639 |
|
640 *** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions |
|
641 that follow must also be updated to match. There is also a table called |
|
642 "coptable" in pcre_dfa_exec.c that must be updated. */ |
|
643 |
|
/* Opcode values. Every value is written out explicitly; the numbering is
consecutive from 0 to 111. */

enum {
  OP_END = 0,                 /* End of pattern */

  /* Values corresponding to backslashed metacharacters (1 to 24). */

  OP_SOD = 1,                 /* \A  start of data */
  OP_SOM = 2,                 /* \G  start of match (subject + offset) */
  OP_SET_SOM = 3,             /* \K  set start of match */
  OP_NOT_WORD_BOUNDARY = 4,   /* \B */
  OP_WORD_BOUNDARY = 5,       /* \b */
  OP_NOT_DIGIT = 6,           /* \D */
  OP_DIGIT = 7,               /* \d */
  OP_NOT_WHITESPACE = 8,      /* \S */
  OP_WHITESPACE = 9,          /* \s */
  OP_NOT_WORDCHAR = 10,       /* \W */
  OP_WORDCHAR = 11,           /* \w */
  OP_ANY = 12,                /* Match any character (subject to DOTALL) */
  OP_ALLANY = 13,             /* Match any character (not subject to DOTALL) */
  OP_ANYBYTE = 14,            /* \C  any byte; different to OP_ANY for UTF-8 */
  OP_NOTPROP = 15,            /* \P  not Unicode property */
  OP_PROP = 16,               /* \p  Unicode property */
  OP_ANYNL = 17,              /* \R  any newline sequence */
  OP_NOT_HSPACE = 18,         /* \H  not horizontal whitespace */
  OP_HSPACE = 19,             /* \h  horizontal whitespace */
  OP_NOT_VSPACE = 20,         /* \V  not vertical whitespace */
  OP_VSPACE = 21,             /* \v  vertical whitespace */
  OP_EXTUNI = 22,             /* \X  extended Unicode sequence */
  OP_EODN = 23,               /* \Z  end of data, or \n at end of data */
  OP_EOD = 24,                /* \z  end of data */

  OP_OPT = 25,                /* Set runtime options */
  OP_CIRC = 26,               /* Start of line - varies with multiline switch */
  OP_DOLL = 27,               /* End of line - varies with multiline switch */
  OP_CHAR = 28,               /* Match one character, casefully */
  OP_CHARNC = 29,             /* Match one character, caselessly */
  OP_NOT = 30,                /* Match one character, not the following one */

  /* Repeats of single characters. The maximizing and minimizing versions of
  the first six must come in pairs, with the minimizing one second. */

  OP_STAR = 31,
  OP_MINSTAR = 32,
  OP_PLUS = 33,
  OP_MINPLUS = 34,
  OP_QUERY = 35,
  OP_MINQUERY = 36,

  OP_UPTO = 37,               /* From 0 to n matches */
  OP_MINUPTO = 38,
  OP_EXACT = 39,              /* Exactly n matches */

  OP_POSSTAR = 40,            /* Possessified star */
  OP_POSPLUS = 41,            /* Possessified plus */
  OP_POSQUERY = 42,           /* Possessified query */
  OP_POSUPTO = 43,            /* Possessified upto */

  /* Repeats of "not" single characters. Same pairing rule; these must be in
  exactly the same order as the set above. */

  OP_NOTSTAR = 44,
  OP_NOTMINSTAR = 45,
  OP_NOTPLUS = 46,
  OP_NOTMINPLUS = 47,
  OP_NOTQUERY = 48,
  OP_NOTMINQUERY = 49,

  OP_NOTUPTO = 50,            /* From 0 to n matches */
  OP_NOTMINUPTO = 51,
  OP_NOTEXACT = 52,           /* Exactly n matches */

  OP_NOTPOSSTAR = 53,         /* Possessified versions */
  OP_NOTPOSPLUS = 54,
  OP_NOTPOSQUERY = 55,
  OP_NOTPOSUPTO = 56,

  /* Repeats of character types such as \d. Same pairing rule; these must be
  in exactly the same order as the sets above. */

  OP_TYPESTAR = 57,
  OP_TYPEMINSTAR = 58,
  OP_TYPEPLUS = 59,
  OP_TYPEMINPLUS = 60,
  OP_TYPEQUERY = 61,
  OP_TYPEMINQUERY = 62,

  OP_TYPEUPTO = 63,           /* From 0 to n matches */
  OP_TYPEMINUPTO = 64,
  OP_TYPEEXACT = 65,          /* Exactly n matches */

  OP_TYPEPOSSTAR = 66,        /* Possessified versions */
  OP_TYPEPOSPLUS = 67,
  OP_TYPEPOSQUERY = 68,
  OP_TYPEPOSUPTO = 69,

  /* Repeats of character classes and back references. All of these must come
  in maximizing/minimizing pairs, with the minimizing one second; the first
  six must be in the same order as the sets above. */

  OP_CRSTAR = 70,
  OP_CRMINSTAR = 71,
  OP_CRPLUS = 72,
  OP_CRMINPLUS = 73,
  OP_CRQUERY = 74,
  OP_CRMINQUERY = 75,
  OP_CRRANGE = 76,            /* These are different to the three sets above. */
  OP_CRMINRANGE = 77,

  OP_CLASS = 78,              /* Match a character class, chars < 256 only */
  OP_NCLASS = 79,             /* Same, but the bitmap was created from a
                                 negative class - the difference is relevant
                                 only when a UTF-8 character > 255 is
                                 encountered. */
  OP_XCLASS = 80,             /* Extended class for handling UTF-8 chars within
                                 the class. This does both positive and
                                 negative. */

  OP_REF = 81,                /* Match a back reference */
  OP_RECURSE = 82,            /* Match a numbered subpattern (possibly recursive) */
  OP_CALLOUT = 83,            /* Call out to external function if provided */

  OP_ALT = 84,                /* Start of alternation */
  OP_KET = 85,                /* End of group that doesn't have an unbounded repeat */
  OP_KETRMAX = 86,            /* These two must remain together and in this */
  OP_KETRMIN = 87,            /* order. They are for groups that repeat for ever. */

  /* The assertions must come before BRA, CBRA, ONCE, and COND. */

  OP_ASSERT = 88,             /* Positive lookahead */
  OP_ASSERT_NOT = 89,         /* Negative lookahead */
  OP_ASSERTBACK = 90,         /* Positive lookbehind */
  OP_ASSERTBACK_NOT = 91,     /* Negative lookbehind */
  OP_REVERSE = 92,            /* Move pointer back - used in lookbehind assertions */

  /* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE
  first, as there's a test for >= ONCE for a subpattern that isn't an
  assertion. */

  OP_ONCE = 93,               /* Atomic group */
  OP_BRA = 94,                /* Start of non-capturing bracket */
  OP_CBRA = 95,               /* Start of capturing bracket */
  OP_COND = 96,               /* Conditional group */

  /* These three must follow the previous three, in the same order. There's a
  check for >= SBRA to distinguish the two sets. */

  OP_SBRA = 97,               /* Start of non-capturing bracket, check empty */
  OP_SCBRA = 98,              /* Start of capturing bracket, check empty */
  OP_SCOND = 99,              /* Conditional group, check empty */

  OP_CREF = 100,              /* Used to hold a capture number as condition */
  OP_RREF = 101,              /* Used to hold a recursion number as condition */
  OP_DEF = 102,               /* The DEFINE condition */

  OP_BRAZERO = 103,           /* These two must remain together and in this */
  OP_BRAMINZERO = 104,        /* order. */

  /* These are backtracking control verbs */

  OP_PRUNE = 105,
  OP_SKIP = 106,
  OP_THEN = 107,
  OP_COMMIT = 108,

  /* These are forced failure and success verbs */

  OP_FAIL = 109,
  OP_ACCEPT = 110,

  /* This is used to skip a subpattern with a {0} quantifier */

  OP_SKIPZERO = 111
};
|
801 |
|
802 |
|
/* OP_NAME_LIST expands to a comma-separated list of string literals giving a
printable name for every opcode, in opcode order. The names are a debugging aid
only, and the macro is referenced solely from pcre_printint.c. */

#define OP_NAME_LIST \
  "End", \
  "\\A", "\\G", "\\K", "\\B", "\\b", \
  "\\D", "\\d", "\\S", "\\s", "\\W", "\\w", \
  "Any", "AllAny", "Anybyte", \
  "notprop", "prop", \
  "\\R", "\\H", "\\h", "\\V", "\\v", \
  "extuni", "\\Z", "\\z", \
  "Opt", "^", "$", \
  "char", "charnc", "not", \
  /* Three runs of 13 repeat names each */ \
  "*", "*?", "+", "+?", "?", "??", \
  "{", "{", "{", \
  "*+", "++", "?+", "{", \
  "*", "*?", "+", "+?", "?", "??", \
  "{", "{", "{", \
  "*+", "++", "?+", "{", \
  "*", "*?", "+", "+?", "?", "??", \
  "{", "{", "{", \
  "*+", "++", "?+", "{", \
  /* Character class and back reference repeats (8 names) */ \
  "*", "*?", "+", "+?", "?", "??", \
  "{", "{", \
  "class", "nclass", "xclass", \
  "Ref", "Recurse", "Callout", \
  "Alt", "Ket", "KetRmax", "KetRmin", \
  "Assert", "Assert not", "AssertB", "AssertB not", "Reverse", \
  "Once", "Bra", "CBra", "Cond", \
  "SBra", "SCBra", "SCond", \
  "Cond ref", "Cond rec", "Cond def", \
  "Brazero", "Braminzero", \
  "*PRUNE", "*SKIP", "*THEN", "*COMMIT", \
  "*FAIL", "*ACCEPT", \
  "Skip zero"
|
826 |
|
827 |
|
/* OP_LENGTHS expands to a comma-separated list holding, for every opcode in
opcode order, the length of that operation in the compiled regex. The lengths
are consulted when searching the compiled code for specific things, and when
printing a compiled regex for debugging. A macro is used so that the list can
sit close to the definitions of the opcodes themselves.

As things have been extended, some of these entries are no longer fixed
lengths, but are minima instead. For example, the length of a single-character
repeat may vary in UTF-8 mode. The code that uses this table must know about
such things. */

#define OP_LENGTHS \
  1,                             /* End */ \
  1, 1, 1, 1, 1,                 /* \A, \G, \K, \B, \b */ \
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w */ \
  1, 1, 1,                       /* Any, AllAny, Anybyte */ \
  3, 3,                          /* NOTPROP, PROP */ \
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v */ \
  1,                             /* EXTUNI */ \
  1, 1,                          /* \Z, \z */ \
  2,                             /* Opt */ \
  1, 1,                          /* ^, $ */ \
  2, 2,                          /* Char, Charnc - the minimum lengths */ \
  2,                             /* not */ \
  /* Positive single-char repeats - these are minima in UTF-8 mode */ \
  2, 2, 2, 2, 2, 2,              /* *, *?, +, +?, ?, ?? */ \
  4, 4, 4,                       /* upto, minupto, exact */ \
  2, 2, 2, 4,                    /* *+, ++, ?+, upto+ */ \
  /* Negative single-char repeats - only for chars < 256 */ \
  2, 2, 2, 2, 2, 2,              /* NOT *, *?, +, +?, ?, ?? */ \
  4, 4, 4,                       /* NOT upto, minupto, exact */ \
  2, 2, 2, 4,                    /* Possessive *, +, ?, upto */ \
  /* Positive type repeats */ \
  2, 2, 2, 2, 2, 2,              /* Type *, *?, +, +?, ?, ?? */ \
  4, 4, 4,                       /* Type upto, minupto, exact */ \
  2, 2, 2, 4,                    /* Possessive *+, ++, ?+, upto+ */ \
  /* Character class & ref repeats */ \
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ?? */ \
  5, 5,                          /* CRRANGE, CRMINRANGE */ \
  33, 33,                        /* CLASS, NCLASS */ \
  0,                             /* XCLASS - variable length */ \
  3,                             /* REF */ \
  1+LINK_SIZE,                   /* RECURSE */ \
  2+2*LINK_SIZE,                 /* CALLOUT */ \
  1+LINK_SIZE, 1+LINK_SIZE,      /* Alt, Ket */ \
  1+LINK_SIZE, 1+LINK_SIZE,      /* KetRmax, KetRmin */ \
  1+LINK_SIZE, 1+LINK_SIZE,      /* Assert, Assert not */ \
  1+LINK_SIZE, 1+LINK_SIZE,      /* Assert behind, Assert behind not */ \
  1+LINK_SIZE,                   /* Reverse */ \
  1+LINK_SIZE,                   /* ONCE */ \
  1+LINK_SIZE,                   /* BRA */ \
  3+LINK_SIZE,                   /* CBRA */ \
  1+LINK_SIZE,                   /* COND */ \
  1+LINK_SIZE,                   /* SBRA */ \
  3+LINK_SIZE,                   /* SCBRA */ \
  1+LINK_SIZE,                   /* SCOND */ \
  3, 3,                          /* CREF, RREF */ \
  1,                             /* DEF */ \
  1, 1,                          /* BRAZERO, BRAMINZERO */ \
  1, 1, 1, 1,                    /* PRUNE, SKIP, THEN, COMMIT */ \
  1, 1,                          /* FAIL, ACCEPT */ \
  1                              /* SKIPZERO */
|
891 |
|
892 |
|
/* Magic value that, when held by OP_RREF, stands for the "any recursion"
condition. */

#define RREF_ANY  0xffff
|
896 |
|
/* Error code numbers. Each one is given a name (ERR0 up to ERR64) so that it
can be tracked more easily. The enumerators are numbered consecutively from
zero. */

enum {
  ERR0,  ERR1,  ERR2,  ERR3,  ERR4,
  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,
  ERR10, ERR11, ERR12, ERR13, ERR14,
  ERR15, ERR16, ERR17, ERR18, ERR19,
  ERR20, ERR21, ERR22, ERR23, ERR24,
  ERR25, ERR26, ERR27, ERR28, ERR29,
  ERR30, ERR31, ERR32, ERR33, ERR34,
  ERR35, ERR36, ERR37, ERR38, ERR39,
  ERR40, ERR41, ERR42, ERR43, ERR44,
  ERR45, ERR46, ERR47, ERR48, ERR49,
  ERR50, ERR51, ERR52, ERR53, ERR54,
  ERR55, ERR56, ERR57, ERR58, ERR59,
  ERR60, ERR61, ERR62, ERR63, ERR64
};
|
907 |
|
908 /* The real format of the start of the pcre block; the index of names and the |
|
909 code vector run on as long as necessary after the end. We store an explicit |
|
910 offset to the name table so that if a regex is compiled on one host, saved, and |
|
911 then run on another where the size of pointers is different, all might still |
|
912 be well. For the case of compiled-on-4 and run-on-8, we include an extra |
|
913 pointer that is always NULL. For future-proofing, a few dummy fields were |
|
914 originally included - even though you can never get this planning right - but |
|
915 there is only one left now. |
|
916 |
|
917 NOTE NOTE NOTE: |
|
918 Because people can now save and re-use compiled patterns, any additions to this |
|
919 structure should be made at the end, and something earlier (e.g. a new |
|
920 flag in the options or one of the dummy fields) should indicate that the new |
|
921 fields are present. Currently PCRE always sets the dummy fields to zero. |
|
922 NOTE NOTE NOTE: |
|
923 */ |
|
924 |
|
925 typedef struct real_pcre { |
|
926 pcre_uint32 magic_number; |
|
927 pcre_uint32 size; /* Total that was malloced */ |
|
928 pcre_uint32 options; /* Public options */ |
|
929 pcre_uint16 flags; /* Private flags */ |
|
930 pcre_uint16 dummy1; /* For future use */ |
|
931 pcre_uint16 top_bracket; |
|
932 pcre_uint16 top_backref; |
|
933 pcre_uint16 first_byte; |
|
934 pcre_uint16 req_byte; |
|
935 pcre_uint16 name_table_offset; /* Offset to name table that follows */ |
|
936 pcre_uint16 name_entry_size; /* Size of any name items */ |
|
937 pcre_uint16 name_count; /* Number of name items */ |
|
938 pcre_uint16 ref_count; /* Reference count */ |
|
939 |
|
940 const unsigned char *tables; /* Pointer to tables or NULL for std */ |
|
941 const unsigned char *nullpad; /* NULL padding */ |
|
942 } real_pcre; |
|
943 |
|
944 /* The format of the block used to store data from pcre_study(). The same |
|
945 remark (see NOTE above) about extending this structure applies. */ |
|
946 |
|
947 typedef struct pcre_study_data { |
|
948 pcre_uint32 size; /* Total that was malloced */ |
|
949 pcre_uint32 options; |
|
950 uschar start_bits[32]; |
|
951 } pcre_study_data; |
|
952 |
|
953 /* Structure for passing "static" information around between the functions |
|
954 doing the compiling, so that they are thread-safe. */ |
|
955 |
|
956 typedef struct compile_data { |
|
957 const uschar *lcc; /* Points to lower casing table */ |
|
958 const uschar *fcc; /* Points to case-flipping table */ |
|
959 const uschar *cbits; /* Points to character type table */ |
|
960 const uschar *ctypes; /* Points to table of type maps */ |
|
961 const uschar *start_workspace;/* The start of working space */ |
|
962 const uschar *start_code; /* The start of the compiled code */ |
|
963 const uschar *start_pattern; /* The start of the pattern */ |
|
964 const uschar *end_pattern; /* The end of the pattern */ |
|
965 uschar *hwm; /* High watermark of workspace */ |
|
966 uschar *name_table; /* The name/number table */ |
|
967 int names_found; /* Number of entries so far */ |
|
968 int name_entry_size; /* Size of each entry */ |
|
969 int bracount; /* Count of capturing parens as we compile */ |
|
970 int final_bracount; /* Saved value after first pass */ |
|
971 int top_backref; /* Maximum back reference */ |
|
972 unsigned int backref_map; /* Bitmap of low back refs */ |
|
973 int external_options; /* External (initial) options */ |
|
974 int external_flags; /* External flag bits to be set */ |
|
975 int req_varyopt; /* "After variable item" flag for reqbyte */ |
|
976 BOOL had_accept; /* (*ACCEPT) encountered */ |
|
977 int nltype; /* Newline type */ |
|
978 int nllen; /* Newline string length */ |
|
979 uschar nl[4]; /* Newline string when fixed length */ |
|
980 } compile_data; |
|
981 |
|
982 /* Structure for maintaining a chain of pointers to the currently incomplete |
|
983 branches, for testing for left recursion. */ |
|
984 |
|
985 typedef struct branch_chain { |
|
986 struct branch_chain *outer; |
|
987 uschar *current; |
|
988 } branch_chain; |
|
989 |
|
990 /* Structure for items in a linked list that represents an explicit recursive |
|
991 call within the pattern. */ |
|
992 |
|
993 typedef struct recursion_info { |
|
994 struct recursion_info *prevrec; /* Previous recursion record (or NULL) */ |
|
995 int group_num; /* Number of group that was called */ |
|
996 const uschar *after_call; /* "Return value": points after the call in the expr */ |
|
997 USPTR save_start; /* Old value of mstart */ |
|
998 int *offset_save; /* Pointer to start of saved offsets */ |
|
999 int saved_max; /* Number of saved offsets */ |
|
1000 } recursion_info; |
|
1001 |
|
1002 /* Structure for building a chain of data for holding the values of the subject |
|
1003 pointer at the start of each subpattern, so as to detect when an empty string |
|
1004 has been matched by a subpattern - to break infinite loops. */ |
|
1005 |
|
1006 typedef struct eptrblock { |
|
1007 struct eptrblock *epb_prev; |
|
1008 USPTR epb_saved_eptr; |
|
1009 } eptrblock; |
|
1010 |
|
1011 |
|
1012 /* Structure for passing "static" information around between the functions |
|
1013 doing traditional NFA matching, so that they are thread-safe. */ |
|
1014 |
|
1015 typedef struct match_data { |
|
1016 unsigned long int match_call_count; /* As it says */ |
|
1017 unsigned long int match_limit; /* As it says */ |
|
1018 unsigned long int match_limit_recursion; /* As it says */ |
|
1019 int *offset_vector; /* Offset vector */ |
|
1020 int offset_end; /* One past the end */ |
|
1021 int offset_max; /* The maximum usable for return data */ |
|
1022 int nltype; /* Newline type */ |
|
1023 int nllen; /* Newline string length */ |
|
1024 uschar nl[4]; /* Newline string when fixed */ |
|
1025 const uschar *lcc; /* Points to lower casing table */ |
|
1026 const uschar *ctypes; /* Points to table of type maps */ |
|
1027 BOOL offset_overflow; /* Set if too many extractions */ |
|
1028 BOOL notbol; /* NOTBOL flag */ |
|
1029 BOOL noteol; /* NOTEOL flag */ |
|
1030 BOOL utf8; /* UTF8 flag */ |
|
1031 BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */ |
|
1032 BOOL endonly; /* Dollar not before final \n */ |
|
1033 BOOL notempty; /* Empty string match not wanted */ |
|
1034 BOOL partial; /* PARTIAL flag */ |
|
1035 BOOL hitend; /* Hit the end of the subject at some point */ |
|
1036 BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */ |
|
1037 const uschar *start_code; /* For use when recursing */ |
|
1038 USPTR start_subject; /* Start of the subject string */ |
|
1039 USPTR end_subject; /* End of the subject string */ |
|
1040 USPTR start_match_ptr; /* Start of matched string */ |
|
1041 USPTR end_match_ptr; /* Subject position at end match */ |
|
1042 int end_offset_top; /* Highwater mark at end of match */ |
|
1043 int capture_last; /* Most recent capture number */ |
|
1044 int start_offset; /* The start offset value */ |
|
1045 eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */ |
|
1046 int eptrn; /* Next free eptrblock */ |
|
1047 recursion_info *recursive; /* Linked list of recursion data */ |
|
1048 void *callout_data; /* To pass back to callouts */ |
|
1049 } match_data; |
|
1050 |
|
1051 /* A similar structure is used for the same purpose by the DFA matching |
|
1052 functions. */ |
|
1053 |
|
1054 typedef struct dfa_match_data { |
|
1055 const uschar *start_code; /* Start of the compiled pattern */ |
|
1056 const uschar *start_subject; /* Start of the subject string */ |
|
1057 const uschar *end_subject; /* End of subject string */ |
|
1058 const uschar *tables; /* Character tables */ |
|
1059 int moptions; /* Match options */ |
|
1060 int poptions; /* Pattern options */ |
|
1061 int nltype; /* Newline type */ |
|
1062 int nllen; /* Newline string length */ |
|
1063 uschar nl[4]; /* Newline string when fixed */ |
|
1064 void *callout_data; /* To pass back to callouts */ |
|
1065 } dfa_match_data; |
|
1066 |
|
/* Bit masks used in the entries of the pcre_ctypes table. */

#define ctype_space    0x01
#define ctype_letter   0x02
#define ctype_digit    0x04
#define ctype_xdigit   0x08
#define ctype_word     0x10      /* alphanumeric or '_' */
#define ctype_meta     0x80      /* regexp meta char or zero (end pattern) */
|
1075 |
|
/* Offsets of the individual bitmap tables inside pcre_cbits. Each table holds
the set of bits for one class map, and consecutive tables start 32 apart. Some
classes are built by combining these tables. */

#define cbit_space       0       /* [:space:] or \s */
#define cbit_xdigit     32       /* [:xdigit:] */
#define cbit_digit      64       /* [:digit:] or \d */
#define cbit_upper      96       /* [:upper:] */
#define cbit_lower     128       /* [:lower:] */
#define cbit_word      160       /* [:word:] or \w */
#define cbit_graph     192       /* [:graph:] */
#define cbit_print     224       /* [:print:] */
#define cbit_punct     256       /* [:punct:] */
#define cbit_cntrl     288       /* [:cntrl:] */
#define cbit_length    320       /* Length of the cbits table */
|
1090 |
|
/* Where each of the tables starts, measured from the base tables pointer,
followed by the total length of all the tables. */

#define lcc_offset       0
#define fcc_offset       256
#define cbits_offset     512
#define ctypes_offset    (cbits_offset + cbit_length)
#define tables_length    (ctypes_offset + 256)
|
1099 |
|
1100 /* Layout of the UCP type table that translates property names into types and |
|
1101 codes. Each entry used to point directly to a name, but to reduce the number of |
|
1102 relocations in shared libraries, it now has an offset into a single string |
|
1103 instead. */ |
|
1104 |
|
1105 typedef struct { |
|
1106 pcre_uint16 name_offset; |
|
1107 pcre_uint16 type; |
|
1108 pcre_uint16 value; |
|
1109 } ucp_type_table; |
|
1110 |
|
1111 |
|
1112 /* Internal shared data tables. These are tables that are used by more than one |
|
1113 of the exported public functions. They have to be "external" in the C sense, |
|
1114 but are not part of the PCRE public API. The data for these tables is in the |
|
1115 pcre_tables.c module. */ |
|
1116 |
|
1117 extern const int _pcre_utf8_table1[]; |
|
1118 extern const int _pcre_utf8_table2[]; |
|
1119 extern const int _pcre_utf8_table3[]; |
|
1120 extern const uschar _pcre_utf8_table4[]; |
|
1121 |
|
1122 extern const int _pcre_utf8_table1_size; |
|
1123 |
|
1124 extern const char _pcre_utt_names[]; |
|
1125 extern const ucp_type_table _pcre_utt[]; |
|
1126 extern const int _pcre_utt_size; |
|
1127 |
|
1128 extern const uschar _pcre_default_tables[]; |
|
1129 |
|
1130 extern const uschar _pcre_OP_lengths[]; |
|
1131 |
|
1132 |
|
1133 /* Internal shared functions. These are functions that are used by more than |
|
1134 one of the exported public functions. They have to be "external" in the C |
|
1135 sense, but are not part of the PCRE public API. */ |
|
1136 |
|
1137 extern BOOL _pcre_is_newline(const uschar *, int, const uschar *, |
|
1138 int *, BOOL); |
|
1139 extern int _pcre_ord2utf8(int, uschar *); |
|
1140 extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *, |
|
1141 const pcre_study_data *, pcre_study_data *); |
|
1142 extern int _pcre_valid_utf8(const uschar *, int); |
|
1143 extern BOOL _pcre_was_newline(const uschar *, int, const uschar *, |
|
1144 int *, BOOL); |
|
1145 extern BOOL _pcre_xclass(int, const uschar *); |
|
1146 |
|
1147 |
|
1148 /* Unicode character database (UCD) */ |
|
1149 |
|
1150 typedef struct { |
|
1151 uschar script; |
|
1152 uschar chartype; |
|
1153 pcre_int32 other_case; |
|
1154 } ucd_record; |
|
1155 |
|
1156 extern const ucd_record _pcre_ucd_records[]; |
|
1157 extern const uschar _pcre_ucd_stage1[]; |
|
1158 extern const pcre_uint16 _pcre_ucd_stage2[]; |
|
1159 extern const int _pcre_ucp_gentype[]; |
|
1160 |
|
1161 |
|
/* UCD access macros.

GET_UCD(ch) yields a pointer to the ucd_record for character ch, found by a
two-stage lookup: _pcre_ucd_stage1 is indexed by the block number
(ch / UCD_BLOCK_SIZE); the value found there, multiplied by UCD_BLOCK_SIZE and
offset by the position within the block (ch % UCD_BLOCK_SIZE), indexes
_pcre_ucd_stage2, whose entry in turn indexes _pcre_ucd_records.

UCD_CHARTYPE and UCD_SCRIPT read the corresponding fields of that record,
UCD_CATEGORY maps the chartype through _pcre_ucp_gentype, and UCD_OTHERCASE
adds the record's other_case value to ch.

Every use of the macro argument is parenthesized, so that an expression
argument such as "c + 100" or "c & 0x7f" is treated as a single value instead
of being split up by operator precedence. The argument is still expanded more
than once, so it must not have side effects. */

#define UCD_BLOCK_SIZE 128
#define GET_UCD(ch) (_pcre_ucd_records + \
        _pcre_ucd_stage2[_pcre_ucd_stage1[(ch) / UCD_BLOCK_SIZE] * \
        UCD_BLOCK_SIZE + (ch) % UCD_BLOCK_SIZE])

#define UCD_CHARTYPE(ch)  (GET_UCD(ch)->chartype)
#define UCD_SCRIPT(ch)    (GET_UCD(ch)->script)
#define UCD_CATEGORY(ch)  (_pcre_ucp_gentype[UCD_CHARTYPE(ch)])
#define UCD_OTHERCASE(ch) ((ch) + GET_UCD(ch)->other_case)
|
1173 |
|
1174 #endif |
|
1175 |
|
1176 /* End of pcre_internal.h */ |