|
1 /* This is JavaScriptCore's variant of the PCRE library. While this library |
|
2 started out as a copy of PCRE, many of the features of PCRE have been |
|
3 removed. This library now supports only the regular expression features |
|
4 required by the JavaScript language specification, and has only the functions |
|
5 needed by JavaScriptCore and the rest of WebKit. |
|
6 |
|
7 Originally written by Philip Hazel |
|
8 Copyright (c) 1997-2006 University of Cambridge |
|
9 Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. |
|
10 |
|
11 ----------------------------------------------------------------------------- |
|
12 Redistribution and use in source and binary forms, with or without |
|
13 modification, are permitted provided that the following conditions are met: |
|
14 |
|
15 * Redistributions of source code must retain the above copyright notice, |
|
16 this list of conditions and the following disclaimer. |
|
17 |
|
18 * Redistributions in binary form must reproduce the above copyright |
|
19 notice, this list of conditions and the following disclaimer in the |
|
20 documentation and/or other materials provided with the distribution. |
|
21 |
|
22 * Neither the name of the University of Cambridge nor the names of its |
|
23 contributors may be used to endorse or promote products derived from |
|
24 this software without specific prior written permission. |
|
25 |
|
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
|
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|
36 POSSIBILITY OF SUCH DAMAGE. |
|
37 ----------------------------------------------------------------------------- |
|
38 */ |
|
39 |
|
40 /* This header contains definitions that are shared between the different |
|
41 modules, but which are not relevant to the exported API. This includes some |
|
42 functions whose names all begin with "_pcre_". */ |
|
43 |
|
44 #ifndef PCRE_INTERNAL_H |
|
45 #define PCRE_INTERNAL_H |
|
46 |
|
47 /* Bit definitions for entries in the pcre_ctypes table. */ |
|
48 |
|
49 #define ctype_space 0x01 |
|
50 #define ctype_xdigit 0x08 |
|
51 #define ctype_word 0x10 /* alphameric or '_' */ |
|
52 |
|
53 /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set |
|
54 of bits for a class map. Some classes are built by combining these tables. */ |
|
55 |
|
56 #define cbit_space 0 /* \s */ |
|
57 #define cbit_digit 32 /* \d */ |
|
58 #define cbit_word 64 /* \w */ |
|
59 #define cbit_length 96 /* Length of the cbits table */ |
|
60 |
|
61 /* Offsets of the various tables from the base tables pointer, and |
|
62 total length. */ |
|
63 |
|
64 #define lcc_offset 0 |
|
65 #define fcc_offset 128 |
|
66 #define cbits_offset 256 |
|
67 #define ctypes_offset (cbits_offset + cbit_length) |
|
68 #define tables_length (ctypes_offset + 128) |
|
69 |
|
70 #ifndef DFTABLES |
|
71 |
|
72 // Change the following to 1 to dump used regular expressions at process exit time. |
|
73 #define REGEXP_HISTOGRAM 0 |
|
74 |
|
75 #include "Assertions.h" |
|
76 |
|
77 #if COMPILER(MSVC) |
|
78 #pragma warning(disable: 4232) |
|
79 #pragma warning(disable: 4244) |
|
80 #endif |
|
81 |
|
82 #include "pcre.h" |
|
83 |
|
84 /* The value of LINK_SIZE determines the number of bytes used to store links as |
|
85 offsets within the compiled regex. The default is 2, which allows for compiled |
|
86 patterns up to 64K long. */ |
|
87 |
|
88 #define LINK_SIZE 3 |
|
89 |
|
90 /* Define DEBUG to get debugging output on stdout. */ |
|
91 |
|
92 #if 0 |
|
93 #define DEBUG |
|
94 #endif |
|
95 |
|
96 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef |
|
97 inline, and there are *still* stupid compilers about that don't like indented |
|
98 pre-processor statements, or at least there were when I first wrote this. After |
|
99 all, it had only been about 10 years then... */ |
|
100 |
|
101 #ifdef DEBUG |
|
102 #define DPRINTF(p) printf p |
|
103 #else |
|
104 #define DPRINTF(p) /*nothing*/ |
|
105 #endif |
|
106 |
|
107 /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored |
|
108 in big-endian order) by default. These are used, for example, to link from the |
|
109 start of a subpattern to its alternatives and its end. The use of 2 bytes per |
|
110 offset limits the size of the compiled regex to around 64K, which is big enough |
|
111 for almost everybody. However, I received a request for an even bigger limit. |
|
112 For this reason, and also to make the code easier to maintain, the storing and |
|
113 loading of offsets from the byte string is now handled by the functions that are |
|
114 defined here. */ |
|
115 |
|
116 /* PCRE uses some other 2-byte quantities that do not change when the size of |
|
117 offsets changes. There are used for repeat counts and for other things such as |
|
118 capturing parenthesis numbers in back references. */ |
|
119 |
|
120 static inline void put2ByteValue(unsigned char* opcodePtr, int value) |
|
121 { |
|
122 ASSERT(value >= 0 && value <= 0xFFFF); |
|
123 opcodePtr[0] = value >> 8; |
|
124 opcodePtr[1] = value; |
|
125 } |
|
126 |
|
127 static inline void put3ByteValue(unsigned char* opcodePtr, int value) |
|
128 { |
|
129 ASSERT(value >= 0 && value <= 0xFFFFFF); |
|
130 opcodePtr[0] = value >> 16; |
|
131 opcodePtr[1] = value >> 8; |
|
132 opcodePtr[2] = value; |
|
133 } |
|
134 |
|
135 static inline int get2ByteValue(const unsigned char* opcodePtr) |
|
136 { |
|
137 return (opcodePtr[0] << 8) | opcodePtr[1]; |
|
138 } |
|
139 |
|
140 static inline int get3ByteValue(const unsigned char* opcodePtr) |
|
141 { |
|
142 return (opcodePtr[0] << 16) | (opcodePtr[1] << 8) | opcodePtr[2]; |
|
143 } |
|
144 |
|
145 static inline void put2ByteValueAndAdvance(unsigned char*& opcodePtr, int value) |
|
146 { |
|
147 put2ByteValue(opcodePtr, value); |
|
148 opcodePtr += 2; |
|
149 } |
|
150 |
|
151 static inline void put3ByteValueAndAdvance(unsigned char*& opcodePtr, int value) |
|
152 { |
|
153 put3ByteValue(opcodePtr, value); |
|
154 opcodePtr += 3; |
|
155 } |
|
156 |
|
157 static inline void putLinkValueAllowZero(unsigned char* opcodePtr, int value) |
|
158 { |
|
159 #if LINK_SIZE == 3 |
|
160 put3ByteValue(opcodePtr, value); |
|
161 #elif LINK_SIZE == 2 |
|
162 put2ByteValue(opcodePtr, value); |
|
163 #else |
|
164 # error LINK_SIZE not supported. |
|
165 #endif |
|
166 } |
|
167 |
|
168 static inline int getLinkValueAllowZero(const unsigned char* opcodePtr) |
|
169 { |
|
170 #if LINK_SIZE == 3 |
|
171 return get3ByteValue(opcodePtr); |
|
172 #elif LINK_SIZE == 2 |
|
173 return get2ByteValue(opcodePtr); |
|
174 #else |
|
175 # error LINK_SIZE not supported. |
|
176 #endif |
|
177 } |
|
178 |
|
179 #define MAX_PATTERN_SIZE 1024 * 1024 // Derived by empirical testing of compile time in PCRE and WREC. |
|
180 COMPILE_ASSERT(MAX_PATTERN_SIZE < (1 << (8 * LINK_SIZE)), pcre_max_pattern_fits_in_bytecode); |
|
181 |
|
182 static inline void putLinkValue(unsigned char* opcodePtr, int value) |
|
183 { |
|
184 ASSERT(value); |
|
185 putLinkValueAllowZero(opcodePtr, value); |
|
186 } |
|
187 |
|
188 static inline int getLinkValue(const unsigned char* opcodePtr) |
|
189 { |
|
190 int value = getLinkValueAllowZero(opcodePtr); |
|
191 ASSERT(value); |
|
192 return value; |
|
193 } |
|
194 |
|
195 static inline void putLinkValueAndAdvance(unsigned char*& opcodePtr, int value) |
|
196 { |
|
197 putLinkValue(opcodePtr, value); |
|
198 opcodePtr += LINK_SIZE; |
|
199 } |
|
200 |
|
201 static inline void putLinkValueAllowZeroAndAdvance(unsigned char*& opcodePtr, int value) |
|
202 { |
|
203 putLinkValueAllowZero(opcodePtr, value); |
|
204 opcodePtr += LINK_SIZE; |
|
205 } |
|
206 |
|
207 // FIXME: These are really more of a "compiled regexp state" than "regexp options" |
|
208 enum RegExpOptions { |
|
209 UseFirstByteOptimizationOption = 0x40000000, /* firstByte is set */ |
|
210 UseRequiredByteOptimizationOption = 0x20000000, /* reqByte is set */ |
|
211 UseMultiLineFirstByteOptimizationOption = 0x10000000, /* start after \n for multiline */ |
|
212 IsAnchoredOption = 0x02000000, /* can't use partial with this regex */ |
|
213 IgnoreCaseOption = 0x00000001, |
|
214 MatchAcrossMultipleLinesOption = 0x00000002 |
|
215 }; |
|
216 |
|
217 /* Flags added to firstByte or reqByte; a "non-literal" item is either a |
|
218 variable-length repeat, or a anything other than literal characters. */ |
|
219 |
|
220 #define REQ_IGNORE_CASE 0x0100 /* indicates should ignore case */ |
|
221 #define REQ_VARY 0x0200 /* reqByte followed non-literal item */ |
|
222 |
|
223 /* Miscellaneous definitions */ |
|
224 |
|
225 /* Flag bits and data types for the extended class (OP_XCLASS) for classes that |
|
226 contain UTF-8 characters with values greater than 255. */ |
|
227 |
|
228 #define XCL_NOT 0x01 /* Flag: this is a negative class */ |
|
229 #define XCL_MAP 0x02 /* Flag: a 32-byte map is present */ |
|
230 |
|
231 #define XCL_END 0 /* Marks end of individual items */ |
|
232 #define XCL_SINGLE 1 /* Single item (one multibyte char) follows */ |
|
233 #define XCL_RANGE 2 /* A range (two multibyte chars) follows */ |
|
234 |
|
235 /* These are escaped items that aren't just an encoding of a particular data |
|
236 value such as \n. They must have non-zero values, as check_escape() returns |
|
237 their negation. Also, they must appear in the same order as in the opcode |
|
238 definitions below, up to ESC_w. The final one must be |
|
239 ESC_REF as subsequent values are used for \1, \2, \3, etc. There is are two |
|
240 tests in the code for an escape > ESC_b and <= ESC_w to |
|
241 detect the types that may be repeated. These are the types that consume |
|
242 characters. If any new escapes are put in between that don't consume a |
|
243 character, that code will have to change. */ |
|
244 |
|
245 enum { ESC_B = 1, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, ESC_w, ESC_REF }; |
|
246 |
|
247 /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets |
|
248 that extract substrings. Starting from 1 (i.e. after OP_END), the values up to |
|
249 OP_EOD must correspond in order to the list of escapes immediately above. |
|
250 Note that whenever this list is updated, the two macro definitions that follow |
|
251 must also be updated to match. */ |
|
252 |
|
253 #define FOR_EACH_OPCODE(macro) \ |
|
254 macro(END) \ |
|
255 \ |
|
256 macro(NOT_WORD_BOUNDARY) \ |
|
257 macro(WORD_BOUNDARY) \ |
|
258 macro(NOT_DIGIT) \ |
|
259 macro(DIGIT) \ |
|
260 macro(NOT_WHITESPACE) \ |
|
261 macro(WHITESPACE) \ |
|
262 macro(NOT_WORDCHAR) \ |
|
263 macro(WORDCHAR) \ |
|
264 \ |
|
265 macro(NOT_NEWLINE) \ |
|
266 \ |
|
267 macro(CIRC) \ |
|
268 macro(DOLL) \ |
|
269 macro(BOL) \ |
|
270 macro(EOL) \ |
|
271 macro(CHAR) \ |
|
272 macro(CHAR_IGNORING_CASE) \ |
|
273 macro(ASCII_CHAR) \ |
|
274 macro(ASCII_LETTER_IGNORING_CASE) \ |
|
275 macro(NOT) \ |
|
276 \ |
|
277 macro(STAR) \ |
|
278 macro(MINSTAR) \ |
|
279 macro(PLUS) \ |
|
280 macro(MINPLUS) \ |
|
281 macro(QUERY) \ |
|
282 macro(MINQUERY) \ |
|
283 macro(UPTO) \ |
|
284 macro(MINUPTO) \ |
|
285 macro(EXACT) \ |
|
286 \ |
|
287 macro(NOTSTAR) \ |
|
288 macro(NOTMINSTAR) \ |
|
289 macro(NOTPLUS) \ |
|
290 macro(NOTMINPLUS) \ |
|
291 macro(NOTQUERY) \ |
|
292 macro(NOTMINQUERY) \ |
|
293 macro(NOTUPTO) \ |
|
294 macro(NOTMINUPTO) \ |
|
295 macro(NOTEXACT) \ |
|
296 \ |
|
297 macro(TYPESTAR) \ |
|
298 macro(TYPEMINSTAR) \ |
|
299 macro(TYPEPLUS) \ |
|
300 macro(TYPEMINPLUS) \ |
|
301 macro(TYPEQUERY) \ |
|
302 macro(TYPEMINQUERY) \ |
|
303 macro(TYPEUPTO) \ |
|
304 macro(TYPEMINUPTO) \ |
|
305 macro(TYPEEXACT) \ |
|
306 \ |
|
307 macro(CRSTAR) \ |
|
308 macro(CRMINSTAR) \ |
|
309 macro(CRPLUS) \ |
|
310 macro(CRMINPLUS) \ |
|
311 macro(CRQUERY) \ |
|
312 macro(CRMINQUERY) \ |
|
313 macro(CRRANGE) \ |
|
314 macro(CRMINRANGE) \ |
|
315 \ |
|
316 macro(CLASS) \ |
|
317 macro(NCLASS) \ |
|
318 macro(XCLASS) \ |
|
319 \ |
|
320 macro(REF) \ |
|
321 \ |
|
322 macro(ALT) \ |
|
323 macro(KET) \ |
|
324 macro(KETRMAX) \ |
|
325 macro(KETRMIN) \ |
|
326 \ |
|
327 macro(ASSERT) \ |
|
328 macro(ASSERT_NOT) \ |
|
329 \ |
|
330 macro(BRAZERO) \ |
|
331 macro(BRAMINZERO) \ |
|
332 macro(BRANUMBER) \ |
|
333 macro(BRA) |
|
334 |
|
335 #define OPCODE_ENUM_VALUE(opcode) OP_##opcode, |
|
336 enum { FOR_EACH_OPCODE(OPCODE_ENUM_VALUE) }; |
|
337 |
|
338 /* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and |
|
339 study.c that all opcodes are less than 128 in value. This makes handling UTF-8 |
|
340 character sequences easier. */ |
|
341 |
|
342 /* The highest extraction number before we have to start using additional |
|
343 bytes. (Originally PCRE didn't have support for extraction counts higher than |
|
344 this number.) The value is limited by the number of opcodes left after OP_BRA, |
|
345 i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional |
|
346 opcodes. */ |
|
347 |
|
348 /* FIXME: Note that OP_BRA + 100 is > 128, so the two comments above |
|
349 are in conflict! */ |
|
350 |
|
351 #define EXTRACT_BASIC_MAX 100 |
|
352 |
|
353 /* The code vector runs on as long as necessary after the end. */ |
|
354 |
|
355 struct JSRegExp { |
|
356 unsigned options; |
|
357 |
|
358 unsigned short topBracket; |
|
359 unsigned short topBackref; |
|
360 |
|
361 unsigned short firstByte; |
|
362 unsigned short reqByte; |
|
363 |
|
364 #if REGEXP_HISTOGRAM |
|
365 size_t stringOffset; |
|
366 size_t stringLength; |
|
367 #endif |
|
368 }; |
|
369 |
|
370 /* Internal shared data tables. These are tables that are used by more than one |
|
371 of the exported public functions. They have to be "external" in the C sense, |
|
372 but are not part of the PCRE public API. The data for these tables is in the |
|
373 pcre_tables.c module. */ |
|
374 |
|
375 #define jsc_pcre_utf8_table1_size 6 |
|
376 |
|
377 extern const int jsc_pcre_utf8_table1[6]; |
|
378 extern const int jsc_pcre_utf8_table2[6]; |
|
379 extern const int jsc_pcre_utf8_table3[6]; |
|
380 extern const unsigned char jsc_pcre_utf8_table4[0x40]; |
|
381 |
|
382 extern const unsigned char jsc_pcre_default_tables[tables_length]; |
|
383 |
|
384 static inline unsigned char toLowerCase(unsigned char c) |
|
385 { |
|
386 static const unsigned char* lowerCaseChars = jsc_pcre_default_tables + lcc_offset; |
|
387 return lowerCaseChars[c]; |
|
388 } |
|
389 |
|
390 static inline unsigned char flipCase(unsigned char c) |
|
391 { |
|
392 static const unsigned char* flippedCaseChars = jsc_pcre_default_tables + fcc_offset; |
|
393 return flippedCaseChars[c]; |
|
394 } |
|
395 |
|
396 static inline unsigned char classBitmapForChar(unsigned char c) |
|
397 { |
|
398 static const unsigned char* charClassBitmaps = jsc_pcre_default_tables + cbits_offset; |
|
399 return charClassBitmaps[c]; |
|
400 } |
|
401 |
|
402 static inline unsigned char charTypeForChar(unsigned char c) |
|
403 { |
|
404 const unsigned char* charTypeMap = jsc_pcre_default_tables + ctypes_offset; |
|
405 return charTypeMap[c]; |
|
406 } |
|
407 |
|
408 static inline bool isWordChar(UChar c) |
|
409 { |
|
410 return c < 128 && (charTypeForChar(c) & ctype_word); |
|
411 } |
|
412 |
|
413 static inline bool isSpaceChar(UChar c) |
|
414 { |
|
415 return (c < 128 && (charTypeForChar(c) & ctype_space)) || c == 0x00A0; |
|
416 } |
|
417 |
|
418 static inline bool isNewline(UChar nl) |
|
419 { |
|
420 return (nl == 0xA || nl == 0xD || nl == 0x2028 || nl == 0x2029); |
|
421 } |
|
422 |
|
423 static inline bool isBracketStartOpcode(unsigned char opcode) |
|
424 { |
|
425 if (opcode >= OP_BRA) |
|
426 return true; |
|
427 switch (opcode) { |
|
428 case OP_ASSERT: |
|
429 case OP_ASSERT_NOT: |
|
430 return true; |
|
431 default: |
|
432 return false; |
|
433 } |
|
434 } |
|
435 |
|
436 static inline void advanceToEndOfBracket(const unsigned char*& opcodePtr) |
|
437 { |
|
438 ASSERT(isBracketStartOpcode(*opcodePtr) || *opcodePtr == OP_ALT); |
|
439 do |
|
440 opcodePtr += getLinkValue(opcodePtr + 1); |
|
441 while (*opcodePtr == OP_ALT); |
|
442 } |
|
443 |
|
444 /* Internal shared functions. These are functions that are used in more |
|
445 that one of the source files. They have to have external linkage, but |
|
446 but are not part of the public API and so not exported from the library. */ |
|
447 |
|
448 extern int jsc_pcre_ucp_othercase(unsigned); |
|
449 extern bool jsc_pcre_xclass(int, const unsigned char*); |
|
450 |
|
451 #endif |
|
452 |
|
453 #endif |
|
454 |
|
455 /* End of pcre_internal.h */ |