|
/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
           Copyright (c) 1997-2008 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
|
39 |
|
40 |
|
/* This module contains some fixed tables that are used by more than one of the
PCRE code modules. The tables are also #included by the pcretest program, which
uses macros to change their names from _pcre_xxx to xxxx, thereby avoiding name
clashes with the library. */
|
45 |
|
46 |
|
47 #ifdef HAVE_CONFIG_H |
|
48 #include "config.h" |
|
49 #endif |
|
50 |
|
51 #include "pcre_internal.h" |
|
52 |
|
53 |
|
54 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that |
|
55 the definition is next to the definition of the opcodes in pcre_internal.h. */ |
|
56 |
|
57 const uschar _pcre_OP_lengths[] = { OP_LENGTHS }; |
|
58 |
|
59 |
|
60 |
|
/*************************************************
*            Tables for UTF-8 support            *
*************************************************/
|
64 |
|
/* These are the breakpoints for different numbers of bytes in a UTF-8
character. */
|
67 |
|
68 #ifdef SUPPORT_UTF8 |
|
69 |
|
/* Breakpoints for different numbers of bytes in a UTF-8 character, in
ascending order: entry i is the largest value that can be encoded in i+1
bytes. */

const int _pcre_utf8_table1[] =
  { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};

/* Number of entries in _pcre_utf8_table1. The divisor is taken from the array
itself so the count stays right if the element type is ever changed. */

const int _pcre_utf8_table1_size =
  sizeof(_pcre_utf8_table1)/sizeof(_pcre_utf8_table1[0]);
|
74 |
|
/* These are the indicator bits and the mask for the data bits to set in the
first byte of a character, indexed by the number of additional bytes.
_pcre_utf8_table2 holds the indicator bits and _pcre_utf8_table3 the matching
data-bit mask; for every non-zero index the two values share no bits. */

const int _pcre_utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
const int _pcre_utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
|
80 |
|
81 /* Table of the number of extra bytes, indexed by the first byte masked with |
|
82 0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */ |
|
83 |
|
84 const uschar _pcre_utf8_table4[] = { |
|
85 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|
86 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|
87 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
|
88 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; |
|
89 |
|
90 /* Table to translate from particular type value to the general value. */ |
|
91 |
|
92 const int _pcre_ucp_gentype[] = { |
|
93 ucp_C, ucp_C, ucp_C, ucp_C, ucp_C, /* Cc, Cf, Cn, Co, Cs */ |
|
94 ucp_L, ucp_L, ucp_L, ucp_L, ucp_L, /* Ll, Lu, Lm, Lo, Lt */ |
|
95 ucp_M, ucp_M, ucp_M, /* Mc, Me, Mn */ |
|
96 ucp_N, ucp_N, ucp_N, /* Nd, Nl, No */ |
|
97 ucp_P, ucp_P, ucp_P, ucp_P, ucp_P, /* Pc, Pd, Pe, Pf, Pi */ |
|
98 ucp_P, ucp_P, /* Ps, Po */ |
|
99 ucp_S, ucp_S, ucp_S, ucp_S, /* Sc, Sk, Sm, So */ |
|
100 ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */ |
|
101 }; |
|
102 |
|
/* The pcre_utt[] table below translates Unicode property names into type and
code values. It is searched by binary chop, so must be in collating sequence of
name. Originally, the table contained pointers to the name strings in the first
field of each entry. However, that leads to a large number of relocations when
a shared library is dynamically loaded. A significant reduction is made by
putting all the names into a single, large string and then using offsets in the
table itself. Maintenance is more error-prone, but frequent changes to this
data are unlikely.

July 2008: There is now a script called maint/GenerateUtt.py which can be used
to generate this data instead of maintaining it entirely by hand.

Every name below is followed by an explicit NUL, so a name can be read as an
ordinary C string starting at its offset within this array. */

const char _pcre_utt_names[] =
  "Any\0"
  "Arabic\0"
  "Armenian\0"
  "Balinese\0"
  "Bengali\0"
  "Bopomofo\0"
  "Braille\0"
  "Buginese\0"
  "Buhid\0"
  "C\0"
  "Canadian_Aboriginal\0"
  "Carian\0"
  "Cc\0"
  "Cf\0"
  "Cham\0"
  "Cherokee\0"
  "Cn\0"
  "Co\0"
  "Common\0"
  "Coptic\0"
  "Cs\0"
  "Cuneiform\0"
  "Cypriot\0"
  "Cyrillic\0"
  "Deseret\0"
  "Devanagari\0"
  "Ethiopic\0"
  "Georgian\0"
  "Glagolitic\0"
  "Gothic\0"
  "Greek\0"
  "Gujarati\0"
  "Gurmukhi\0"
  "Han\0"
  "Hangul\0"
  "Hanunoo\0"
  "Hebrew\0"
  "Hiragana\0"
  "Inherited\0"
  "Kannada\0"
  "Katakana\0"
  "Kayah_Li\0"
  "Kharoshthi\0"
  "Khmer\0"
  "L\0"
  "L&\0"
  "Lao\0"
  "Latin\0"
  "Lepcha\0"
  "Limbu\0"
  "Linear_B\0"
  "Ll\0"
  "Lm\0"
  "Lo\0"
  "Lt\0"
  "Lu\0"
  "Lycian\0"
  "Lydian\0"
  "M\0"
  "Malayalam\0"
  "Mc\0"
  "Me\0"
  "Mn\0"
  "Mongolian\0"
  "Myanmar\0"
  "N\0"
  "Nd\0"
  "New_Tai_Lue\0"
  "Nko\0"
  "Nl\0"
  "No\0"
  "Ogham\0"
  "Ol_Chiki\0"
  "Old_Italic\0"
  "Old_Persian\0"
  "Oriya\0"
  "Osmanya\0"
  "P\0"
  "Pc\0"
  "Pd\0"
  "Pe\0"
  "Pf\0"
  "Phags_Pa\0"
  "Phoenician\0"
  "Pi\0"
  "Po\0"
  "Ps\0"
  "Rejang\0"
  "Runic\0"
  "S\0"
  "Saurashtra\0"
  "Sc\0"
  "Shavian\0"
  "Sinhala\0"
  "Sk\0"
  "Sm\0"
  "So\0"
  "Sundanese\0"
  "Syloti_Nagri\0"
  "Syriac\0"
  "Tagalog\0"
  "Tagbanwa\0"
  "Tai_Le\0"
  "Tamil\0"
  "Telugu\0"
  "Thaana\0"
  "Thai\0"
  "Tibetan\0"
  "Tifinagh\0"
  "Ugaritic\0"
  "Vai\0"
  "Yi\0"
  "Z\0"
  "Zl\0"
  "Zp\0"
  "Zs\0";
|
232 |
|
233 const ucp_type_table _pcre_utt[] = { |
|
234 { 0, PT_ANY, 0 }, |
|
235 { 4, PT_SC, ucp_Arabic }, |
|
236 { 11, PT_SC, ucp_Armenian }, |
|
237 { 20, PT_SC, ucp_Balinese }, |
|
238 { 29, PT_SC, ucp_Bengali }, |
|
239 { 37, PT_SC, ucp_Bopomofo }, |
|
240 { 46, PT_SC, ucp_Braille }, |
|
241 { 54, PT_SC, ucp_Buginese }, |
|
242 { 63, PT_SC, ucp_Buhid }, |
|
243 { 69, PT_GC, ucp_C }, |
|
244 { 71, PT_SC, ucp_Canadian_Aboriginal }, |
|
245 { 91, PT_SC, ucp_Carian }, |
|
246 { 98, PT_PC, ucp_Cc }, |
|
247 { 101, PT_PC, ucp_Cf }, |
|
248 { 104, PT_SC, ucp_Cham }, |
|
249 { 109, PT_SC, ucp_Cherokee }, |
|
250 { 118, PT_PC, ucp_Cn }, |
|
251 { 121, PT_PC, ucp_Co }, |
|
252 { 124, PT_SC, ucp_Common }, |
|
253 { 131, PT_SC, ucp_Coptic }, |
|
254 { 138, PT_PC, ucp_Cs }, |
|
255 { 141, PT_SC, ucp_Cuneiform }, |
|
256 { 151, PT_SC, ucp_Cypriot }, |
|
257 { 159, PT_SC, ucp_Cyrillic }, |
|
258 { 168, PT_SC, ucp_Deseret }, |
|
259 { 176, PT_SC, ucp_Devanagari }, |
|
260 { 187, PT_SC, ucp_Ethiopic }, |
|
261 { 196, PT_SC, ucp_Georgian }, |
|
262 { 205, PT_SC, ucp_Glagolitic }, |
|
263 { 216, PT_SC, ucp_Gothic }, |
|
264 { 223, PT_SC, ucp_Greek }, |
|
265 { 229, PT_SC, ucp_Gujarati }, |
|
266 { 238, PT_SC, ucp_Gurmukhi }, |
|
267 { 247, PT_SC, ucp_Han }, |
|
268 { 251, PT_SC, ucp_Hangul }, |
|
269 { 258, PT_SC, ucp_Hanunoo }, |
|
270 { 266, PT_SC, ucp_Hebrew }, |
|
271 { 273, PT_SC, ucp_Hiragana }, |
|
272 { 282, PT_SC, ucp_Inherited }, |
|
273 { 292, PT_SC, ucp_Kannada }, |
|
274 { 300, PT_SC, ucp_Katakana }, |
|
275 { 309, PT_SC, ucp_Kayah_Li }, |
|
276 { 318, PT_SC, ucp_Kharoshthi }, |
|
277 { 329, PT_SC, ucp_Khmer }, |
|
278 { 335, PT_GC, ucp_L }, |
|
279 { 337, PT_LAMP, 0 }, |
|
280 { 340, PT_SC, ucp_Lao }, |
|
281 { 344, PT_SC, ucp_Latin }, |
|
282 { 350, PT_SC, ucp_Lepcha }, |
|
283 { 357, PT_SC, ucp_Limbu }, |
|
284 { 363, PT_SC, ucp_Linear_B }, |
|
285 { 372, PT_PC, ucp_Ll }, |
|
286 { 375, PT_PC, ucp_Lm }, |
|
287 { 378, PT_PC, ucp_Lo }, |
|
288 { 381, PT_PC, ucp_Lt }, |
|
289 { 384, PT_PC, ucp_Lu }, |
|
290 { 387, PT_SC, ucp_Lycian }, |
|
291 { 394, PT_SC, ucp_Lydian }, |
|
292 { 401, PT_GC, ucp_M }, |
|
293 { 403, PT_SC, ucp_Malayalam }, |
|
294 { 413, PT_PC, ucp_Mc }, |
|
295 { 416, PT_PC, ucp_Me }, |
|
296 { 419, PT_PC, ucp_Mn }, |
|
297 { 422, PT_SC, ucp_Mongolian }, |
|
298 { 432, PT_SC, ucp_Myanmar }, |
|
299 { 440, PT_GC, ucp_N }, |
|
300 { 442, PT_PC, ucp_Nd }, |
|
301 { 445, PT_SC, ucp_New_Tai_Lue }, |
|
302 { 457, PT_SC, ucp_Nko }, |
|
303 { 461, PT_PC, ucp_Nl }, |
|
304 { 464, PT_PC, ucp_No }, |
|
305 { 467, PT_SC, ucp_Ogham }, |
|
306 { 473, PT_SC, ucp_Ol_Chiki }, |
|
307 { 482, PT_SC, ucp_Old_Italic }, |
|
308 { 493, PT_SC, ucp_Old_Persian }, |
|
309 { 505, PT_SC, ucp_Oriya }, |
|
310 { 511, PT_SC, ucp_Osmanya }, |
|
311 { 519, PT_GC, ucp_P }, |
|
312 { 521, PT_PC, ucp_Pc }, |
|
313 { 524, PT_PC, ucp_Pd }, |
|
314 { 527, PT_PC, ucp_Pe }, |
|
315 { 530, PT_PC, ucp_Pf }, |
|
316 { 533, PT_SC, ucp_Phags_Pa }, |
|
317 { 542, PT_SC, ucp_Phoenician }, |
|
318 { 553, PT_PC, ucp_Pi }, |
|
319 { 556, PT_PC, ucp_Po }, |
|
320 { 559, PT_PC, ucp_Ps }, |
|
321 { 562, PT_SC, ucp_Rejang }, |
|
322 { 569, PT_SC, ucp_Runic }, |
|
323 { 575, PT_GC, ucp_S }, |
|
324 { 577, PT_SC, ucp_Saurashtra }, |
|
325 { 588, PT_PC, ucp_Sc }, |
|
326 { 591, PT_SC, ucp_Shavian }, |
|
327 { 599, PT_SC, ucp_Sinhala }, |
|
328 { 607, PT_PC, ucp_Sk }, |
|
329 { 610, PT_PC, ucp_Sm }, |
|
330 { 613, PT_PC, ucp_So }, |
|
331 { 616, PT_SC, ucp_Sundanese }, |
|
332 { 626, PT_SC, ucp_Syloti_Nagri }, |
|
333 { 639, PT_SC, ucp_Syriac }, |
|
334 { 646, PT_SC, ucp_Tagalog }, |
|
335 { 654, PT_SC, ucp_Tagbanwa }, |
|
336 { 663, PT_SC, ucp_Tai_Le }, |
|
337 { 670, PT_SC, ucp_Tamil }, |
|
338 { 676, PT_SC, ucp_Telugu }, |
|
339 { 683, PT_SC, ucp_Thaana }, |
|
340 { 690, PT_SC, ucp_Thai }, |
|
341 { 695, PT_SC, ucp_Tibetan }, |
|
342 { 703, PT_SC, ucp_Tifinagh }, |
|
343 { 712, PT_SC, ucp_Ugaritic }, |
|
344 { 721, PT_SC, ucp_Vai }, |
|
345 { 725, PT_SC, ucp_Yi }, |
|
346 { 728, PT_GC, ucp_Z }, |
|
347 { 730, PT_PC, ucp_Zl }, |
|
348 { 733, PT_PC, ucp_Zp }, |
|
349 { 736, PT_PC, ucp_Zs } |
|
350 }; |
|
351 |
|
352 const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table); |
|
353 |
|
354 #endif /* SUPPORT_UTF8 */ |
|
355 |
|
/* End of pcre_tables.c */