|
1 /************************************************* |
|
2 * Perl-Compatible Regular Expressions * |
|
3 *************************************************/ |
|
4 |
|
5 /* PCRE is a library of functions to support regular expressions whose syntax |
|
6 and semantics are as close as possible to those of the Perl 5 language. |
|
7 |
|
8 Written by Philip Hazel |
|
9 Copyright (c) 1997-2008 University of Cambridge |
|
10 |
|
11 ----------------------------------------------------------------------------- |
|
12 Redistribution and use in source and binary forms, with or without |
|
13 modification, are permitted provided that the following conditions are met: |
|
14 |
|
15 * Redistributions of source code must retain the above copyright notice, |
|
16 this list of conditions and the following disclaimer. |
|
17 |
|
18 * Redistributions in binary form must reproduce the above copyright |
|
19 notice, this list of conditions and the following disclaimer in the |
|
20 documentation and/or other materials provided with the distribution. |
|
21 |
|
22 * Neither the name of the University of Cambridge nor the names of its |
|
23 contributors may be used to endorse or promote products derived from |
|
24 this software without specific prior written permission. |
|
25 |
|
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
|
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|
36 POSSIBILITY OF SUCH DAMAGE. |
|
37 ----------------------------------------------------------------------------- |
|
38 */ |
|
39 |
|
40 |
|
/* This module is a wrapper that provides a POSIX API to the underlying PCRE
functions. */
|
43 |
|
44 |
|
45 #ifdef HAVE_CONFIG_H |
|
46 #include "config.h" |
|
47 #endif |
|
48 |
|
49 |
|
/* Ensure that the PCREPOSIX_EXP_xxx macros are set appropriately for
compiling these functions. This must come before including pcreposix.h, where
they are set for an application (using these functions) if they have not
previously been set. */
|
54 |
|
55 #if defined(_WIN32) && !defined(PCRE_STATIC) |
|
56 # define PCREPOSIX_EXP_DECL extern __declspec(dllexport) |
|
57 # define PCREPOSIX_EXP_DEFN __declspec(dllexport) |
|
58 #endif |
|
59 |
|
60 #include "pcre.h" |
|
61 #include "pcre_internal.h" |
|
62 #include "pcreposix.h" |
|
63 |
|
64 |
|
65 /* Table to translate PCRE compile time error codes into POSIX error codes. */ |
|
66 |
|
67 static const int eint[] = { |
|
68 0, /* no error */ |
|
69 REG_EESCAPE, /* \ at end of pattern */ |
|
70 REG_EESCAPE, /* \c at end of pattern */ |
|
71 REG_EESCAPE, /* unrecognized character follows \ */ |
|
72 REG_BADBR, /* numbers out of order in {} quantifier */ |
|
73 REG_BADBR, /* number too big in {} quantifier */ |
|
74 REG_EBRACK, /* missing terminating ] for character class */ |
|
75 REG_ECTYPE, /* invalid escape sequence in character class */ |
|
76 REG_ERANGE, /* range out of order in character class */ |
|
77 REG_BADRPT, /* nothing to repeat */ |
|
78 REG_BADRPT, /* operand of unlimited repeat could match the empty string */ |
|
79 REG_ASSERT, /* internal error: unexpected repeat */ |
|
80 REG_BADPAT, /* unrecognized character after (? */ |
|
81 REG_BADPAT, /* POSIX named classes are supported only within a class */ |
|
82 REG_EPAREN, /* missing ) */ |
|
83 REG_ESUBREG, /* reference to non-existent subpattern */ |
|
84 REG_INVARG, /* erroffset passed as NULL */ |
|
85 REG_INVARG, /* unknown option bit(s) set */ |
|
86 REG_EPAREN, /* missing ) after comment */ |
|
87 REG_ESIZE, /* parentheses nested too deeply */ |
|
88 REG_ESIZE, /* regular expression too large */ |
|
89 REG_ESPACE, /* failed to get memory */ |
|
90 REG_EPAREN, /* unmatched brackets */ |
|
91 REG_ASSERT, /* internal error: code overflow */ |
|
92 REG_BADPAT, /* unrecognized character after (?< */ |
|
93 REG_BADPAT, /* lookbehind assertion is not fixed length */ |
|
94 REG_BADPAT, /* malformed number or name after (?( */ |
|
95 REG_BADPAT, /* conditional group contains more than two branches */ |
|
96 REG_BADPAT, /* assertion expected after (?( */ |
|
97 REG_BADPAT, /* (?R or (?[+-]digits must be followed by ) */ |
|
98 REG_ECTYPE, /* unknown POSIX class name */ |
|
99 REG_BADPAT, /* POSIX collating elements are not supported */ |
|
100 REG_INVARG, /* this version of PCRE is not compiled with PCRE_UTF8 support */ |
|
101 REG_BADPAT, /* spare error */ |
|
102 REG_BADPAT, /* character value in \x{...} sequence is too large */ |
|
103 REG_BADPAT, /* invalid condition (?(0) */ |
|
104 REG_BADPAT, /* \C not allowed in lookbehind assertion */ |
|
105 REG_EESCAPE, /* PCRE does not support \L, \l, \N, \U, or \u */ |
|
106 REG_BADPAT, /* number after (?C is > 255 */ |
|
107 REG_BADPAT, /* closing ) for (?C expected */ |
|
108 REG_BADPAT, /* recursive call could loop indefinitely */ |
|
109 REG_BADPAT, /* unrecognized character after (?P */ |
|
110 REG_BADPAT, /* syntax error in subpattern name (missing terminator) */ |
|
111 REG_BADPAT, /* two named subpatterns have the same name */ |
|
112 REG_BADPAT, /* invalid UTF-8 string */ |
|
113 REG_BADPAT, /* support for \P, \p, and \X has not been compiled */ |
|
114 REG_BADPAT, /* malformed \P or \p sequence */ |
|
115 REG_BADPAT, /* unknown property name after \P or \p */ |
|
116 REG_BADPAT, /* subpattern name is too long (maximum 32 characters) */ |
|
117 REG_BADPAT, /* too many named subpatterns (maximum 10,000) */ |
|
118 REG_BADPAT, /* repeated subpattern is too long */ |
|
119 REG_BADPAT, /* octal value is greater than \377 (not in UTF-8 mode) */ |
|
120 REG_BADPAT, /* internal error: overran compiling workspace */ |
|
121 REG_BADPAT, /* internal error: previously-checked referenced subpattern not found */ |
|
122 REG_BADPAT, /* DEFINE group contains more than one branch */ |
|
123 REG_BADPAT, /* repeating a DEFINE group is not allowed */ |
|
124 REG_INVARG, /* inconsistent NEWLINE options */ |
|
125 REG_BADPAT, /* \g is not followed followed by an (optionally braced) non-zero number */ |
|
126 REG_BADPAT, /* (?+ or (?- must be followed by a non-zero number */ |
|
127 REG_BADPAT, /* number is too big */ |
|
128 REG_BADPAT, /* subpattern name expected */ |
|
129 REG_BADPAT, /* digit expected after (?+ */ |
|
130 REG_BADPAT /* ] is an invalid data character in JavaScript compatibility mode */ |
|
131 }; |
|
132 |
|
/* Table of texts corresponding to POSIX error codes. regerror() uses the
error code itself as the index, so the order of the entries matters. The
comment in front of each string names the code it describes. */

static const char *const pstring[] = {
  /* (value 0, dummy) */  "",
  /* REG_ASSERT       */  "internal error",
  /* REG_BADBR        */  "invalid repeat counts in {}",
  /* REG_BADPAT       */  "pattern error",
  /* REG_BADRPT       */  "? * + invalid",
  /* REG_EBRACE       */  "unbalanced {}",
  /* REG_EBRACK       */  "unbalanced []",
  /* REG_ECOLLATE     */  "collation error - not relevant",
  /* REG_ECTYPE       */  "bad class",
  /* REG_EESCAPE      */  "bad escape sequence",
  /* REG_EMPTY        */  "empty expression",
  /* REG_EPAREN       */  "unbalanced ()",
  /* REG_ERANGE       */  "bad range inside []",
  /* REG_ESIZE        */  "expression too big",
  /* REG_ESPACE       */  "failed to get memory",
  /* REG_ESUBREG      */  "bad back reference",
  /* REG_INVARG       */  "bad argument",
  /* REG_NOMATCH      */  "match failed"
};
|
155 |
|
156 |
|
157 |
|
158 |
|
159 /************************************************* |
|
160 * Translate error code to string * |
|
161 *************************************************/ |
|
162 |
|
163 PCREPOSIX_EXP_DEFN size_t PCRE_CALL_CONVENTION |
|
164 regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size) |
|
165 { |
|
166 const char *message, *addmessage; |
|
167 size_t length, addlength; |
|
168 |
|
169 message = (errcode >= (int)(sizeof(pstring)/sizeof(char *)))? |
|
170 "unknown error code" : pstring[errcode]; |
|
171 length = strlen(message) + 1; |
|
172 |
|
173 addmessage = " at offset "; |
|
174 addlength = (preg != NULL && (int)preg->re_erroffset != -1)? |
|
175 strlen(addmessage) + 6 : 0; |
|
176 |
|
177 if (errbuf_size > 0) |
|
178 { |
|
179 if (addlength > 0 && errbuf_size >= length + addlength) |
|
180 sprintf(errbuf, "%s%s%-6d", message, addmessage, (int)preg->re_erroffset); |
|
181 else |
|
182 { |
|
183 strncpy(errbuf, message, errbuf_size - 1); |
|
184 errbuf[errbuf_size-1] = 0; |
|
185 } |
|
186 } |
|
187 |
|
188 return length + addlength; |
|
189 } |
|
190 |
|
191 |
|
192 |
|
193 |
|
194 /************************************************* |
|
195 * Free store held by a regex * |
|
196 *************************************************/ |
|
197 |
|
198 PCREPOSIX_EXP_DEFN void PCRE_CALL_CONVENTION |
|
199 regfree(regex_t *preg) |
|
200 { |
|
201 (pcre_free)(preg->re_pcre); |
|
202 } |
|
203 |
|
204 |
|
205 |
|
206 |
|
207 /************************************************* |
|
208 * Compile a regular expression * |
|
209 *************************************************/ |
|
210 |
|
211 /* |
|
212 Arguments: |
|
213 preg points to a structure for recording the compiled expression |
|
214 pattern the pattern to compile |
|
215 cflags compilation flags |
|
216 |
|
217 Returns: 0 on success |
|
218 various non-zero codes on failure |
|
219 */ |
|
220 |
|
221 PCREPOSIX_EXP_DEFN int PCRE_CALL_CONVENTION |
|
222 regcomp(regex_t *preg, const char *pattern, int cflags) |
|
223 { |
|
224 const char *errorptr; |
|
225 int erroffset; |
|
226 int errorcode; |
|
227 int options = 0; |
|
228 |
|
229 if ((cflags & REG_ICASE) != 0) options |= PCRE_CASELESS; |
|
230 if ((cflags & REG_NEWLINE) != 0) options |= PCRE_MULTILINE; |
|
231 if ((cflags & REG_DOTALL) != 0) options |= PCRE_DOTALL; |
|
232 if ((cflags & REG_NOSUB) != 0) options |= PCRE_NO_AUTO_CAPTURE; |
|
233 if ((cflags & REG_UTF8) != 0) options |= PCRE_UTF8; |
|
234 |
|
235 preg->re_pcre = pcre_compile2(pattern, options, &errorcode, &errorptr, |
|
236 &erroffset, NULL); |
|
237 preg->re_erroffset = erroffset; |
|
238 |
|
239 if (preg->re_pcre == NULL) return eint[errorcode]; |
|
240 |
|
241 preg->re_nsub = pcre_info((const pcre *)preg->re_pcre, NULL, NULL); |
|
242 return 0; |
|
243 } |
|
244 |
|
245 |
|
246 |
|
247 |
|
248 /************************************************* |
|
249 * Match a regular expression * |
|
250 *************************************************/ |
|
251 |
|
252 /* Unfortunately, PCRE requires 3 ints of working space for each captured |
|
253 substring, so we have to get and release working store instead of just using |
|
254 the POSIX structures as was done in earlier releases when PCRE needed only 2 |
|
255 ints. However, if the number of possible capturing brackets is small, use a |
|
256 block of store on the stack, to reduce the use of malloc/free. The threshold is |
|
257 in a macro that can be changed at configure time. |
|
258 |
|
259 If REG_NOSUB was specified at compile time, the PCRE_NO_AUTO_CAPTURE flag will |
|
260 be set. When this is the case, the nmatch and pmatch arguments are ignored, and |
|
261 the only result is yes/no/error. */ |
|
262 |
|
263 PCREPOSIX_EXP_DEFN int PCRE_CALL_CONVENTION |
|
264 regexec(const regex_t *preg, const char *string, size_t nmatch, |
|
265 regmatch_t pmatch[], int eflags) |
|
266 { |
|
267 int rc, so, eo; |
|
268 int options = 0; |
|
269 int *ovector = NULL; |
|
270 int small_ovector[POSIX_MALLOC_THRESHOLD * 3]; |
|
271 BOOL allocated_ovector = FALSE; |
|
272 BOOL nosub = |
|
273 (((const pcre *)preg->re_pcre)->options & PCRE_NO_AUTO_CAPTURE) != 0; |
|
274 |
|
275 if ((eflags & REG_NOTBOL) != 0) options |= PCRE_NOTBOL; |
|
276 if ((eflags & REG_NOTEOL) != 0) options |= PCRE_NOTEOL; |
|
277 |
|
278 ((regex_t *)preg)->re_erroffset = (size_t)(-1); /* Only has meaning after compile */ |
|
279 |
|
280 /* When no string data is being returned, ensure that nmatch is zero. |
|
281 Otherwise, ensure the vector for holding the return data is large enough. */ |
|
282 |
|
283 if (nosub) nmatch = 0; |
|
284 |
|
285 else if (nmatch > 0) |
|
286 { |
|
287 if (nmatch <= POSIX_MALLOC_THRESHOLD) |
|
288 { |
|
289 ovector = &(small_ovector[0]); |
|
290 } |
|
291 else |
|
292 { |
|
293 if (nmatch > INT_MAX/(sizeof(int) * 3)) return REG_ESPACE; |
|
294 ovector = (int *)malloc(sizeof(int) * nmatch * 3); |
|
295 if (ovector == NULL) return REG_ESPACE; |
|
296 allocated_ovector = TRUE; |
|
297 } |
|
298 } |
|
299 |
|
300 /* REG_STARTEND is a BSD extension, to allow for non-NUL-terminated strings. |
|
301 The man page from OS X says "REG_STARTEND affects only the location of the |
|
302 string, not how it is matched". That is why the "so" value is used to bump the |
|
303 start location rather than being passed as a PCRE "starting offset". */ |
|
304 |
|
305 if ((eflags & REG_STARTEND) != 0) |
|
306 { |
|
307 so = pmatch[0].rm_so; |
|
308 eo = pmatch[0].rm_eo; |
|
309 } |
|
310 else |
|
311 { |
|
312 so = 0; |
|
313 eo = strlen(string); |
|
314 } |
|
315 |
|
316 rc = pcre_exec((const pcre *)preg->re_pcre, NULL, string + so, (eo - so), |
|
317 0, options, ovector, nmatch * 3); |
|
318 |
|
319 if (rc == 0) rc = nmatch; /* All captured slots were filled in */ |
|
320 |
|
321 if (rc >= 0) |
|
322 { |
|
323 size_t i; |
|
324 if (!nosub) |
|
325 { |
|
326 for (i = 0; i < (size_t)rc; i++) |
|
327 { |
|
328 pmatch[i].rm_so = ovector[i*2]; |
|
329 pmatch[i].rm_eo = ovector[i*2+1]; |
|
330 } |
|
331 if (allocated_ovector) free(ovector); |
|
332 for (; i < nmatch; i++) pmatch[i].rm_so = pmatch[i].rm_eo = -1; |
|
333 } |
|
334 return 0; |
|
335 } |
|
336 |
|
337 else |
|
338 { |
|
339 if (allocated_ovector) free(ovector); |
|
340 switch(rc) |
|
341 { |
|
342 case PCRE_ERROR_NOMATCH: return REG_NOMATCH; |
|
343 case PCRE_ERROR_NULL: return REG_INVARG; |
|
344 case PCRE_ERROR_BADOPTION: return REG_INVARG; |
|
345 case PCRE_ERROR_BADMAGIC: return REG_INVARG; |
|
346 case PCRE_ERROR_UNKNOWN_NODE: return REG_ASSERT; |
|
347 case PCRE_ERROR_NOMEMORY: return REG_ESPACE; |
|
348 case PCRE_ERROR_MATCHLIMIT: return REG_ESPACE; |
|
349 case PCRE_ERROR_BADUTF8: return REG_INVARG; |
|
350 case PCRE_ERROR_BADUTF8_OFFSET: return REG_INVARG; |
|
351 default: return REG_ASSERT; |
|
352 } |
|
353 } |
|
354 } |
|
355 |
|
356 /* End of pcreposix.c */ |