|
1 // Copyright (c) 2005, Google Inc. |
|
2 // All rights reserved. |
|
3 // |
|
4 // Redistribution and use in source and binary forms, with or without |
|
5 // modification, are permitted provided that the following conditions are |
|
6 // met: |
|
7 // |
|
8 // * Redistributions of source code must retain the above copyright |
|
9 // notice, this list of conditions and the following disclaimer. |
|
10 // * Redistributions in binary form must reproduce the above |
|
11 // copyright notice, this list of conditions and the following disclaimer |
|
12 // in the documentation and/or other materials provided with the |
|
13 // distribution. |
|
14 // * Neither the name of Google Inc. nor the names of its |
|
15 // contributors may be used to endorse or promote products derived from |
|
16 // this software without specific prior written permission. |
|
17 // |
|
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
|
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
|
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
|
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
|
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
|
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
|
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
|
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
29 // |
|
30 // Author: Sanjay Ghemawat |
|
31 |
|
32 #ifdef HAVE_CONFIG_H |
|
33 #include "config.h" |
|
34 #endif |
|
35 |
|
36 #include <stdlib.h> |
|
37 #include <stdio.h> |
|
38 #include <ctype.h> |
|
39 #include <limits.h> /* for SHRT_MIN, USHRT_MAX, etc */ |
|
40 #include <assert.h> |
|
41 #include <errno.h> |
|
42 #include <string> |
|
43 #include <algorithm> |
|
44 |
|
45 #include "pcrecpp_internal.h" |
|
46 #include "pcre.h" |
|
47 #include "pcrecpp.h" |
|
48 #include "pcre_stringpiece.h" |
|
49 |
|
50 |
|
51 namespace pcrecpp { |
|
52 |
|
53 // Maximum number of args we can set |
|
54 static const int kMaxArgs = 16; |
|
55 static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace |
|
56 |
|
57 // Special object that stands-in for no argument |
|
58 Arg RE::no_arg((void*)NULL); |
|
59 |
|
60 // This is for ABI compatibility with old versions of pcre (pre-7.6), |
|
61 // which defined a global no_arg variable instead of putting it in the |
|
62 // RE class. This works on GCC >= 3, at least. It definitely works |
|
63 // for ELF, but may not for other object formats (Mach-O, for |
|
64 // instance, does not support aliases.) We could probably have a more |
|
65 // inclusive test if we ever needed it. (Note that not only the |
|
66 // __attribute__ syntax, but also __USER_LABEL_PREFIX__, are |
|
67 // gnu-specific.) |
|
68 #if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__) |
|
69 # define ULP_AS_STRING(x) ULP_AS_STRING_INTERNAL(x) |
|
70 # define ULP_AS_STRING_INTERNAL(x) #x |
|
71 # define USER_LABEL_PREFIX_STR ULP_AS_STRING(__USER_LABEL_PREFIX__) |
|
72 extern Arg no_arg |
|
73 __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE"))); |
|
74 #endif |
|
75 |
|
76 // If a regular expression has no error, its error_ field points here |
|
77 static const string empty_string; |
|
78 |
|
79 // If the user doesn't ask for any options, we just use this one |
|
80 static RE_Options default_options; |
|
81 |
|
82 void RE::Init(const string& pat, const RE_Options* options) { |
|
83 pattern_ = pat; |
|
84 if (options == NULL) { |
|
85 options_ = default_options; |
|
86 } else { |
|
87 options_ = *options; |
|
88 } |
|
89 error_ = &empty_string; |
|
90 re_full_ = NULL; |
|
91 re_partial_ = NULL; |
|
92 |
|
93 re_partial_ = Compile(UNANCHORED); |
|
94 if (re_partial_ != NULL) { |
|
95 re_full_ = Compile(ANCHOR_BOTH); |
|
96 } |
|
97 } |
|
98 |
|
99 void RE::Cleanup() { |
|
100 if (re_full_ != NULL) (*pcre_free)(re_full_); |
|
101 if (re_partial_ != NULL) (*pcre_free)(re_partial_); |
|
102 if (error_ != &empty_string) delete error_; |
|
103 } |
|
104 |
|
105 |
|
106 RE::~RE() { |
|
107 Cleanup(); |
|
108 } |
|
109 |
|
110 |
|
111 pcre* RE::Compile(Anchor anchor) { |
|
112 // First, convert RE_Options into pcre options |
|
113 int pcre_options = 0; |
|
114 pcre_options = options_.all_options(); |
|
115 |
|
116 // Special treatment for anchoring. This is needed because at |
|
117 // runtime pcre only provides an option for anchoring at the |
|
118 // beginning of a string (unless you use offset). |
|
119 // |
|
120 // There are three types of anchoring we want: |
|
121 // UNANCHORED Compile the original pattern, and use |
|
122 // a pcre unanchored match. |
|
123 // ANCHOR_START Compile the original pattern, and use |
|
124 // a pcre anchored match. |
|
125 // ANCHOR_BOTH Tack a "\z" to the end of the original pattern |
|
126 // and use a pcre anchored match. |
|
127 |
|
128 const char* compile_error; |
|
129 int eoffset; |
|
130 pcre* re; |
|
131 if (anchor != ANCHOR_BOTH) { |
|
132 re = pcre_compile(pattern_.c_str(), pcre_options, |
|
133 &compile_error, &eoffset, NULL); |
|
134 } else { |
|
135 // Tack a '\z' at the end of RE. Parenthesize it first so that |
|
136 // the '\z' applies to all top-level alternatives in the regexp. |
|
137 string wrapped = "(?:"; // A non-counting grouping operator |
|
138 wrapped += pattern_; |
|
139 wrapped += ")\\z"; |
|
140 re = pcre_compile(wrapped.c_str(), pcre_options, |
|
141 &compile_error, &eoffset, NULL); |
|
142 } |
|
143 if (re == NULL) { |
|
144 if (error_ == &empty_string) error_ = new string(compile_error); |
|
145 } |
|
146 return re; |
|
147 } |
|
148 |
|
149 /***** Matching interfaces *****/ |
|
150 |
|
151 bool RE::FullMatch(const StringPiece& text, |
|
152 const Arg& ptr1, |
|
153 const Arg& ptr2, |
|
154 const Arg& ptr3, |
|
155 const Arg& ptr4, |
|
156 const Arg& ptr5, |
|
157 const Arg& ptr6, |
|
158 const Arg& ptr7, |
|
159 const Arg& ptr8, |
|
160 const Arg& ptr9, |
|
161 const Arg& ptr10, |
|
162 const Arg& ptr11, |
|
163 const Arg& ptr12, |
|
164 const Arg& ptr13, |
|
165 const Arg& ptr14, |
|
166 const Arg& ptr15, |
|
167 const Arg& ptr16) const { |
|
168 const Arg* args[kMaxArgs]; |
|
169 int n = 0; |
|
170 if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1; |
|
171 if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2; |
|
172 if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3; |
|
173 if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4; |
|
174 if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5; |
|
175 if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6; |
|
176 if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7; |
|
177 if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8; |
|
178 if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9; |
|
179 if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10; |
|
180 if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11; |
|
181 if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12; |
|
182 if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13; |
|
183 if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14; |
|
184 if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15; |
|
185 if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16; |
|
186 done: |
|
187 |
|
188 int consumed; |
|
189 int vec[kVecSize]; |
|
190 return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize); |
|
191 } |
|
192 |
|
193 bool RE::PartialMatch(const StringPiece& text, |
|
194 const Arg& ptr1, |
|
195 const Arg& ptr2, |
|
196 const Arg& ptr3, |
|
197 const Arg& ptr4, |
|
198 const Arg& ptr5, |
|
199 const Arg& ptr6, |
|
200 const Arg& ptr7, |
|
201 const Arg& ptr8, |
|
202 const Arg& ptr9, |
|
203 const Arg& ptr10, |
|
204 const Arg& ptr11, |
|
205 const Arg& ptr12, |
|
206 const Arg& ptr13, |
|
207 const Arg& ptr14, |
|
208 const Arg& ptr15, |
|
209 const Arg& ptr16) const { |
|
210 const Arg* args[kMaxArgs]; |
|
211 int n = 0; |
|
212 if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1; |
|
213 if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2; |
|
214 if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3; |
|
215 if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4; |
|
216 if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5; |
|
217 if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6; |
|
218 if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7; |
|
219 if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8; |
|
220 if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9; |
|
221 if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10; |
|
222 if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11; |
|
223 if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12; |
|
224 if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13; |
|
225 if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14; |
|
226 if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15; |
|
227 if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16; |
|
228 done: |
|
229 |
|
230 int consumed; |
|
231 int vec[kVecSize]; |
|
232 return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize); |
|
233 } |
|
234 |
|
235 bool RE::Consume(StringPiece* input, |
|
236 const Arg& ptr1, |
|
237 const Arg& ptr2, |
|
238 const Arg& ptr3, |
|
239 const Arg& ptr4, |
|
240 const Arg& ptr5, |
|
241 const Arg& ptr6, |
|
242 const Arg& ptr7, |
|
243 const Arg& ptr8, |
|
244 const Arg& ptr9, |
|
245 const Arg& ptr10, |
|
246 const Arg& ptr11, |
|
247 const Arg& ptr12, |
|
248 const Arg& ptr13, |
|
249 const Arg& ptr14, |
|
250 const Arg& ptr15, |
|
251 const Arg& ptr16) const { |
|
252 const Arg* args[kMaxArgs]; |
|
253 int n = 0; |
|
254 if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1; |
|
255 if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2; |
|
256 if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3; |
|
257 if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4; |
|
258 if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5; |
|
259 if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6; |
|
260 if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7; |
|
261 if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8; |
|
262 if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9; |
|
263 if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10; |
|
264 if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11; |
|
265 if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12; |
|
266 if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13; |
|
267 if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14; |
|
268 if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15; |
|
269 if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16; |
|
270 done: |
|
271 |
|
272 int consumed; |
|
273 int vec[kVecSize]; |
|
274 if (DoMatchImpl(*input, ANCHOR_START, &consumed, |
|
275 args, n, vec, kVecSize)) { |
|
276 input->remove_prefix(consumed); |
|
277 return true; |
|
278 } else { |
|
279 return false; |
|
280 } |
|
281 } |
|
282 |
|
283 bool RE::FindAndConsume(StringPiece* input, |
|
284 const Arg& ptr1, |
|
285 const Arg& ptr2, |
|
286 const Arg& ptr3, |
|
287 const Arg& ptr4, |
|
288 const Arg& ptr5, |
|
289 const Arg& ptr6, |
|
290 const Arg& ptr7, |
|
291 const Arg& ptr8, |
|
292 const Arg& ptr9, |
|
293 const Arg& ptr10, |
|
294 const Arg& ptr11, |
|
295 const Arg& ptr12, |
|
296 const Arg& ptr13, |
|
297 const Arg& ptr14, |
|
298 const Arg& ptr15, |
|
299 const Arg& ptr16) const { |
|
300 const Arg* args[kMaxArgs]; |
|
301 int n = 0; |
|
302 if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1; |
|
303 if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2; |
|
304 if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3; |
|
305 if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4; |
|
306 if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5; |
|
307 if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6; |
|
308 if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7; |
|
309 if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8; |
|
310 if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9; |
|
311 if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10; |
|
312 if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11; |
|
313 if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12; |
|
314 if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13; |
|
315 if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14; |
|
316 if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15; |
|
317 if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16; |
|
318 done: |
|
319 |
|
320 int consumed; |
|
321 int vec[kVecSize]; |
|
322 if (DoMatchImpl(*input, UNANCHORED, &consumed, |
|
323 args, n, vec, kVecSize)) { |
|
324 input->remove_prefix(consumed); |
|
325 return true; |
|
326 } else { |
|
327 return false; |
|
328 } |
|
329 } |
|
330 |
|
331 bool RE::Replace(const StringPiece& rewrite, |
|
332 string *str) const { |
|
333 int vec[kVecSize]; |
|
334 int matches = TryMatch(*str, 0, UNANCHORED, vec, kVecSize); |
|
335 if (matches == 0) |
|
336 return false; |
|
337 |
|
338 string s; |
|
339 if (!Rewrite(&s, rewrite, *str, vec, matches)) |
|
340 return false; |
|
341 |
|
342 assert(vec[0] >= 0); |
|
343 assert(vec[1] >= 0); |
|
344 str->replace(vec[0], vec[1] - vec[0], s); |
|
345 return true; |
|
346 } |
|
347 |
|
348 // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF. |
|
349 // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF. |
|
350 // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF. |
|
351 |
|
352 static int NewlineMode(int pcre_options) { |
|
353 // TODO: if we can make it threadsafe, cache this var |
|
354 int newline_mode = 0; |
|
355 /* if (newline_mode) return newline_mode; */ // do this once it's cached |
|
356 if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF| |
|
357 PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) { |
|
358 newline_mode = (pcre_options & |
|
359 (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF| |
|
360 PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)); |
|
361 } else { |
|
362 int newline; |
|
363 pcre_config(PCRE_CONFIG_NEWLINE, &newline); |
|
364 if (newline == 10) |
|
365 newline_mode = PCRE_NEWLINE_LF; |
|
366 else if (newline == 13) |
|
367 newline_mode = PCRE_NEWLINE_CR; |
|
368 else if (newline == 3338) |
|
369 newline_mode = PCRE_NEWLINE_CRLF; |
|
370 else if (newline == -1) |
|
371 newline_mode = PCRE_NEWLINE_ANY; |
|
372 else if (newline == -2) |
|
373 newline_mode = PCRE_NEWLINE_ANYCRLF; |
|
374 else |
|
375 assert(NULL == "Unexpected return value from pcre_config(NEWLINE)"); |
|
376 } |
|
377 return newline_mode; |
|
378 } |
|
379 |
|
380 int RE::GlobalReplace(const StringPiece& rewrite, |
|
381 string *str) const { |
|
382 int count = 0; |
|
383 int vec[kVecSize]; |
|
384 string out; |
|
385 int start = 0; |
|
386 int lastend = -1; |
|
387 |
|
388 while (start <= static_cast<int>(str->length())) { |
|
389 int matches = TryMatch(*str, start, UNANCHORED, vec, kVecSize); |
|
390 if (matches <= 0) |
|
391 break; |
|
392 int matchstart = vec[0], matchend = vec[1]; |
|
393 assert(matchstart >= start); |
|
394 assert(matchend >= matchstart); |
|
395 if (matchstart == matchend && matchstart == lastend) { |
|
396 // advance one character if we matched an empty string at the same |
|
397 // place as the last match occurred |
|
398 matchend = start + 1; |
|
399 // If the current char is CR and we're in CRLF mode, skip LF too. |
|
400 // Note it's better to call pcre_fullinfo() than to examine |
|
401 // all_options(), since options_ could have changed bewteen |
|
402 // compile-time and now, but this is simpler and safe enough. |
|
403 // Modified by PH to add ANY and ANYCRLF. |
|
404 if (start+1 < static_cast<int>(str->length()) && |
|
405 (*str)[start] == '\r' && (*str)[start+1] == '\n' && |
|
406 (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF || |
|
407 NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY || |
|
408 NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF) |
|
409 ) { |
|
410 matchend++; |
|
411 } |
|
412 // We also need to advance more than one char if we're in utf8 mode. |
|
413 #ifdef SUPPORT_UTF8 |
|
414 if (options_.utf8()) { |
|
415 while (matchend < static_cast<int>(str->length()) && |
|
416 ((*str)[matchend] & 0xc0) == 0x80) |
|
417 matchend++; |
|
418 } |
|
419 #endif |
|
420 if (matchend <= static_cast<int>(str->length())) |
|
421 out.append(*str, start, matchend - start); |
|
422 start = matchend; |
|
423 } else { |
|
424 out.append(*str, start, matchstart - start); |
|
425 Rewrite(&out, rewrite, *str, vec, matches); |
|
426 start = matchend; |
|
427 lastend = matchend; |
|
428 count++; |
|
429 } |
|
430 } |
|
431 |
|
432 if (count == 0) |
|
433 return 0; |
|
434 |
|
435 if (start < static_cast<int>(str->length())) |
|
436 out.append(*str, start, str->length() - start); |
|
437 swap(out, *str); |
|
438 return count; |
|
439 } |
|
440 |
|
441 bool RE::Extract(const StringPiece& rewrite, |
|
442 const StringPiece& text, |
|
443 string *out) const { |
|
444 int vec[kVecSize]; |
|
445 int matches = TryMatch(text, 0, UNANCHORED, vec, kVecSize); |
|
446 if (matches == 0) |
|
447 return false; |
|
448 out->erase(); |
|
449 return Rewrite(out, rewrite, text, vec, matches); |
|
450 } |
|
451 |
|
452 /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) { |
|
453 string result; |
|
454 |
|
455 // Escape any ascii character not in [A-Za-z_0-9]. |
|
456 // |
|
457 // Note that it's legal to escape a character even if it has no |
|
458 // special meaning in a regular expression -- so this function does |
|
459 // that. (This also makes it identical to the perl function of the |
|
460 // same name; see `perldoc -f quotemeta`.) The one exception is |
|
461 // escaping NUL: rather than doing backslash + NUL, like perl does, |
|
462 // we do '\0', because pcre itself doesn't take embedded NUL chars. |
|
463 for (int ii = 0; ii < unquoted.size(); ++ii) { |
|
464 // Note that using 'isalnum' here raises the benchmark time from |
|
465 // 32ns to 58ns: |
|
466 if (unquoted[ii] == '\0') { |
|
467 result += "\\0"; |
|
468 } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && |
|
469 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && |
|
470 (unquoted[ii] < '0' || unquoted[ii] > '9') && |
|
471 unquoted[ii] != '_' && |
|
472 // If this is the part of a UTF8 or Latin1 character, we need |
|
473 // to copy this byte without escaping. Experimentally this is |
|
474 // what works correctly with the regexp library. |
|
475 !(unquoted[ii] & 128)) { |
|
476 result += '\\'; |
|
477 result += unquoted[ii]; |
|
478 } else { |
|
479 result += unquoted[ii]; |
|
480 } |
|
481 } |
|
482 |
|
483 return result; |
|
484 } |
|
485 |
|
486 /***** Actual matching and rewriting code *****/ |
|
487 |
|
488 int RE::TryMatch(const StringPiece& text, |
|
489 int startpos, |
|
490 Anchor anchor, |
|
491 int *vec, |
|
492 int vecsize) const { |
|
493 pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_; |
|
494 if (re == NULL) { |
|
495 //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str()); |
|
496 return 0; |
|
497 } |
|
498 |
|
499 pcre_extra extra = { 0, 0, 0, 0, 0, 0 }; |
|
500 if (options_.match_limit() > 0) { |
|
501 extra.flags |= PCRE_EXTRA_MATCH_LIMIT; |
|
502 extra.match_limit = options_.match_limit(); |
|
503 } |
|
504 if (options_.match_limit_recursion() > 0) { |
|
505 extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION; |
|
506 extra.match_limit_recursion = options_.match_limit_recursion(); |
|
507 } |
|
508 int rc = pcre_exec(re, // The regular expression object |
|
509 &extra, |
|
510 (text.data() == NULL) ? "" : text.data(), |
|
511 text.size(), |
|
512 startpos, |
|
513 (anchor == UNANCHORED) ? 0 : PCRE_ANCHORED, |
|
514 vec, |
|
515 vecsize); |
|
516 |
|
517 // Handle errors |
|
518 if (rc == PCRE_ERROR_NOMATCH) { |
|
519 return 0; |
|
520 } else if (rc < 0) { |
|
521 //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n", |
|
522 // re, pattern_.c_str()); |
|
523 return 0; |
|
524 } else if (rc == 0) { |
|
525 // pcre_exec() returns 0 as a special case when the number of |
|
526 // capturing subpatterns exceeds the size of the vector. |
|
527 // When this happens, there is a match and the output vector |
|
528 // is filled, but we miss out on the positions of the extra subpatterns. |
|
529 rc = vecsize / 2; |
|
530 } |
|
531 |
|
532 return rc; |
|
533 } |
|
534 |
|
535 bool RE::DoMatchImpl(const StringPiece& text, |
|
536 Anchor anchor, |
|
537 int* consumed, |
|
538 const Arg* const* args, |
|
539 int n, |
|
540 int* vec, |
|
541 int vecsize) const { |
|
542 assert((1 + n) * 3 <= vecsize); // results + PCRE workspace |
|
543 int matches = TryMatch(text, 0, anchor, vec, vecsize); |
|
544 assert(matches >= 0); // TryMatch never returns negatives |
|
545 if (matches == 0) |
|
546 return false; |
|
547 |
|
548 *consumed = vec[1]; |
|
549 |
|
550 if (n == 0 || args == NULL) { |
|
551 // We are not interested in results |
|
552 return true; |
|
553 } |
|
554 |
|
555 if (NumberOfCapturingGroups() < n) { |
|
556 // RE has fewer capturing groups than number of arg pointers passed in |
|
557 return false; |
|
558 } |
|
559 |
|
560 // If we got here, we must have matched the whole pattern. |
|
561 // We do not need (can not do) any more checks on the value of 'matches' here |
|
562 // -- see the comment for TryMatch. |
|
563 for (int i = 0; i < n; i++) { |
|
564 const int start = vec[2*(i+1)]; |
|
565 const int limit = vec[2*(i+1)+1]; |
|
566 if (!args[i]->Parse(text.data() + start, limit-start)) { |
|
567 // TODO: Should we indicate what the error was? |
|
568 return false; |
|
569 } |
|
570 } |
|
571 |
|
572 return true; |
|
573 } |
|
574 |
|
575 bool RE::DoMatch(const StringPiece& text, |
|
576 Anchor anchor, |
|
577 int* consumed, |
|
578 const Arg* const args[], |
|
579 int n) const { |
|
580 assert(n >= 0); |
|
581 size_t const vecsize = (1 + n) * 3; // results + PCRE workspace |
|
582 // (as for kVecSize) |
|
583 int space[21]; // use stack allocation for small vecsize (common case) |
|
584 int* vec = vecsize <= 21 ? space : new int[vecsize]; |
|
585 bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize); |
|
586 if (vec != space) delete [] vec; |
|
587 return retval; |
|
588 } |
|
589 |
|
590 bool RE::Rewrite(string *out, const StringPiece &rewrite, |
|
591 const StringPiece &text, int *vec, int veclen) const { |
|
592 for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
|
593 s < end; s++) { |
|
594 int c = *s; |
|
595 if (c == '\\') { |
|
596 c = *++s; |
|
597 if (isdigit(c)) { |
|
598 int n = (c - '0'); |
|
599 if (n >= veclen) { |
|
600 //fprintf(stderr, requested group %d in regexp %.*s\n", |
|
601 // n, rewrite.size(), rewrite.data()); |
|
602 return false; |
|
603 } |
|
604 int start = vec[2 * n]; |
|
605 if (start >= 0) |
|
606 out->append(text.data() + start, vec[2 * n + 1] - start); |
|
607 } else if (c == '\\') { |
|
608 *out += '\\'; |
|
609 } else { |
|
610 //fprintf(stderr, "invalid rewrite pattern: %.*s\n", |
|
611 // rewrite.size(), rewrite.data()); |
|
612 return false; |
|
613 } |
|
614 } else { |
|
615 *out += c; |
|
616 } |
|
617 } |
|
618 return true; |
|
619 } |
|
620 |
|
621 // Return the number of capturing subpatterns, or -1 if the |
|
622 // regexp wasn't valid on construction. |
|
623 int RE::NumberOfCapturingGroups() const { |
|
624 if (re_partial_ == NULL) return -1; |
|
625 |
|
626 int result; |
|
627 int pcre_retval = pcre_fullinfo(re_partial_, // The regular expression object |
|
628 NULL, // We did not study the pattern |
|
629 PCRE_INFO_CAPTURECOUNT, |
|
630 &result); |
|
631 assert(pcre_retval == 0); |
|
632 return result; |
|
633 } |
|
634 |
|
635 /***** Parsers for various types *****/ |
|
636 |
|
637 bool Arg::parse_null(const char* str, int n, void* dest) { |
|
638 // We fail if somebody asked us to store into a non-NULL void* pointer |
|
639 return (dest == NULL); |
|
640 } |
|
641 |
|
642 bool Arg::parse_string(const char* str, int n, void* dest) { |
|
643 if (dest == NULL) return true; |
|
644 reinterpret_cast<string*>(dest)->assign(str, n); |
|
645 return true; |
|
646 } |
|
647 |
|
648 bool Arg::parse_stringpiece(const char* str, int n, void* dest) { |
|
649 if (dest == NULL) return true; |
|
650 reinterpret_cast<StringPiece*>(dest)->set(str, n); |
|
651 return true; |
|
652 } |
|
653 |
|
654 bool Arg::parse_char(const char* str, int n, void* dest) { |
|
655 if (n != 1) return false; |
|
656 if (dest == NULL) return true; |
|
657 *(reinterpret_cast<char*>(dest)) = str[0]; |
|
658 return true; |
|
659 } |
|
660 |
|
661 bool Arg::parse_uchar(const char* str, int n, void* dest) { |
|
662 if (n != 1) return false; |
|
663 if (dest == NULL) return true; |
|
664 *(reinterpret_cast<unsigned char*>(dest)) = str[0]; |
|
665 return true; |
|
666 } |
|
667 |
|
// Largest number spec that we are willing to parse
static const int kMaxNumberLength = 32;

// Prepares the n-byte token at "str" for the strtoxxx() routines,
// which need to stop exactly at the end of the token.
//
// REQUIRES "buf" must have length at least kMaxNumberLength+1
// REQUIRES "n > 0"
// REQUIRES "str[n]" must be readable: the byte right after the token
//          is inspected to decide whether a copy is needed.
//
// Copies "str" into "buf" and null-terminates if necessary.
// Returns one of:
//      a. "str" if no termination is needed
//      b. "buf" if the string was copied and null-terminated
//      c. "" if the input was invalid and has no hope of being parsed
//
// NOTE(review): a token "0" directly followed by 'x' or 'X' is not
// copied, so strtol() with radix 16 or 0 may read past the token --
// confirm whether callers care.
static const char* TerminateNumber(char* buf, const char* str, int n) {
  // The <ctype.h> functions require a value representable as unsigned
  // char; a plain (possibly negative) char would be undefined
  // behaviour, hence the casts below.
  if ((n > 0) && isspace(static_cast<unsigned char>(*str))) {
    // We are less forgiving than the strtoxxx() routines and do not
    // allow leading spaces.
    return "";
  }

  // See if the character right after the input text may potentially
  // look like a digit (decimal or hexadecimal).
  const unsigned char next = static_cast<unsigned char>(str[n]);
  const bool continues_number = isdigit(next) ||
                                ((next >= 'a') && (next <= 'f')) ||
                                ((next >= 'A') && (next <= 'F'));
  if (!continues_number) {
    // We can parse right out of the supplied string, so return it.
    return str;
  }

  if (n > kMaxNumberLength) return "";  // Input too big to be a valid number
  memcpy(buf, str, n);
  buf[n] = '\0';
  return buf;
}
|
699 |
|
700 bool Arg::parse_long_radix(const char* str, |
|
701 int n, |
|
702 void* dest, |
|
703 int radix) { |
|
704 if (n == 0) return false; |
|
705 char buf[kMaxNumberLength+1]; |
|
706 str = TerminateNumber(buf, str, n); |
|
707 char* end; |
|
708 errno = 0; |
|
709 long r = strtol(str, &end, radix); |
|
710 if (end != str + n) return false; // Leftover junk |
|
711 if (errno) return false; |
|
712 if (dest == NULL) return true; |
|
713 *(reinterpret_cast<long*>(dest)) = r; |
|
714 return true; |
|
715 } |
|
716 |
|
717 bool Arg::parse_ulong_radix(const char* str, |
|
718 int n, |
|
719 void* dest, |
|
720 int radix) { |
|
721 if (n == 0) return false; |
|
722 char buf[kMaxNumberLength+1]; |
|
723 str = TerminateNumber(buf, str, n); |
|
724 if (str[0] == '-') return false; // strtoul() on a negative number?! |
|
725 char* end; |
|
726 errno = 0; |
|
727 unsigned long r = strtoul(str, &end, radix); |
|
728 if (end != str + n) return false; // Leftover junk |
|
729 if (errno) return false; |
|
730 if (dest == NULL) return true; |
|
731 *(reinterpret_cast<unsigned long*>(dest)) = r; |
|
732 return true; |
|
733 } |
|
734 |
|
735 bool Arg::parse_short_radix(const char* str, |
|
736 int n, |
|
737 void* dest, |
|
738 int radix) { |
|
739 long r; |
|
740 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse |
|
741 if (r < SHRT_MIN || r > SHRT_MAX) return false; // Out of range |
|
742 if (dest == NULL) return true; |
|
743 *(reinterpret_cast<short*>(dest)) = static_cast<short>(r); |
|
744 return true; |
|
745 } |
|
746 |
|
747 bool Arg::parse_ushort_radix(const char* str, |
|
748 int n, |
|
749 void* dest, |
|
750 int radix) { |
|
751 unsigned long r; |
|
752 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse |
|
753 if (r > USHRT_MAX) return false; // Out of range |
|
754 if (dest == NULL) return true; |
|
755 *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r); |
|
756 return true; |
|
757 } |
|
758 |
|
759 bool Arg::parse_int_radix(const char* str, |
|
760 int n, |
|
761 void* dest, |
|
762 int radix) { |
|
763 long r; |
|
764 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse |
|
765 if (r < INT_MIN || r > INT_MAX) return false; // Out of range |
|
766 if (dest == NULL) return true; |
|
767 *(reinterpret_cast<int*>(dest)) = r; |
|
768 return true; |
|
769 } |
|
770 |
|
771 bool Arg::parse_uint_radix(const char* str, |
|
772 int n, |
|
773 void* dest, |
|
774 int radix) { |
|
775 unsigned long r; |
|
776 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse |
|
777 if (r > UINT_MAX) return false; // Out of range |
|
778 if (dest == NULL) return true; |
|
779 *(reinterpret_cast<unsigned int*>(dest)) = r; |
|
780 return true; |
|
781 } |
|
782 |
|
783 bool Arg::parse_longlong_radix(const char* str, |
|
784 int n, |
|
785 void* dest, |
|
786 int radix) { |
|
787 #ifndef HAVE_LONG_LONG |
|
788 return false; |
|
789 #else |
|
790 if (n == 0) return false; |
|
791 char buf[kMaxNumberLength+1]; |
|
792 str = TerminateNumber(buf, str, n); |
|
793 char* end; |
|
794 errno = 0; |
|
795 #if defined HAVE_STRTOQ |
|
796 long long r = strtoq(str, &end, radix); |
|
797 #elif defined HAVE_STRTOLL |
|
798 long long r = strtoll(str, &end, radix); |
|
799 #elif defined HAVE__STRTOI64 |
|
800 long long r = _strtoi64(str, &end, radix); |
|
801 #else |
|
802 #error parse_longlong_radix: cannot convert input to a long-long |
|
803 #endif |
|
804 if (end != str + n) return false; // Leftover junk |
|
805 if (errno) return false; |
|
806 if (dest == NULL) return true; |
|
807 *(reinterpret_cast<long long*>(dest)) = r; |
|
808 return true; |
|
809 #endif /* HAVE_LONG_LONG */ |
|
810 } |
|
811 |
|
812 bool Arg::parse_ulonglong_radix(const char* str, |
|
813 int n, |
|
814 void* dest, |
|
815 int radix) { |
|
816 #ifndef HAVE_UNSIGNED_LONG_LONG |
|
817 return false; |
|
818 #else |
|
819 if (n == 0) return false; |
|
820 char buf[kMaxNumberLength+1]; |
|
821 str = TerminateNumber(buf, str, n); |
|
822 if (str[0] == '-') return false; // strtoull() on a negative number?! |
|
823 char* end; |
|
824 errno = 0; |
|
825 #if defined HAVE_STRTOQ |
|
826 unsigned long long r = strtouq(str, &end, radix); |
|
827 #elif defined HAVE_STRTOLL |
|
828 unsigned long long r = strtoull(str, &end, radix); |
|
829 #elif defined HAVE__STRTOI64 |
|
830 unsigned long long r = _strtoui64(str, &end, radix); |
|
831 #else |
|
832 #error parse_ulonglong_radix: cannot convert input to a long-long |
|
833 #endif |
|
834 if (end != str + n) return false; // Leftover junk |
|
835 if (errno) return false; |
|
836 if (dest == NULL) return true; |
|
837 *(reinterpret_cast<unsigned long long*>(dest)) = r; |
|
838 return true; |
|
839 #endif /* HAVE_UNSIGNED_LONG_LONG */ |
|
840 } |
|
841 |
|
842 bool Arg::parse_double(const char* str, int n, void* dest) { |
|
843 if (n == 0) return false; |
|
844 static const int kMaxLength = 200; |
|
845 char buf[kMaxLength]; |
|
846 if (n >= kMaxLength) return false; |
|
847 memcpy(buf, str, n); |
|
848 buf[n] = '\0'; |
|
849 errno = 0; |
|
850 char* end; |
|
851 double r = strtod(buf, &end); |
|
852 if (end != buf + n) return false; // Leftover junk |
|
853 if (errno) return false; |
|
854 if (dest == NULL) return true; |
|
855 *(reinterpret_cast<double*>(dest)) = r; |
|
856 return true; |
|
857 } |
|
858 |
|
859 bool Arg::parse_float(const char* str, int n, void* dest) { |
|
860 double r; |
|
861 if (!parse_double(str, n, &r)) return false; |
|
862 if (dest == NULL) return true; |
|
863 *(reinterpret_cast<float*>(dest)) = static_cast<float>(r); |
|
864 return true; |
|
865 } |
|
866 |
|
867 |
|
868 #define DEFINE_INTEGER_PARSERS(name) \ |
|
869 bool Arg::parse_##name(const char* str, int n, void* dest) { \ |
|
870 return parse_##name##_radix(str, n, dest, 10); \ |
|
871 } \ |
|
872 bool Arg::parse_##name##_hex(const char* str, int n, void* dest) { \ |
|
873 return parse_##name##_radix(str, n, dest, 16); \ |
|
874 } \ |
|
875 bool Arg::parse_##name##_octal(const char* str, int n, void* dest) { \ |
|
876 return parse_##name##_radix(str, n, dest, 8); \ |
|
877 } \ |
|
878 bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \ |
|
879 return parse_##name##_radix(str, n, dest, 0); \ |
|
880 } |
|
881 |
|
882 DEFINE_INTEGER_PARSERS(short) /* */ |
|
883 DEFINE_INTEGER_PARSERS(ushort) /* */ |
|
884 DEFINE_INTEGER_PARSERS(int) /* Don't use semicolons after these */ |
|
885 DEFINE_INTEGER_PARSERS(uint) /* statements because they can cause */ |
|
886 DEFINE_INTEGER_PARSERS(long) /* compiler warnings if the checking */ |
|
887 DEFINE_INTEGER_PARSERS(ulong) /* level is turned up high enough. */ |
|
888 DEFINE_INTEGER_PARSERS(longlong) /* */ |
|
889 DEFINE_INTEGER_PARSERS(ulonglong) /* */ |
|
890 |
|
891 #undef DEFINE_INTEGER_PARSERS |
|
892 |
|
893 } // namespace pcrecpp |