// Copyright (c) 2005 - 2006, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author: Sanjay Ghemawat
//

// Heavily refactored for Symbian OS by Accenture.

35 #ifndef CREGEX_H_ |
|
36 #define CREGXX_H_ |
|
37 |
|
38 #include <e32base.h> |
|
39 #include "tregexoptions.h" |
|
40 |
|
41 // Forward declarations |
|
42 class TRegExArg; |
|
43 struct real_pcre; |
|
44 typedef struct real_pcre pcre; |
|
45 struct pcre_extra; |
|
46 |
|
47 // Errors |
|
48 |
|
49 /** Base offset for CRegEx Errors */ |
|
50 const TInt KErrRegExBase = -20000; |
|
51 |
|
52 /** Base offset for RegEx pattern compile errors */ |
|
53 const TInt KErrRegExCompileBase = KErrRegExBase; |
|
54 |
|
55 /** \ at end of pattern */ |
|
56 const TInt KErrRegExCmpBackslashAtEOP = KErrRegExCompileBase - 1; |
|
57 /** \c at end of pattern */ |
|
58 const TInt KErrRegExCmpBackslashCAtEOP = KErrRegExCompileBase - 2; |
|
59 /** unrecognized character follows \ */ |
|
60 const TInt KErrRegExCmpUnrecCharAftBackslash = KErrRegExCompileBase - 3; |
|
61 /** numbers out of order in {} quantifier */ |
|
62 const TInt KErrRegExCmpNumsOutOfOrderInBraceQuantifier = KErrRegExCompileBase - 4; |
|
63 /** number too big in {} quantifier */ |
|
64 const TInt KErrRegExCmpNumTooBigInBraceQuantifier = KErrRegExCompileBase - 5; |
|
65 /** missing terminating ] for character class */ |
|
66 const TInt KErrRegExCmpMissingTermBracketInCharClass = KErrRegExCompileBase - 6; |
|
67 /** invalid escape sequence in character class */ |
|
68 const TInt KErrRegExCmpInvalidEscapeSeqInCharClass = KErrRegExCompileBase - 7; |
|
69 /** range out of order in character class */ |
|
70 const TInt KErrRegExCmpRangeOutOfOrderInCharClass = KErrRegExCompileBase - 8; |
|
71 /** nothing to repeat */ |
|
72 const TInt KErrRegExCmpNothingToRepeat = KErrRegExCompileBase - 9; |
|
73 /** operand of unlimited repeat could match the empty string - no longer used */ |
|
74 const TInt KErrRegExCmpUnused01 = KErrRegExCompileBase - 10; |
|
75 /** internal error: unexpected repeat */ |
|
76 const TInt KErrRegExCmpUnexpectedRepeat = KErrRegExCompileBase - 11; |
|
77 /** unrecognized character after (? or (?-\0t */ |
|
78 const TInt KErrRegExCmpUnexpectedCharAftParenthQuest = KErrRegExCompileBase - 12; |
|
79 /** POSIX named classes are supported only within a class */ |
|
80 const TInt KErrRegExCmpPosixNamedSupportedWithinClass = KErrRegExCompileBase - 13; |
|
81 /** missing ) */ |
|
82 const TInt KErrRegExCmpMissingCloseParenth = KErrRegExCompileBase - 14; |
|
83 /** reference to non-existent subpattern */ |
|
84 const TInt KErrRegExCmpRefNonExistSubpattern = KErrRegExCompileBase - 15; |
|
85 /** internal error: erroffset passed as NULL */ |
|
86 const TInt KErrRegExCmpErrOffsetNull = KErrRegExCompileBase - 16; |
|
87 /** unknown option bit(s) set */ |
|
88 const TInt KErrRegExCmpUnknownOptionBitsSet = KErrRegExCompileBase - 17; |
|
89 /** missing ) after comment */ |
|
90 const TInt KErrRegExCmpMissingCloseParenthAftComment = KErrRegExCompileBase - 18; |
|
91 /** parentheses nested too deeply - no longer used */ |
|
92 const TInt KErrRegExCmpUnused02 = KErrRegExCompileBase - 19; |
|
93 /** regular expression is too large */ |
|
94 const TInt KErrRegExCmpExprTooLarge = KErrRegExCompileBase - 20; |
|
95 /** failed to get memory */ |
|
96 const TInt KErrRegExCmpFailedGetMemory = KErrRegExCompileBase - 21; |
|
97 /** unmatched parentheses */ |
|
98 const TInt KErrRegExCmpUnmatchedParenth = KErrRegExCompileBase - 22; |
|
99 /** internal error: code overflow */ |
|
100 const TInt KErrRegExCmpCodeOverflow = KErrRegExCompileBase - 23; |
|
101 /** unrecognized character after (?< */ |
|
102 const TInt KErrRegExCmpUnRecogCharAftParenthQuestAngle = KErrRegExCompileBase - 24; |
|
103 /** lookbehind assertion is not fixed length*/ |
|
104 const TInt KErrRegExCmpLookbehindAssertNotFixedLen = KErrRegExCompileBase - 25; |
|
105 /** malformed number or name after (?( */ |
|
106 const TInt KErrRegExCmpMalformedAftParenthQuestParenth = KErrRegExCompileBase - 26; |
|
107 /** conditional group contains more than two branches */ |
|
108 const TInt KErrRegExCmpCondGroupMoreThanTwoBranches = KErrRegExCompileBase - 27; |
|
109 /** assertion expected after (?(\0 */ |
|
110 const TInt KErrRegExCmpAssertExpAftParenthQuestParent = KErrRegExCompileBase - 28; |
|
111 /** (?R or (?[+-]digits must be followed by ) */ |
|
112 const TInt KErrRegExCmpMustFollowedByCloseParenth = KErrRegExCompileBase - 29; |
|
113 /** unknown POSIX class name */ |
|
114 const TInt KErrRegExCmpUnknownPosixClassName = KErrRegExCompileBase - 30; |
|
115 /** POSIX collating elements are not supported */ |
|
116 const TInt KErrRegExCmpPosixCollElemsNotSupported = KErrRegExCompileBase - 31; |
|
117 /** this version of PCRE is not compiled with PCRE_UTF8 support */ |
|
118 const TInt KErrRegExCmpNotCompiledWithUtf8Support = KErrRegExCompileBase - 32; |
|
119 /** spare error - no longer user */ |
|
120 const TInt KErrRegExCmpUnused03 = KErrRegExCompileBase - 33; |
|
121 /** character value in \x{...} sequence is too large */ |
|
122 const TInt KErrRegExCmpCharValueInBackslashXSeqTooLarge = KErrRegExCompileBase - 34; |
|
123 /** invalid condition (?(0) */ |
|
124 const TInt KErrRegExCmpInvalidCondition = KErrRegExCompileBase - 35; |
|
125 /** \C not allowed in lookbehind assertion */ |
|
126 const TInt KErrRegExCmpBackslashCNotAllowedinLookbehind = KErrRegExCompileBase - 36; |
|
127 /** PCRE does not support \L, \l, \N, \U, or \u\0 */ |
|
128 const TInt KErrRegExCmpLNUEscapeSeqNotSupported = KErrRegExCompileBase - 37; |
|
129 /** number after (?C is > 255 */ |
|
130 const TInt KErrRegExCmpNumAftParenthQuestCIsGreatherThan = KErrRegExCompileBase - 38; |
|
131 /** closing ) for (?C expected\ */ |
|
132 const TInt KErrRegExCmpCloseParenthAftParenthQuestCExp = KErrRegExCompileBase - 39; |
|
133 /** recursive call could loop indefinitely */ |
|
134 const TInt KErrRegExCmpRecuriveCallLoopIndef = KErrRegExCompileBase - 40; |
|
135 /** unrecognized character after (?P */ |
|
136 const TInt KErrRegExCmpUnrecCharaftParenthQuestP = KErrRegExCompileBase - 41; |
|
137 /** syntax error in subpattern name (missing terminator) */ |
|
138 const TInt KErrRegExCmpSyntaxInSubpatternName = KErrRegExCompileBase - 42; |
|
139 /** two named subpatterns have the same name */ |
|
140 const TInt KErrRegExCmpTwoSubpatternsHaveSameName = KErrRegExCompileBase - 43; |
|
141 /** invalid UTF-8 string */ |
|
142 const TInt KErrRegExCmpInvalidUtf8String = KErrRegExCompileBase - 44; |
|
143 /** support for \P, \p, and \X has not been compiled */ |
|
144 const TInt KErrRegExCmpSupportForEscapeSeqNotCompiled = KErrRegExCompileBase - 45; |
|
145 /** malformed \P or \p sequence */ |
|
146 const TInt KErrRegExCmpMalformedBackslashPSeq = KErrRegExCompileBase - 46; |
|
147 /** unknown property name after \P or \p */ |
|
148 const TInt KErrRegExCmpUnknownPropNameAftBackslashPSeq = KErrRegExCompileBase - 47; |
|
149 /** subpattern name is too long. Default max = 32 chars. See MAX_NAME_SIZE */ |
|
150 const TInt KErrRegExCmpSubpatternNameTooLong = KErrRegExCompileBase - 48; |
|
151 /** subpattern name is too long. Default max = 10000. See MAX_NAME_COUNT */ |
|
152 const TInt KErrRegExCmpTooManyNamesSubpatterns = KErrRegExCompileBase - 49; |
|
153 /** repeated subpattern is too long - no longer used */ |
|
154 const TInt KErrRegExCmpUnused04 = KErrRegExCompileBase - 50; |
|
155 /** octal value is greater than \\377 (not in UTF-8 mode) */ |
|
156 const TInt KErrRegExCmpOctalValueGreatherThan377 = KErrRegExCompileBase - 51; |
|
157 /** internal error: overran compiling workspace */ |
|
158 const TInt KErrRegExCmpOverranCompilingSpace = KErrRegExCompileBase - 52; |
|
159 /** internal error: previously-checked referenced subpattern not found */ |
|
160 const TInt KErrRegExCmpCheckedSubpatternNotFound = KErrRegExCompileBase - 53; |
|
161 /** DEFINE group contains more than one branch */ |
|
162 const TInt KErrRegExCmpDefineGroupMoreThanOneBranch = KErrRegExCompileBase - 54; |
|
163 /** repeating a DEFINE group is not allowed */ |
|
164 const TInt KErrRegExCmpRepeatingDefineGroupNotAllowed = KErrRegExCompileBase - 55; |
|
165 /** inconsistent NEWLINE options */ |
|
166 const TInt KErrRegExCmpInconsistantNewlineOpts = KErrRegExCompileBase - 56; |
|
167 /** \g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number */ |
|
168 const TInt KErrRegExCmpBackslashGNotFollowed = KErrRegExCompileBase - 57; |
|
169 /** a numbered reference must not be zero */ |
|
170 const TInt KErrRegExCmpNumberedRefIsZero = KErrRegExCompileBase - 58; |
|
171 /** (*VERB) with an argument is not supported */ |
|
172 const TInt KErrRegExCmpAsteriskVerbWithArgNotSupported = KErrRegExCompileBase - 59; |
|
173 /** (*VERB) not recognized */ |
|
174 const TInt KErrRegExCmpAsteriskVerbNotRecog = KErrRegExCompileBase - 60; |
|
175 /** number is too big */ |
|
176 const TInt KErrRegExCmpNumTooBig = KErrRegExCompileBase - 61; |
|
177 /** subpattern name expected */ |
|
178 const TInt KErrRegExCmpSubpatternNameExp = KErrRegExCompileBase - 62; |
|
179 /** digit expected after (?+ */ |
|
180 const TInt KErrRegExCmpDigitExpAftParenthQuestPlus = KErrRegExCompileBase - 63; |
|
181 /** ] is an invalid data character in JavaScript compatibility mode */ |
|
182 const TInt KErrRegExCloseBracketInvalidInJSCompatMode = KErrRegExCompileBase - 64; |
|
183 |
|
184 /** Base offset for RegEx pattern general errors */ |
|
185 const TInt KErrRegExGeneralBase = KErrRegExCompileBase - 512; |
|
186 |
|
187 /** Zero Matches */ |
|
188 const TInt KErrRegExZeroMatches = KErrRegExGeneralBase - 1; |
|
189 |
|
190 /** Regular expression has fewer capturing groups than number of args passed in */ |
|
191 const TInt KErrRegExFewerCaptureGroupsThanArgs = KErrRegExGeneralBase - 2; |
|
192 |
|
193 /** Failed to parse argument, e.g. the supplied argument does not have enough capacity */ |
|
194 const TInt KErrRegExFailedToParseArg = KErrRegExGeneralBase - 3; |
|
195 |
|
196 /** The modifiable descriptor supplied to recieve output does not have a large enough maximum length */ |
|
197 const TInt KErrRegExOutputTooBig = KErrRegExGeneralBase - 4; |
|
198 |
|
199 /** Invalid rewrite pattern */ |
|
200 const TInt KErrRegExInvalidRewritePattern = KErrRegExGeneralBase - 5; |
|
201 |
|
202 /** Error with one of the backslash substitutions in the supplied rewrite string */ |
|
203 const TInt KErrRegExBadBackslashSubsitution = KErrRegExGeneralBase - 6; |
|
204 |
|
205 // Panic Codes |
|
206 _LIT(KRegExPanic, "CRegEx"); |
|
207 |
|
/**
 * Internal CRegEx panics (Debug only).
 *
 * The numeric values are written out so that a panic number can be mapped
 * back to its name by reading this header. They are the same values the
 * enumerators had when they were implicitly numbered.
 * The per-enumerator notes are inferred from the names only; confirm them
 * against the call sites of Panic().
 */
enum TRegExPanic
    {
    EInvalidMatchResults      = 0,  /**< presumably: match results were inconsistent */
    EUnexpectedRetValFromPcre = 1,  /**< presumably: PCRE returned an unhandled value */
    EVectorTooSmall           = 2,  /**< presumably: output vector too small for the captures */
    EInvalidNumArgs           = 3   /**< presumably: argument count out of range */
    };

219 /* |
|
220 * Newlines are indicated by a single LF character. |
|
221 * @see KNewLineCr |
|
222 * @see KNewLineCrLf |
|
223 * @see KNewLineAnyCrLf |
|
224 * @see KNewLineAny |
|
225 */ |
|
226 static const TInt KNewLineLf = 10; |
|
227 /* |
|
228 * Newlines are indicated by a single CR character. |
|
229 * @see KNewLineLf |
|
230 * @see KNewLineCrLf |
|
231 * @see KNewLineAnyCrLf |
|
232 * @see KNewLineAny |
|
233 */ |
|
234 static const TInt KNewLineCr = 13; |
|
235 |
|
236 /* |
|
237 * Newlines are indicated by the two-character CRLF sequence. |
|
238 * @see KNewLineCr |
|
239 * @see KNewLineLf |
|
240 * @see KNewLineAnyCrLf |
|
241 * @see KNewLineAny |
|
242 */ |
|
243 static const TInt KNewLineCrLf = 3338; |
|
244 |
|
245 /* |
|
246 * Newlines are indicated by any of the following: |
|
247 * - A single CR character. |
|
248 * - A single LF character. |
|
249 * - The two-character CRLF sequence. |
|
250 * @see KNewLineLf |
|
251 * @see KNewLineCr |
|
252 * @see KNewLineCrLf |
|
253 * @see KNewLineAny |
|
254 */ |
|
255 static const TInt KNewLineAnyCrLf = -2; |
|
256 |
|
257 /* |
|
258 * Newlines are indicated by any Unicode sequence: |
|
259 * - A single CR character. |
|
260 * - A single LF character. |
|
261 * - The two-character CRLF sequence. |
|
262 * - A single VT character (vertical tab, U+000B). |
|
263 * - A single FF character (formfeed, U+000C). |
|
264 * - A single NEL character (next line, U+0085). |
|
265 * - A single LS character (line separator, U+2028). |
|
266 * - A single PS character (paragraph separator, U+2029). |
|
267 * The last two are recognized only in UTF-8 mode. |
|
268 * @see KNewLineLf |
|
269 * @see KNewLineCr |
|
270 * @see KNewLineCrLf |
|
271 * @see KNewLineAnyCrLf |
|
272 */ |
|
273 static const TInt KNewLineAny = -1; |
|
274 |
|
/**
 * Symbian C++ interface to the pcre regular-expression library. This class, its
 * supporting classes and most of the following documentation are largely based
 * on or taken from the C++ wrapper included with source distributions of PCRE
 * to which all credit should be given.
 *
 * CRegEx supports
 * Perl-style regular expressions (with extensions like \d, \w, \s,
 * ...).
 *
 * NOTE: The following examples make liberal use of _L8() purely for clarity
 * and not because it is recommended. In fact, it is strongly discouraged
 * in favour of _LIT8() as per the standard Symbian coding conventions.
 * -----------------------------------------------------------------------
 * REGEXP SYNTAX:
 *
 * This module is part of the pcre library and hence supports its syntax
 * for regular expressions.
 *
 * The syntax is pretty similar to Perl's. For those not familiar
 * with Perl's regular expressions, here are some examples of the most
 * commonly used extensions:
 *
 *   "hello (\\w+) world"  -- \w matches a "word" character
 *   "version (\\d+)"      -- \d matches a digit
 *   "hello\\s+world"      -- \s matches any whitespace character
 *   "\\b(\\w+)\\b"        -- \b matches empty string at a word boundary
 *   "(?i)hello"           -- (?i) turns on case-insensitive matching
 *
 * -----------------------------------------------------------------------
 * MATCHING INTERFACE:
 *
 * The FullMatchL() operation checks that supplied text matches a
 * supplied pattern exactly.
 *
 * Example: successful match
 * @code
 * CRegEx* re = CRegEx::NewLC(_L8("h.*o"));
 * re->FullMatchL(_L8("hello"));
 * CleanupStack::PopAndDestroy(re);
 * @endcode
 *
 * Example: unsuccessful match (requires full match):
 * @code
 * CRegEx* re = CRegEx::NewLC(_L8("e"));
 * !re->FullMatchL(_L8("hello"));
 * CleanupStack::PopAndDestroy(re);
 * @endcode
 *
 * -----------------------------------------------------------------------
 * MATCHING WITH SUB-STRING EXTRACTION:
 *
 * You can supply extra pointer arguments to extract matched subpieces.
 *
 * Example: extracts "ruby" into "s" and 1234 into "i"
 * @code
 * TInt i;
 * TBuf8<4> s;
 * CRegEx* re = CRegEx::NewLC(_L8("(\\w+):(\\d+)"));
 * re->FullMatchL(_L8("ruby:1234"), &s, &i);
 * CleanupStack::PopAndDestroy(re);
 * @endcode
 *
 * Example: does not try to extract any extra sub-patterns
 * @code
 * re->FullMatchL(_L8("ruby:1234"), &s);
 * @endcode
 *
 * Example: does not try to extract into NULL
 * @code
 * re->FullMatchL(_L8("ruby:1234"), NULL, &i);
 * @endcode
 *
 * Example: integer overflow causes failure
 * @code
 * !re->FullMatchL(_L8("ruby:1234567891234"), NULL, &i);
 * @endcode
 *
 * Example: fails because there aren't enough sub-patterns:
 * @code
 * TBuf8<4> s;
 * CRegEx* re = CRegEx::NewLC(_L8("\\w+:\\d+"));
 * !re->FullMatchL(_L8("ruby:1234"), &s);
 * CleanupStack::PopAndDestroy(re);
 * @endcode
 *
 * Example: fails because string cannot be stored in integer
 * @code
 * TInt i;
 * CRegEx* re = CRegEx::NewLC(_L8("(.*)"));
 * !re->FullMatchL(_L8("ruby"), &i);
 * CleanupStack::PopAndDestroy(re);
 * @endcode
 *
 * The provided pointer arguments can be pointers to any scalar numeric
 * type, or one of
 *    TDes8   (matched piece is copied to descriptor)
 *    TPtrC8  (set to point at the matched piece)
 *    T       (where "TBool T::ParseFrom(const TDesC8&)" exists)
 *    NULL    (the corresponding matched sub-pattern is not copied)
 *
 * CAVEAT: An optional sub-pattern that does not exist in the matched
 * string is assigned the empty string. Therefore, the following will
 * return false (because the empty string is not a valid number):
 * @code
 * TInt number;
 * CRegEx* re = CRegEx::NewLC(_L8("[a-z]+(\\d+)?"));
 * re->FullMatchL(_L8("abc"), &number);
 * CleanupStack::PopAndDestroy(re);
 * @endcode
 *
 * -----------------------------------------------------------------------
 * DO_MATCH
 *
 * The matching interface supports at most 4 arguments per call.
 * If you need more, consider using the more general interface
 * CRegEx::DoMatchL(), which has an overload taking an RPointerArray of
 * arguments.
 *
 * -----------------------------------------------------------------------
 * PARTIAL MATCHES
 *
 * You can use the PartialMatchL() operation when you want the pattern
 * to match any substring of the text.
 *
 * Example: simple search for a string:
 * @code
 * CRegEx* re = CRegEx::NewLC(_L8("ell"));
 * re->PartialMatchL(_L8("hello"));
 * CleanupStack::PopAndDestroy(re);
 * @endcode
 *
 * Example: find first number in a string:
 * @code
 * TInt number;
 * CRegEx* re = CRegEx::NewLC(_L8("(\\d+)"));
 * re->PartialMatchL(_L8("x*100 + 20"), &number);
 * ASSERT(number == 100);
 * CleanupStack::PopAndDestroy(re);
 * @endcode
 *
 * -----------------------------------------------------------------------
 * UTF-8 AND THE MATCHING INTERFACE:
 *
 * By default, pattern and text are plain text, one byte per character.
 * The UTF8 option, set on the TRegExOptions object passed when the CRegEx
 * is created, causes both pattern and string to be treated as UTF-8 text,
 * still a byte stream but potentially multiple bytes per character.
 * In practice, the text is likelier to be UTF-8 than the pattern, but
 * the match returned may depend on the UTF8 flag, so always use it when
 * matching UTF8 text. E.g., "." will match one byte normally but with UTF8
 * set may match up to three bytes of a multi-byte character.
 *
 * Example:
 * @code
 * TRegExOptions options;
 * options.SetUtf8(ETrue);
 * CRegEx* re = CRegEx::NewLC(utf8Pattern, options);
 * re->FullMatchL(utf8String);
 * CleanupStack::PopAndDestroy(re);
 * @endcode
 * NOTE: The UTF8 option is ignored if libpcre was not compiled with the
 * SUPPORT_UTF8 macro.
 *
 * -----------------------------------------------------------------------
 * PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE
 *
 * SPCRE defines some modifiers to change the behavior of the regular
 * expression engine.
 * The C++ wrapper defines an auxiliary class, TRegExOptions, as a vehicle
 * to pass such modifiers to a CRegEx class.
 *
 * Currently, the following modifiers are supported
 *
 *    modifier              description               Perl corresponding
 *
 *    EPcreCaseless         case insensitive match    /i
 *    EPcreMultiline        multiple lines match      /m
 *    EPcreDotAll           dot matches newlines      /s
 *    EPcreDollarEndOnly    $ matches only at end     N/A
 *    EPcreExtra            strict escape parsing     N/A
 *    EPcreExtended         ignore whitespaces        /x
 *    EPcreUtf8             handles UTF8 chars        built-in
 *    EPcreUngreedy         reverses * and *?         N/A
 *    EPcreNoAutoCapture    disables matching parens  N/A (*)
 *
 * (For a full account on how each modifier works, please check the
 * PCRE API reference manual).
 *
 * (*) Both Perl and PCRE allow non matching parentheses by means of the
 * "?:" modifier within the pattern itself. e.g. (?:ab|cd) does not
 * capture, while (ab|cd) does.
 *
 * For each modifier, there are two member functions whose name is made
 * out of the modifier, without the "EPcre" prefix. For
 * instance, EPcreCaseless is handled by
 *    TBool Caseless(),
 * which returns ETrue if the modifier is set, and
 *    TRegExOptions SetCaseless(TBool),
 * which sets or unsets the modifier.
 *
 * Moreover, PCRE_EXTRA_MATCH_LIMIT can be accessed through the
 * SetMatchLimit() and MatchLimit() member functions.
 * Setting the match limit to a non-zero value will limit the execution of
 * SPCRE to keep it from doing bad things like blowing the stack or taking
 * an eternity to return a result. A value of 5000 is good enough to stop
 * stack blowup in a 2MB thread stack. Setting MatchLimit to zero will
 * disable match limiting. Alternately, you can set MatchLimitRecursion()
 * which uses PCRE_EXTRA_MATCH_LIMIT_RECURSION to limit how much SPCRE
 * recurses. MatchLimit() caps the number of matches pcre does;
 * MatchLimitRecursion() caps the depth of recursion.
 *
 * Normally, to pass one or more modifiers to a CRegEx class, you declare
 * a TRegExOptions object, set the appropriate options, and pass this
 * object to a CRegEx constructor. Example:
 *
 * @code
 * TRegExOptions opt;
 * opt.SetCaseless(ETrue);
 * CRegEx* re = CRegEx::NewLC(_L8("HELLO"), opt);
 * if(re->PartialMatchL(_L8("hello world"))) ...
 * CleanupStack::PopAndDestroy(re);
 * @endcode
 *
 * -----------------------------------------------------------------------
 * SCANNING TEXT INCREMENTALLY
 *
 * The ConsumeL() operation may be useful if you want to repeatedly
 * match regular expressions at the front of a string and skip over
 * them as they match. The original pcrecpp wrapper required its
 * "StringPiece" type (a sub-range of a real string, defined in the
 * pcrecpp namespace) for this; in this interface the text is instead
 * passed as a modifiable 8-bit descriptor (TDes8).
 *
 * Example: read lines of the form "var = value" from a string.
 * @code
 * TBuf8<KContentLength> contents;
 * // fill contents somehow
 * TBuf8<KMaxVarLength> var;
 * TInt value;
 * CRegEx* re = CRegEx::NewLC(_L8("(\\w+) = (\\d+)\n"));
 * while(re->ConsumeL(contents, &var, &value))
 *     {
 *     ...
 *     }
 * CleanupStack::PopAndDestroy(re);
 * @endcode
 *
 * Each successful call to ConsumeL will set "var/value", and also
 * advance "contents" so it points past the matched text.
 *
 * The FindAndConsumeL() operation is similar to ConsumeL() but does not
 * anchor your match at the beginning of the string. For example, you
 * could extract all words from a string by repeatedly calling
 * @code
 * TBuf8<KContentLength> contents;
 * // fill contents somehow
 * TBuf8<KMaxWordLength> word;
 * CRegEx* re = CRegEx::NewLC(_L8("(\\w+)"));
 * while(re->FindAndConsumeL(contents, &word))
 *     {
 *     ...
 *     }
 * CleanupStack::PopAndDestroy(re);
 * @endcode
 * -----------------------------------------------------------------------
 * PARSING HEX/OCTAL NUMBERS
 *
 * By default, if you pass a pointer to a numeric value, the
 * corresponding text is interpreted as a base-10 number. You can
 * instead wrap the pointer with a call to one of the operators Hex()
 * or Octal() to interpret the text in another base.
 *
 * Example:
 * @code
 * TInt a, b;
 * CRegEx* re = CRegEx::NewLC(_L8("(.*) (.*)"));
 * re->FullMatchL(_L8("100 40"), Octal(&a), Hex(&b));
 * CleanupStack::PopAndDestroy(re);
 * @endcode
 * will leave 64 in a and b (octal 100 and hexadecimal 40 are both 64).
 *
 * -----------------------------------------------------------------------
 * REPLACING PARTS OF STRINGS
 *
 * You can replace the first match of aPattern in aString with
 * aRewrite. Within aRewrite, backslash-escaped digits (\1 to \9)
 * can be used to insert text matching corresponding parenthesized
 * group from the pattern. \0 in aRewrite refers to the entire
 * matching text. E.g.,
 * @code
 * _LIT8(KYabbaDabbaDoo, "yabba dabba doo");
 * TBuf8<20> s(KYabbaDabbaDoo);
 * CRegEx* re = CRegEx::NewLC(_L8("b+"));
 * re->ReplaceL(_L8("d"), s);
 * CleanupStack::PopAndDestroy(re);
 * @endcode
 * will leave "s" containing "yada dabba doo". The result is ETrue if
 * the pattern matches and a replacement occurs, or EFalse otherwise.
 *
 * GlobalReplaceL() is like ReplaceL(), except that it replaces all
 * occurrences of the pattern in the string with the rewrite.
 * Replacements are not subject to re-matching. E.g.,
 * @code
 * _LIT8(KYabbaDabbaDoo, "yabba dabba doo");
 * TBuf8<20> s(KYabbaDabbaDoo);
 * CRegEx* re = CRegEx::NewLC(_L8("b+"));
 * re->GlobalReplaceL(_L8("d"), s);
 * CleanupStack::PopAndDestroy(re);
 * @endcode
 * will leave "s" containing "yada dada doo". It returns the number
 * of replacements made.
 *
 * ExtractL() is like ReplaceL(), except that if the pattern matches,
 * aRewrite is copied into aOut (an additional argument) with
 * substitutions. The non-matching portions of aText are ignored.
 * Returns ETrue if a match occurred and the extraction happened
 * successfully. If no match occurs, the string is left unaffected.
 */
592 class CRegEx : public CBase |
|
593 { |
|
594 public: |
|
595 // Type of match (TODO: Should be restructured as part of TRegExOptions) |
|
596 enum TAnchor |
|
597 { |
|
598 EUnanchored, /** No anchoring */ |
|
599 EAnchorStart, /** Anchor at start only */ |
|
600 EAnchorBoth /** Anchor at start and end */ |
|
601 }; |
|
602 |
|
603 public: |
|
604 IMPORT_C static CRegEx* NewL(const TDesC8& aPattern); |
|
605 IMPORT_C static CRegEx* NewL(const TDesC8& aPattern, const TRegExOptions& aOptions); |
|
606 IMPORT_C static CRegEx* NewLC(const TDesC8& aPattern); |
|
607 IMPORT_C static CRegEx* NewLC(const TDesC8& aPattern, const TRegExOptions& aOptions); |
|
608 |
|
609 IMPORT_C static CRegEx* NewL(const TDesC16& aPattern, const TRegExOptions& aOptions); |
|
610 |
|
611 IMPORT_C ~CRegEx(); |
|
612 |
|
613 inline const TDesC8& Pattern() const; |
|
614 inline TInt Error() const; |
|
615 |
|
616 IMPORT_C TBool FullMatchL(const TDesC8& aText) const; |
|
617 |
|
618 IMPORT_C TBool FullMatchL(const TDesC8& aText, |
|
619 const TRegExArg& aArg1) const; |
|
620 |
|
621 IMPORT_C TBool FullMatchL(const TDesC8& aText, |
|
622 const TRegExArg& aArg1, |
|
623 const TRegExArg& aArg2) const; |
|
624 |
|
625 IMPORT_C TBool FullMatchL(const TDesC8& aText, |
|
626 const TRegExArg& aArg1, |
|
627 const TRegExArg& aArg2, |
|
628 const TRegExArg& aArg3) const; |
|
629 |
|
630 IMPORT_C TBool FullMatchL(const TDesC8& aText, |
|
631 const TRegExArg& aArg1, |
|
632 const TRegExArg& aArg2, |
|
633 const TRegExArg& aArg3, |
|
634 const TRegExArg& aArg4) const; |
|
635 |
|
636 IMPORT_C TBool PartialMatchL(const TDesC8& aText) const; |
|
637 |
|
638 IMPORT_C TBool PartialMatchL(const TDesC8& aText, |
|
639 const TRegExArg& aArg1) const; |
|
640 |
|
641 IMPORT_C TBool PartialMatchL(const TDesC8& aText, |
|
642 const TRegExArg& aArg1, |
|
643 const TRegExArg& aArg2) const; |
|
644 |
|
645 IMPORT_C TBool PartialMatchL(const TDesC8& aText, |
|
646 const TRegExArg& aArg1, |
|
647 const TRegExArg& aArg2, |
|
648 const TRegExArg& aArg3) const; |
|
649 |
|
650 IMPORT_C TBool PartialMatchL(const TDesC8& aText, |
|
651 const TRegExArg& aArg1, |
|
652 const TRegExArg& aArg2, |
|
653 const TRegExArg& aArg3, |
|
654 const TRegExArg& aArg4) const; |
|
655 |
|
656 IMPORT_C TBool DoMatchL(const TDesC8& aText, |
|
657 TAnchor aAnchor, |
|
658 TInt& aConsumed) const; |
|
659 |
|
660 IMPORT_C TBool DoMatchL(const TDesC8& aText, |
|
661 TAnchor aAnchor, |
|
662 TInt& aConsumed, |
|
663 const TRegExArg& aArg1) const; |
|
664 |
|
665 IMPORT_C TBool DoMatchL(const TDesC8& aText, |
|
666 TAnchor aAnchor, |
|
667 TInt& aConsumed, |
|
668 const TRegExArg& aArg1, |
|
669 const TRegExArg& aArg2) const; |
|
670 |
|
671 IMPORT_C TBool DoMatchL(const TDesC8& aText, |
|
672 TAnchor aAnchor, |
|
673 TInt& aConsumed, |
|
674 const TRegExArg& aArg1, |
|
675 const TRegExArg& aArg2, |
|
676 const TRegExArg& aArg3) const; |
|
677 |
|
678 IMPORT_C TBool DoMatchL(const TDesC8& aText, |
|
679 TAnchor aAnchor, |
|
680 TInt& aConsumed, |
|
681 const TRegExArg& aArg1, |
|
682 const TRegExArg& aArg2, |
|
683 const TRegExArg& aArg3, |
|
684 const TRegExArg& aArg4) const; |
|
685 |
|
686 IMPORT_C TBool DoMatchL(const TDesC8& aText, |
|
687 TAnchor aAnchor, |
|
688 TInt& aConsumed, |
|
689 const RPointerArray<const TRegExArg>& aArgs) const; |
|
690 |
|
691 |
|
692 IMPORT_C TBool ConsumeL(TDes8& aText) const; |
|
693 |
|
694 IMPORT_C TBool ConsumeL(TDes8& aText, |
|
695 const TRegExArg& aArg1) const; |
|
696 |
|
697 IMPORT_C TBool ConsumeL(TDes8& aText, |
|
698 const TRegExArg& aArg1, |
|
699 const TRegExArg& aArg2) const; |
|
700 |
|
701 IMPORT_C TBool ConsumeL(TDes8& aText, |
|
702 const TRegExArg& aArg1, |
|
703 const TRegExArg& aArg2, |
|
704 const TRegExArg& aArg3) const; |
|
705 |
|
706 IMPORT_C TBool ConsumeL(TDes8& aText, |
|
707 const TRegExArg& aArg1, |
|
708 const TRegExArg& aArg2, |
|
709 const TRegExArg& aArg3, |
|
710 const TRegExArg& aArg4) const; |
|
711 |
|
712 IMPORT_C TBool FindAndConsumeL(TDes8& aText) const; |
|
713 |
|
714 IMPORT_C TBool FindAndConsumeL(TDes8& aText, |
|
715 const TRegExArg& aArg1) const; |
|
716 |
|
717 IMPORT_C TBool FindAndConsumeL(TDes8& aText, |
|
718 const TRegExArg& aArg1, |
|
719 const TRegExArg& aArg2) const; |
|
720 |
|
721 IMPORT_C TBool FindAndConsumeL(TDes8& aText, |
|
722 const TRegExArg& aArg1, |
|
723 const TRegExArg& aArg2, |
|
724 const TRegExArg& aArg3) const; |
|
725 |
|
726 IMPORT_C TBool FindAndConsumeL(TDes8& aText, |
|
727 const TRegExArg& aArg1, |
|
728 const TRegExArg& aArg2, |
|
729 const TRegExArg& aArg3, |
|
730 const TRegExArg& aArg4) const; |
|
731 |
|
732 IMPORT_C TBool ReplaceL(const TDesC8& aRewrite, TDes8& aString) const; |
|
733 |
|
734 IMPORT_C TInt GlobalReplaceL(const TDesC8& aRewrite, TDes8& aString) const; |
|
735 |
|
736 IMPORT_C TBool ExtractL(const TDesC8& aRewrite, const TDesC8& aText, TDes8& aOut) const; |
|
737 |
|
738 IMPORT_C static TInt NewlineMode(TInt aOptions); |
|
739 |
|
740 IMPORT_C static HBufC8* QuoteMetaL(const TDesC8& aUnquoted); |
|
741 |
|
742 IMPORT_C TInt NumberOfCapturingGroups() const; |
|
743 |
|
744 IMPORT_C void Study(); |
|
745 |
|
746 private: |
|
747 CRegEx(); |
|
748 CRegEx(const TRegExOptions& aOptions); |
|
749 |
|
750 void ConstructL(const TDesC8& aPattern); |
|
751 void ConstructL(const TDesC16& aPattern); |
|
752 void CommonConstructL(); |
|
753 |
|
754 pcre* CompileL(TAnchor anchor); |
|
755 void Cleanup(); |
|
756 |
|
757 TInt TryMatch(const TDesC8& aText, |
|
758 TInt aStartPos, |
|
759 TAnchor aAnchor, |
|
760 TInt* aVector, |
|
761 TInt aVectorSize) const; |
|
762 |
|
763 |
|
764 TBool Rewrite(TDes8& aOut, |
|
765 const TDesC8& aRewrite, |
|
766 const TDesC8& aText, |
|
767 TInt* aVector, |
|
768 TInt aVectorSize, |
|
769 TInt aMatches) const; |
|
770 |
|
771 TBool DoMatchImpl(const TDesC8& aText, |
|
772 TAnchor aAnchor, |
|
773 TInt& aConsumed, |
|
774 const RPointerArray<const TRegExArg>& aArgs, |
|
775 TInt* aVector, |
|
776 TInt aVectorSize) const; |
|
777 |
|
778 |
|
779 static void Panic(TRegExPanic aPanic); |
|
780 |
|
781 mutable TInt iErrorCode; // Error code for the alst unsuccessful operation. |
|
782 TInt iErrorOffset; // Offset in pattern where error was detected |
|
783 HBufC8* iPattern; // Regular expression pattern |
|
784 TRegExOptions iOptions; // Options used to compile RE pattern. |
|
785 pcre* iReFull; // For full matches |
|
786 pcre* iRePartial; // For partial matches |
|
787 pcre_extra* iExtraPartial; // Study Data for iRePartial |
|
788 TRegExArg* iNoArg; // Default argument |
|
789 }; |
|
790 |
|
791 #include <cregex.inl> |
|
792 #endif /* CREGEX_H_ */ |