libraries/spcre/inc/cregex.h
changeset 0 7f656887cf89
equal deleted inserted replaced
-1:000000000000 0:7f656887cf89
       
     1 // Copyright (c) 2005 - 2006, Google Inc.
       
     2 // All rights reserved.
       
     3 //
       
     4 // Redistribution and use in source and binary forms, with or without
       
     5 // modification, are permitted provided that the following conditions are
       
     6 // met:
       
     7 //
       
     8 //     * Redistributions of source code must retain the above copyright
       
     9 // notice, this list of conditions and the following disclaimer.
       
    10 //     * Redistributions in binary form must reproduce the above
       
    11 // copyright notice, this list of conditions and the following disclaimer
       
    12 // in the documentation and/or other materials provided with the
       
    13 // distribution.
       
    14 //     * Neither the name of Google Inc. nor the names of its
       
    15 // contributors may be used to endorse or promote products derived from
       
    16 // this software without specific prior written permission.
       
    17 //
       
    18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
       
    19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
       
    20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
       
    21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
       
    22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
       
    23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
       
    24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
       
    25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
       
    26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
       
    27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
       
    28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
       
    29 //
       
    30 // Author: Sanjay Ghemawat
       
    31 //
       
    32 
       
    33 // Heavily refactored for Symbian OS by Accenture.
       
    34 
       
    35 #ifndef CREGEX_H_
       
    36 #define CREGEX_H_
       
    37 
       
    38 #include <e32base.h>
       
    39 #include "tregexoptions.h"
       
    40 
       
    41 // Forward declarations
       
    42 class TRegExArg;
       
    43 struct real_pcre;
       
    44 typedef struct real_pcre pcre;
       
    45 struct pcre_extra;
       
    46 
       
    47 // Errors
       
    48 
       
    49 /** Base offset for CRegEx Errors */
       
    50 const TInt KErrRegExBase 									= -20000;
       
    51 
       
    52 /** Base offset for RegEx pattern compile errors */
       
    53 const TInt KErrRegExCompileBase 							= KErrRegExBase;
       
    54 
       
    55 /** \ at end of pattern */
       
    56 const TInt KErrRegExCmpBackslashAtEOP 						= KErrRegExCompileBase - 1;
       
    57 /** \c at end of pattern */
       
    58 const TInt KErrRegExCmpBackslashCAtEOP 						= KErrRegExCompileBase - 2;
       
    59 /** unrecognized character follows \ */
       
    60 const TInt KErrRegExCmpUnrecCharAftBackslash				= KErrRegExCompileBase - 3;
       
    61 /** numbers out of order in {} quantifier */
       
    62 const TInt KErrRegExCmpNumsOutOfOrderInBraceQuantifier		= KErrRegExCompileBase - 4;
       
    63 /** number too big in {} quantifier */
       
    64 const TInt KErrRegExCmpNumTooBigInBraceQuantifier			= KErrRegExCompileBase - 5;
       
    65 /** missing terminating ] for character class */
       
    66 const TInt KErrRegExCmpMissingTermBracketInCharClass		= KErrRegExCompileBase - 6;
       
    67 /** invalid escape sequence in character class */
       
    68 const TInt KErrRegExCmpInvalidEscapeSeqInCharClass			= KErrRegExCompileBase - 7;
       
    69 /** range out of order in character class */
       
    70 const TInt KErrRegExCmpRangeOutOfOrderInCharClass			= KErrRegExCompileBase - 8;
       
    71 /** nothing to repeat */
       
    72 const TInt KErrRegExCmpNothingToRepeat						= KErrRegExCompileBase - 9;
       
    73 /** operand of unlimited repeat could match the empty string - no longer used */
       
    74 const TInt KErrRegExCmpUnused01								= KErrRegExCompileBase - 10;
       
    75 /** internal error: unexpected repeat */
       
    76 const TInt KErrRegExCmpUnexpectedRepeat						= KErrRegExCompileBase - 11;
       
    77 /** unrecognized character after (? or (?- */
       
    78 const TInt KErrRegExCmpUnexpectedCharAftParenthQuest		= KErrRegExCompileBase - 12;
       
    79 /** POSIX named classes are supported only within a class */
       
    80 const TInt KErrRegExCmpPosixNamedSupportedWithinClass		= KErrRegExCompileBase - 13;
       
    81 /** missing ) */
       
    82 const TInt KErrRegExCmpMissingCloseParenth					= KErrRegExCompileBase - 14;
       
    83 /** reference to non-existent subpattern */
       
    84 const TInt KErrRegExCmpRefNonExistSubpattern				= KErrRegExCompileBase - 15;
       
    85 /** internal error: erroffset passed as NULL */
       
    86 const TInt KErrRegExCmpErrOffsetNull						= KErrRegExCompileBase - 16;
       
    87 /** unknown option bit(s) set */
       
    88 const TInt KErrRegExCmpUnknownOptionBitsSet					= KErrRegExCompileBase - 17;
       
    89 /** missing ) after comment */
       
    90 const TInt KErrRegExCmpMissingCloseParenthAftComment		= KErrRegExCompileBase - 18;
       
    91 /** parentheses nested too deeply - no longer used */
       
    92 const TInt KErrRegExCmpUnused02								= KErrRegExCompileBase - 19;
       
    93 /** regular expression is too large */
       
    94 const TInt KErrRegExCmpExprTooLarge							= KErrRegExCompileBase - 20;
       
    95 /** failed to get memory */
       
    96 const TInt KErrRegExCmpFailedGetMemory						= KErrRegExCompileBase - 21;
       
    97 /** unmatched parentheses */
       
    98 const TInt KErrRegExCmpUnmatchedParenth						= KErrRegExCompileBase - 22;
       
    99 /** internal error: code overflow */
       
   100 const TInt KErrRegExCmpCodeOverflow							= KErrRegExCompileBase - 23;
       
   101 /** unrecognized character after (?< */
       
   102 const TInt KErrRegExCmpUnRecogCharAftParenthQuestAngle		= KErrRegExCompileBase - 24;
       
   103 /** lookbehind assertion is not fixed length */
       
   104 const TInt KErrRegExCmpLookbehindAssertNotFixedLen			= KErrRegExCompileBase - 25;
       
   105 /** malformed number or name after (?( */
       
   106 const TInt KErrRegExCmpMalformedAftParenthQuestParenth		= KErrRegExCompileBase - 26;
       
   107 /** conditional group contains more than two branches */
       
   108 const TInt KErrRegExCmpCondGroupMoreThanTwoBranches			= KErrRegExCompileBase - 27;
       
   109 /** assertion expected after (?( */
       
   110 const TInt KErrRegExCmpAssertExpAftParenthQuestParent		= KErrRegExCompileBase - 28;
       
   111 /** (?R or (?[+-]digits must be followed by ) */
       
   112 const TInt KErrRegExCmpMustFollowedByCloseParenth			= KErrRegExCompileBase - 29;
       
   113 /** unknown POSIX class name */
       
   114 const TInt KErrRegExCmpUnknownPosixClassName				= KErrRegExCompileBase - 30;
       
   115 /** POSIX collating elements are not supported */
       
   116 const TInt KErrRegExCmpPosixCollElemsNotSupported			= KErrRegExCompileBase - 31;
       
   117 /** this version of PCRE is not compiled with PCRE_UTF8 support */
       
   118 const TInt KErrRegExCmpNotCompiledWithUtf8Support			= KErrRegExCompileBase - 32;
       
   119 /** spare error - no longer used */
       
   120 const TInt KErrRegExCmpUnused03								= KErrRegExCompileBase - 33;
       
   121 /** character value in \x{...} sequence is too large */
       
   122 const TInt KErrRegExCmpCharValueInBackslashXSeqTooLarge		= KErrRegExCompileBase - 34;
       
   123 /** invalid condition (?(0) */
       
   124 const TInt KErrRegExCmpInvalidCondition						= KErrRegExCompileBase - 35;
       
   125 /** \C not allowed in lookbehind assertion */
       
   126 const TInt KErrRegExCmpBackslashCNotAllowedinLookbehind		= KErrRegExCompileBase - 36;
       
   127 /** PCRE does not support \L, \l, \N, \U, or \u */
       
   128 const TInt KErrRegExCmpLNUEscapeSeqNotSupported				= KErrRegExCompileBase - 37;
       
   129 /** number after (?C is > 255 */
       
   130 const TInt KErrRegExCmpNumAftParenthQuestCIsGreatherThan	= KErrRegExCompileBase - 38;
       
   131 /** closing ) for (?C expected */
       
   132 const TInt KErrRegExCmpCloseParenthAftParenthQuestCExp		= KErrRegExCompileBase - 39;
       
   133 /** recursive call could loop indefinitely */
       
   134 const TInt KErrRegExCmpRecuriveCallLoopIndef				= KErrRegExCompileBase - 40;
       
   135 /** unrecognized character after (?P */
       
   136 const TInt KErrRegExCmpUnrecCharaftParenthQuestP			= KErrRegExCompileBase - 41;
       
   137 /** syntax error in subpattern name (missing terminator) */
       
   138 const TInt KErrRegExCmpSyntaxInSubpatternName				= KErrRegExCompileBase - 42;
       
   139 /** two named subpatterns have the same name */
       
   140 const TInt KErrRegExCmpTwoSubpatternsHaveSameName			= KErrRegExCompileBase - 43;
       
   141 /** invalid UTF-8 string */
       
   142 const TInt KErrRegExCmpInvalidUtf8String					= KErrRegExCompileBase - 44;
       
   143 /** support for \P, \p, and \X has not been compiled */
       
   144 const TInt KErrRegExCmpSupportForEscapeSeqNotCompiled		= KErrRegExCompileBase - 45;
       
   145 /** malformed \P or \p sequence */
       
   146 const TInt KErrRegExCmpMalformedBackslashPSeq				= KErrRegExCompileBase - 46;
       
   147 /** unknown property name after \P or \p */
       
   148 const TInt KErrRegExCmpUnknownPropNameAftBackslashPSeq		= KErrRegExCompileBase - 47;
       
   149 /** subpattern name is too long. Default max = 32 chars. See MAX_NAME_SIZE */
       
   150 const TInt KErrRegExCmpSubpatternNameTooLong				= KErrRegExCompileBase - 48;
       
   151 /** too many named subpatterns. Default max = 10000. See MAX_NAME_COUNT */
       
   152 const TInt KErrRegExCmpTooManyNamesSubpatterns				= KErrRegExCompileBase - 49;
       
   153 /** repeated subpattern is too long - no longer used */
       
   154 const TInt KErrRegExCmpUnused04								= KErrRegExCompileBase - 50;
       
   155 /** octal value is greater than \377 (not in UTF-8 mode) */
       
   156 const TInt KErrRegExCmpOctalValueGreatherThan377			= KErrRegExCompileBase - 51;
       
   157 /** internal error: overran compiling workspace */
       
   158 const TInt KErrRegExCmpOverranCompilingSpace				= KErrRegExCompileBase - 52;
       
   159 /** internal error: previously-checked referenced subpattern not found */
       
   160 const TInt KErrRegExCmpCheckedSubpatternNotFound			= KErrRegExCompileBase - 53;
       
   161 /** DEFINE group contains more than one branch */
       
   162 const TInt KErrRegExCmpDefineGroupMoreThanOneBranch			= KErrRegExCompileBase - 54;
       
   163 /** repeating a DEFINE group is not allowed */
       
   164 const TInt KErrRegExCmpRepeatingDefineGroupNotAllowed		= KErrRegExCompileBase - 55;
       
   165 /** inconsistent NEWLINE options */
       
   166 const TInt KErrRegExCmpInconsistantNewlineOpts				= KErrRegExCompileBase - 56;
       
   167 /** \g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number */
       
   168 const TInt KErrRegExCmpBackslashGNotFollowed				= KErrRegExCompileBase - 57;
       
   169 /** a numbered reference must not be zero */
       
   170 const TInt KErrRegExCmpNumberedRefIsZero					= KErrRegExCompileBase - 58;
       
   171 /** (*VERB) with an argument is not supported */
       
   172 const TInt KErrRegExCmpAsteriskVerbWithArgNotSupported		= KErrRegExCompileBase - 59;
       
   173 /** (*VERB) not recognized */
       
   174 const TInt KErrRegExCmpAsteriskVerbNotRecog					= KErrRegExCompileBase - 60;
       
   175 /** number is too big */
       
   176 const TInt KErrRegExCmpNumTooBig							= KErrRegExCompileBase - 61;
       
   177 /** subpattern name expected */
       
   178 const TInt KErrRegExCmpSubpatternNameExp					= KErrRegExCompileBase - 62;
       
   179 /** digit expected after (?+ */
       
   180 const TInt KErrRegExCmpDigitExpAftParenthQuestPlus			= KErrRegExCompileBase - 63;
       
   181 /** ] is an invalid data character in JavaScript compatibility mode */
       
   182 const TInt KErrRegExCloseBracketInvalidInJSCompatMode		= KErrRegExCompileBase - 64;
       
   183 
       
   184 /** Base offset for RegEx pattern general errors */
       
   185 const TInt KErrRegExGeneralBase 							= KErrRegExCompileBase - 512;
       
   186 
       
   187 /** Zero Matches */
       
   188 const TInt KErrRegExZeroMatches								= KErrRegExGeneralBase - 1;
       
   189 
       
   190 /** Regular expression has fewer capturing groups than number of args passed in */
       
   191 const TInt KErrRegExFewerCaptureGroupsThanArgs				= KErrRegExGeneralBase - 2;
       
   192 
       
   193 /** Failed to parse argument, e.g. the supplied argument does not have enough capacity */
       
   194 const TInt KErrRegExFailedToParseArg						= KErrRegExGeneralBase - 3;
       
   195 
       
   196 /** The modifiable descriptor supplied to receive output does not have a large enough maximum length */
       
   197 const TInt KErrRegExOutputTooBig							= KErrRegExGeneralBase - 4;
       
   198 
       
   199 /** Invalid rewrite pattern */
       
   200 const TInt KErrRegExInvalidRewritePattern					= KErrRegExGeneralBase - 5;
       
   201 
       
   202 /** Error with one of the backslash substitutions in the supplied rewrite string */
       
   203 const TInt KErrRegExBadBackslashSubsitution					= KErrRegExGeneralBase - 6;
       
   204 
       
   205 // Panic Codes
       
   206 _LIT(KRegExPanic, "CRegEx");
       
   207 
       
   208 /**
       
   209  * Internal CRegEx panics (Debug only)
       
   210  */
       
   211 enum TRegExPanic	// NOTE(review): presumably raised under the KRegExPanic category - confirm at the panic call sites
       
   212 	{
       
   213 	EInvalidMatchResults,		// panic number 0
       
   214 	EUnexpectedRetValFromPcre,	// panic number 1
       
   215 	EVectorTooSmall,		// panic number 2
       
   216 	EInvalidNumArgs			// panic number 3
       
   217 	};
       
   218 
       
   219 /**
       
   220  * Newlines are indicated by a single LF character.
       
   221  * @see KNewLineCr
       
   222  * @see KNewLineCrLf
       
   223  * @see KNewLineAnyCrLf
       
   224  * @see KNewLineAny 
       
   225  */
       
   226 static const TInt KNewLineLf = 10;
       
   227 /**
       
   228  * Newlines are indicated by a single CR character.
       
   229  * @see KNewLineLf
       
   230  * @see KNewLineCrLf
       
   231  * @see KNewLineAnyCrLf
       
   232  * @see KNewLineAny 
       
   233  */
       
   234 static const TInt KNewLineCr = 13;
       
   235 
       
   236 /**
       
   237  * Newlines are indicated by the two-character CRLF sequence. The value 3338 is (13 << 8) | 10, i.e. CR in the high byte and LF in the low byte.
       
   238  * @see KNewLineCr
       
   239  * @see KNewLineLf 
       
   240  * @see KNewLineAnyCrLf
       
   241  * @see KNewLineAny
       
   242  */
       
   243 static const TInt KNewLineCrLf = 3338;
       
   244 
       
   245 /**
       
   246  * Newlines are indicated by any of the following:
       
   247  * - A single CR character.
       
   248  * - A single LF character.
       
   249  * - The two-character CRLF sequence. 
       
   250  * @see KNewLineLf
       
   251  * @see KNewLineCr
       
   252  * @see KNewLineCrLf 
       
   253  * @see KNewLineAny
       
   254  */
       
   255 static const TInt KNewLineAnyCrLf = -2;
       
   256 
       
   257 /**
       
   258  * Newlines are indicated by any Unicode newline sequence:
       
   259  * - A single CR character.
       
   260  * - A single LF character.
       
   261  * - The two-character CRLF sequence. 
       
   262  * - A single VT character (vertical tab, U+000B).
       
   263  * - A single FF character (formfeed, U+000C).
       
   264  * - A single NEL character (next line, U+0085).
       
   265  * - A single LS character (line separator, U+2028). 
       
   266  * - A single PS character (paragraph separator, U+2029).
       
   267  * The last two are recognized only in UTF-8 mode. 
       
   268  * @see KNewLineLf
       
   269  * @see KNewLineCr
       
   270  * @see KNewLineCrLf
       
   271  * @see KNewLineAnyCrLf
       
   272  */
       
   273 static const TInt KNewLineAny = -1;
       
   274 
       
   275 /**
       
   276  * Symbian C++ interface to the pcre regular-expression library. This class, its
       
   277  * supporting classes and most of the following documentation is largely based
       
   278  * on or taken from the C++ wrapper included with source distributions of PCRE
       
   279  * to which all credit should be given.
       
   280  * 
       
   281  * CRegEx supports
       
   282  * Perl-style regular expressions (with extensions like \d, \w, \s,
       
   283  * ...).
       
   284  *
       
   285  * NOTE: These following examples make liberal use of _L8() purely for clarity
       
   286  * and not because it is recommended. In fact, it is strongly discouraged
       
   287  * in favour of _LIT8() as per the standard Symbian coding conventions.
       
   288  * -----------------------------------------------------------------------
       
   289  * REGEXP SYNTAX:
       
   290  *
       
   291  * This module is part of the pcre library and hence supports its syntax
       
   292  * for regular expressions.
       
   293  *
       
   294  * The syntax is pretty similar to Perl's.  For those not familiar
       
   295  * with Perl's regular expressions, here are some examples of the most
       
   296  * commonly used extensions:
       
   297  *
       
   298  *   "hello (\\w+) world"  -- \w matches a "word" character
       
   299  *   "version (\\d+)"      -- \d matches a digit
       
   300  *   "hello\\s+world"      -- \s matches any whitespace character
       
   301  *   "\\b(\\w+)\\b"        -- \b matches empty string at a word boundary
       
   302  *   "(?i)hello"           -- (?i) turns on case-insensitive matching
       
   303  *
       
   304  * -----------------------------------------------------------------------
       
   305  * MATCHING INTERFACE:
       
   306  *
       
   307  * The FullMatchL() operation checks that supplied text matches a
       
   308  * supplied pattern exactly.
       
   309  *
       
   310  * Example: successful match
       
   311  * @code
       
   312  *    CRegEx* re = CRegEx::NewLC(_L8("h.*o"));
       
   313  *    re->FullMatchL(_L8("hello"));
       
   314  *    CleanupStack::PopAndDestroy(re);
       
   315  * @endcode
       
   316  * 
       
   317  * Example: unsuccessful match (requires full match):
       
   318  * @code
       
   319  *    CRegEx* re = CRegEx::NewLC(_L8("e"));
       
   320  *    !re->FullMatchL(_L8("hello"));
       
   321  *    CleanupStack::PopAndDestroy(re);
       
   322  * @endcode
       
   323  *
       
   324  * -----------------------------------------------------------------------
       
   325  * MATCHING WITH SUB-STRING EXTRACTION:
       
   326  *
       
   327  * You can supply extra pointer arguments to extract matched subpieces.
       
   328  *
       
   329  * Example: extracts "ruby" into "s" and 1234 into "i"
       
   330  * @code
       
   331  *    TInt i;
       
   332  *    TBuf<4> s;
       
   333  *    CRegEx* re = CRegEx::NewLC(_L8("(\\w+):(\\d+)"));
       
   334  *    re->FullMatchL(_L8("ruby:1234"), &s, &i);
       
   335  *    CleanupStack::PopAndDestroy(re);
       
   336  * @endcode
       
   337  *
       
   338  * Example: does not try to extract any extra sub-patterns
       
   339  * @code
       
   340  *    re->FullMatchL(_L8("ruby:1234"), &s);
       
   341  * @endcode
       
   342  * 
       
   343  * Example: does not try to extract into NULL
       
   344  * @code
       
   345  *    re->FullMatchL(_L8("ruby:1234"), NULL, &i);
       
   346  * @endcode
       
   347  * 
       
   348  * Example: integer overflow causes failure
       
   349  * @code
       
   350  *    !re->FullMatchL(_L8("ruby:1234567891234"), NULL, &i);
       
   351  * @endcode
       
   352  * 
       
   353  * Example: fails because there aren't enough sub-patterns:
       
   354  * @code
       
   355  *    TBuf<4> s;
       
   356  *    CRegEx* re = CRegEx::NewLC(_L8("\\w+:\\d+"));
       
   357  *    !re->FullMatchL(_L8("ruby:1234"), &s);
       
   358  *    CleanupStack::PopAndDestroy(re);
       
   359  * @endcode
       
   360  * 
       
   361  * Example: fails because string cannot be stored in integer
       
   362  * @code
       
   363  *    TInt i;
       
   364  *    CRegEx* re = CRegEx::NewLC(_L8("(.*)"));
       
   365  *    !re->FullMatchL(_L8("ruby"), &i);
       
   366  *    CleanupStack::PopAndDestroy(re);
       
   367  * @endcode 
       
   368  *
       
   369  * The provided pointer arguments can be pointers to any scalar numeric
       
   370  * type, or one of
       
   371  *    TDes8        (matched piece is copied to descriptor)
       
   372  *    TPtrC8	   (matched piece is pointed to by)
       
   373  *    T            (where "TBool T::ParseFrom(const TDesC8&)" exists)
       
   374  *    NULL         (the corresponding matched sub-pattern is not copied)
       
   375  *
       
   376  * CAVEAT: An optional sub-pattern that does not exist in the matched
       
   377  * string is assigned the empty string.  Therefore, the following will
       
   378  * return false (because the empty string is not a valid number):
       
   379  * @code
       
   380  *    TInt number;
       
   381  *    CRegEx* re = CRegEx::NewLC(_L8("[a-z]+(\\d+)?"));
       
   382  *    re->FullMatchL(_L8("abc"), &number);
       
   383  *    CleanupStack::PopAndDestroy(re); 
       
   384  * @endcode
       
   385  *
       
   386  * -----------------------------------------------------------------------
       
   387  * DO_MATCH
       
   388  *
       
   389  * The matching interface supports at most 4 arguments per call.
       
   390  * If you need more, consider using the more general interface
       
   391  * CRegEx::DoMatchL().
       
   392  *
       
   393  * -----------------------------------------------------------------------
       
   394  * PARTIAL MATCHES
       
   395  *
       
   396  * You can use the PartialMatchL() operation when you want the pattern
       
   397  * to match any substring of the text.
       
   398  *
       
   399  * Example: simple search for a string:
       
   400  * @code
       
   401  *    CRegEx* re = CRegEx::NewLC(_L8("ell"));
       
   402  *    re->PartialMatchL(_L8("hello"));
       
   403  *    CleanupStack::PopAndDestroy(re);
       
   404  * @endcode
       
   405  * 
       
   406  * Example: find first number in a string:
       
   407  * @code
       
   408  *    TInt number;
       
   409  *    CRegEx* re = CRegEx::NewLC(_L8("(\\d+)"));
       
   410  *    re->PartialMatchL(_L8("x*100 + 20"), &number);
       
   411  *    ASSERT(number == 100);
       
   412  *    CleanupStack::PopAndDestroy(re);
       
   413  * @endcode
       
   414  *
       
   415  * -----------------------------------------------------------------------
       
   416  * UTF-8 AND THE MATCHING INTERFACE:
       
   417  *
       
   418  * By default, pattern and text are plain text, one byte per character.
       
   419  * The UTF8 flag, passed to the constructor, causes both pattern
       
   420  * and string to be treated as UTF-8 text, still a byte stream but
       
   421  * potentially multiple bytes per character. In practice, the text
       
   422  * is likelier to be UTF-8 than the pattern, but the match returned
       
   423  * may depend on the UTF8 flag, so always use it when matching
       
   424  * UTF8 text.  E.g., "." will match one byte normally but with UTF8
       
   425  * set may match up to three bytes of a multi-byte character.
       
   426  *
       
   427  * Example:
       
   428  * @code
       
   429  *    TRegExOptions options;
       
   430  *    options.SetUtf8(ETrue);
       
   431  *    CRegEx* re = CRegEx::NewLC(utf8Pattern, options);
       
   432  *    re->FullMatchL(utf8String);
       
   433  *    CleanupStack::PopAndDestroy(re);
       
   434  * @endcode
       
   435  * NOTE: The UTF8 option is ignored if libpcre was not compiled with the
       
   436  *       SUPPORT_UTF8 macro.
       
   437  *
       
   438  * -----------------------------------------------------------------------
       
   439  * PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE
       
   440  *
       
   441  * SPCRE defines some modifiers to change the behavior of the regular
       
   442  * expression engine.
       
   443  * The C++ wrapper defines an auxiliary class, TRegExOptions, as a vehicle
       
   444  * to pass such modifiers to a CRegEx class.
       
   445  *
       
   446  * Currently, the following modifiers are supported
       
   447  *
       
   448  *    modifier              description               Perl corresponding
       
   449  *
       
   450  *    EPcreCaseless         case insensitive match    /i
       
   451  *    EPcreMultiline        multiple lines match      /m
       
   452  *    EPcreDotAll           dot matches newlines      /s
       
   453  *    EPcreDollarEndOnly    $ matches only at end     N/A
       
   454  *    EPcreExtra            strict escape parsing     N/A
       
   455  *    EPcreExtended         ignore whitespaces        /x
       
   456  *    EPcreUtf8             handles UTF8 chars        built-in
       
   457  *    EPcreUngreedy         reverses * and *?         N/A
       
   458  *    EPcreNoAutoCapture    disables matching parens  N/A (*)
       
   459  *
       
   460  * (For a full account on how each modifier works, please check the
       
   461  * PCRE API reference manual).
       
   462  *
       
   463  * (*) Both Perl and PCRE allow non matching parentheses by means of the
       
   464  * "?:" modifier within the pattern itself. e.g. (?:ab|cd) does not
       
   465  * capture, while (ab|cd) does.
       
   466  *
       
   467  * For each modifier, there are two member functions whose name is made
       
   468  * out of the modifier, without the "EPcre" prefix. For
       
   469  * instance, EPcreCaseless is handled by
       
   470  *    TBool Caseless(),
       
   471  * which returns ETrue if the modifier is set, and
       
   472  *    TRegExOptions SetCaseless(TBool),
       
   473  * which sets or unsets the modifier.
       
   474  *
       
   475  * Moreover, PCRE_EXTRA_MATCH_LIMIT can be accessed through the
       
   476  * SetMatchLimit() and MatchLimit() member functions.
       
   477  * Setting the match limit to a non-zero value will limit the execution of
       
   478  * SPCRE to keep it from doing bad things like blowing the stack or taking
       
   479  * an eternity to return a result.  A value of 5000 is good enough to stop
       
   480  * stack blowup in a 2MB thread stack.  Setting MatchLimit to zero will
       
   481  * disable match limiting.  Alternately, you can set MatchLimitRecursion()
       
   482  * which uses PCRE_EXTRA_MATCH_LIMIT_RECURSION to limit how much SPCRE
       
   483  * recurses.  MatchLimit() caps the number of matches pcre does;
       
   484  * MatchLimitRecursion() caps the depth of recursion.
       
   485  *
       
   486  * Normally, to pass one or more modifiers to a CRegEx class, you declare
       
   487  * a TRegExOptions object, set the appropriate options, and pass this
       
   488  * object to a CRegEx constructor. Example:
       
   489  *
       
   490  * @code
       
   491  *    TRegExOptions opt;
       
   492  *    opt.SetCaseless(ETrue);
       
   493  *    CRegEx* re = CRegEx::NewLC(_L8("HELLO"), opt);
       
   494  *    if(re->PartialMatchL(_L8("hello world"))) ...
       
   495  * @endcode
       
   496  *
       
   497  * -----------------------------------------------------------------------
       
   498  * SCANNING TEXT INCREMENTALLY
       
   499  *
       
   500  * The ConsumeL() operation may be useful if you want to repeatedly
       
   501  * match regular expressions at the front of a string and skip over
       
   502  * them as they match.  This requires use of the "StringPiece" type,
       
   503  * which represents a sub-range of a real string.  Like RE, StringPiece
       
   504  * is defined in the pcrecpp namespace.
       
   505  *
       
   506  * Example: read lines of the form "var = value" from a string.
       
   507  * @code
       
   508  *    TBuf8<KContentLength> contents;
       
   509  *    // fill contents somehow
       
   510  *    TBuf8<KMaxVarLength> var;
       
   511  *    TInt value;
       
   512  *    CRegEx* re = CRegEx::NewLC(_L8("(\\w+) = (\\d+)\n"));
       
   513  *    while(re->ConsumeL(contents, &var, &value))
       
   514  *    	{
       
   515  *    	...
       
   516  *    	}
       
   517  *    CleanupStack::PopAndDestroy(re);
       
   518  * @endcode
       
   519  * 
       
   520  * Each successful call to ConsumeL will set "var/value", and also
       
   521  * advance "contents" so it points past the matched text.
       
   522  *
       
   523  * The FindAndConsumeL() operation is similar to ConsumeL() but does not
       
   524  * anchor your match at the beginning of the string.  For example, you
       
   525  * could extract all words from a string by repeatedly calling
       
   526  * @code
       
   527  *    TBuf8<KContentLength> contents;
       
   528  *    // fill contents somehow
       
   529  *    TBuf8<KMaxWordLength> word;
       
   530  *    TInt value;
       
   531  *    CRegEx* re = CRegEx::NewLC(_L8("(\\w+)"));
       
   532  *    while(re->FindAndConsumeL(contents, &word))
       
   533  *    	{
       
   534  *    	...
       
   535  *    	}
       
   536  *    CleanupStack::PopAndDestroy(re);
       
   537  * @endcode
       
   538  * -----------------------------------------------------------------------
       
   539  * PARSING HEX/OCTAL NUMBERS
       
   540  *
       
   541  * By default, if you pass a pointer to a numeric value, the
       
   542  * corresponding text is interpreted as a base-10 number.  You can
       
   543  * instead wrap the pointer with a call to one of the operators Hex(),
       
   544  * or Octal() to interpret the text in another base.  
       
   545  *
       
   546  * Example:
       
   547  * @code
       
   548  *    TInt a, b;
       
   549  *    CRegEx* re = CRegEx::NewLC(_L8("(.*) (.*)"));
       
   550  *    re->FullMatchL(_L8("100 40"), Octal(&a), Hex(&b));
       
   551  *    CleanupStack::PopAndDestroy(re);
       
   552  * @endcode
       
   553  * will leave 64 in a and b.
       
   554  *
       
   555  * -----------------------------------------------------------------------
       
   556  * REPLACING PARTS OF STRINGS
       
   557  *
       
   558  * You can replace the first match of aPattern in aString with
       
   559  * aRewrite.  Within aRewrite, backslash-escaped digits (\1 to \9)
       
   560  * can be used to insert text matching corresponding parenthesized
       
   561  * group from the pattern.  \0 in aRewrite refers to the entire
       
   562  * matching text.  E.g.,
       
   563  * @code
       
   564  *    _LIT8(KYabbaDabbaDoo, "yabba dabba doo");
       
   565  *    TBuf8<20> s(KYabbaDabbaDoo);
       
   566  *    CRegEx* re = CRegEx::NewLC(_L8("b+"));
       
   567  *    re->ReplaceL(_L8("d"), s);
       
   568  *    CleanupStack::PopAndDestroy(re);
       
   569  * @endcode
       
   570  * will leave "s" containing "yada dabba doo".  The result is ETrue if
       
   571  * the pattern matches and a replacement occurs, or EFalse otherwise.
       
   572  *
       
   573  * GlobalReplaceL() is like ReplaceL(), except that it replaces all
       
   574  * occurrences of the pattern in the string with the rewrite.
       
   575  * Replacements are not subject to re-matching.  E.g.,
       
   576  * @code
       
   577  *    _LIT8(KYabbaDabbaDoo, "yabba dabba doo");
       
   578  *    TBuf8<20> s(KYabbaDabbaDoo);
       
   579  *    CRegEx* re = CRegEx::NewLC(_L8("b+"));
       
   580  *    re->GlobalReplaceL(_L8("d"), s);
       
   581  *    CleanupStack::PopAndDestroy(re);
       
   582  * @endcode
       
   583  * will leave "s" containing "yada dada doo".  It returns the number
       
   584  * of replacements made.
       
   585  *
       
   586  * ExtractL() is like ReplaceL(), except that if the pattern matches,
       
   587  * aRewrite is copied into aOut (an additional argument) with
       
   588  * substitutions.  The non-matching portions of aText are ignored.
       
   589  * Returns ETrue if a match occurred and the extraction happened
       
   590  * successfully.  If no match occurs, the string is left unaffected.
       
   591  */
       
   592 class CRegEx : public CBase
       
   593 	{
       
   594 public:
       
   595 	// Anchoring mode requested for a match (TODO: Should be restructured as part of TRegExOptions)
       
   596 	enum TAnchor
       
   597 		{
       
   598 		EUnanchored,		/**< No anchoring */
       
   599 		EAnchorStart,		/**< Anchor at start only */
       
   600 		EAnchorBoth			/**< Anchor at start and end */
       
   601 		};
       
   602 	
       
   603 public:
       
   604 	IMPORT_C static CRegEx* NewL(const TDesC8& aPattern);	// Factory functions: the constructors are private, so instances are created through NewL()/NewLC().
       
   605 	IMPORT_C static CRegEx* NewL(const TDesC8& aPattern, const TRegExOptions& aOptions);
       
   606 	IMPORT_C static CRegEx* NewLC(const TDesC8& aPattern);
       
   607 	IMPORT_C static CRegEx* NewLC(const TDesC8& aPattern, const TRegExOptions& aOptions);
       
   608 
       
   609 	IMPORT_C static CRegEx* NewL(const TDesC16& aPattern, const TRegExOptions& aOptions);	// Overload taking a 16-bit descriptor pattern; no NewLC() counterpart is declared.
       
   610 	
       
   611 	IMPORT_C ~CRegEx();
       
   612 	
       
   613 	inline const TDesC8& Pattern() const;	// Inline accessor; presumably defined in cregex.inl (included at the end of this header) and returning iPattern - confirm.
       
   614 	inline TInt Error() const;	// Inline accessor; presumably returns iErrorCode - confirm against cregex.inl.
       
   615 
       
   616 	IMPORT_C TBool FullMatchL(const TDesC8& aText) const;	// FullMatchL() overloads accept zero to four TRegExArg output arguments.
       
   617 	
       
   618 	IMPORT_C TBool FullMatchL(const TDesC8& aText,
       
   619 			  const TRegExArg& aArg1) const;
       
   620 	
       
   621 	IMPORT_C TBool FullMatchL(const TDesC8& aText,
       
   622 			  const TRegExArg& aArg1,
       
   623 			  const TRegExArg& aArg2) const;
       
   624 	
       
   625 	IMPORT_C TBool FullMatchL(const TDesC8& aText,
       
   626 			  const TRegExArg& aArg1,
       
   627 			  const TRegExArg& aArg2,
       
   628 			  const TRegExArg& aArg3) const;
       
   629 	
       
   630 	IMPORT_C TBool FullMatchL(const TDesC8& aText,
       
   631 			  const TRegExArg& aArg1,
       
   632 			  const TRegExArg& aArg2,
       
   633 			  const TRegExArg& aArg3,
       
   634 			  const TRegExArg& aArg4) const;	
       
   635 	
       
   636 	IMPORT_C TBool PartialMatchL(const TDesC8& aText) const;	// PartialMatchL() overloads mirror the FullMatchL() set (zero to four TRegExArg arguments).
       
   637 	
       
   638 	IMPORT_C TBool PartialMatchL(const TDesC8& aText,
       
   639 			  const TRegExArg& aArg1) const;
       
   640 	
       
   641 	IMPORT_C TBool PartialMatchL(const TDesC8& aText,
       
   642 			  const TRegExArg& aArg1,
       
   643 			  const TRegExArg& aArg2) const;
       
   644 	
       
   645 	IMPORT_C TBool PartialMatchL(const TDesC8& aText,
       
   646 			  const TRegExArg& aArg1,
       
   647 			  const TRegExArg& aArg2,
       
   648 			  const TRegExArg& aArg3) const;
       
   649 	
       
   650 	IMPORT_C TBool PartialMatchL(const TDesC8& aText,
       
   651 			  const TRegExArg& aArg1,
       
   652 			  const TRegExArg& aArg2,
       
   653 			  const TRegExArg& aArg3,
       
   654 			  const TRegExArg& aArg4) const;
       
   655 	
       
   656 	IMPORT_C TBool DoMatchL(const TDesC8& aText,	// DoMatchL() overloads take an explicit TAnchor; NOTE(review): aConsumed presumably receives the length of text consumed by the match - confirm.
       
   657 			  TAnchor aAnchor,
       
   658 			  TInt&	aConsumed) const;
       
   659 	
       
   660 	IMPORT_C TBool DoMatchL(const TDesC8& aText,
       
   661 			  TAnchor aAnchor,
       
   662 			  TInt&	aConsumed,
       
   663 			  const TRegExArg& aArg1) const;
       
   664 	
       
   665 	IMPORT_C TBool DoMatchL(const TDesC8& aText,
       
   666 			  TAnchor aAnchor,
       
   667 			  TInt&	aConsumed,
       
   668 			  const TRegExArg& aArg1,
       
   669 			  const TRegExArg& aArg2) const;
       
   670 	
       
   671 	IMPORT_C TBool DoMatchL(const TDesC8& aText,
       
   672 			  TAnchor aAnchor,
       
   673 			  TInt&	aConsumed,
       
   674 			  const TRegExArg& aArg1,
       
   675 			  const TRegExArg& aArg2,
       
   676 			  const TRegExArg& aArg3) const;
       
   677 	
       
   678 	IMPORT_C TBool DoMatchL(const TDesC8& aText,
       
   679 			  TAnchor aAnchor,
       
   680 			  TInt&	aConsumed,
       
   681 			  const TRegExArg& aArg1,
       
   682 			  const TRegExArg& aArg2,
       
   683 			  const TRegExArg& aArg3,
       
   684 			  const TRegExArg& aArg4) const;
       
   685 	
       
   686 	IMPORT_C TBool DoMatchL(const TDesC8& aText,	// Overload taking the output arguments as an array instead of individual parameters.
       
   687 			  TAnchor aAnchor,
       
   688 			  TInt&	aConsumed,			
       
   689 			  const RPointerArray<const TRegExArg>& aArgs) const;
       
   690 	
       
   691 	
       
   692 	IMPORT_C TBool ConsumeL(TDes8& aText) const;	// ConsumeL() overloads take a modifiable descriptor (TDes8&) rather than const text.
       
   693 	
       
   694 	IMPORT_C TBool ConsumeL(TDes8& aText,
       
   695 			  const TRegExArg& aArg1) const;
       
   696 	
       
   697 	IMPORT_C TBool ConsumeL(TDes8& aText,
       
   698 			  const TRegExArg& aArg1,
       
   699 			  const TRegExArg& aArg2) const;
       
   700 	
       
   701 	IMPORT_C TBool ConsumeL(TDes8& aText,
       
   702 			  const TRegExArg& aArg1,
       
   703 			  const TRegExArg& aArg2,
       
   704 			  const TRegExArg& aArg3) const;
       
   705 	
       
   706 	IMPORT_C TBool ConsumeL(TDes8& aText,
       
   707 			  const TRegExArg& aArg1,
       
   708 			  const TRegExArg& aArg2,
       
   709 			  const TRegExArg& aArg3,
       
   710 			  const TRegExArg& aArg4) const;
       
   711 	
       
   712 	IMPORT_C TBool FindAndConsumeL(TDes8& aText) const;	// FindAndConsumeL() overloads; the class documentation shows them called repeatedly in a loop to extract successive words.
       
   713 	
       
   714 	IMPORT_C TBool FindAndConsumeL(TDes8& aText,
       
   715 			  const TRegExArg& aArg1) const;
       
   716 	
       
   717 	IMPORT_C TBool FindAndConsumeL(TDes8& aText,
       
   718 			  const TRegExArg& aArg1,
       
   719 			  const TRegExArg& aArg2) const;
       
   720 	
       
   721 	IMPORT_C TBool FindAndConsumeL(TDes8& aText,
       
   722 			  const TRegExArg& aArg1,
       
   723 			  const TRegExArg& aArg2,
       
   724 			  const TRegExArg& aArg3) const;
       
   725 	
       
   726 	IMPORT_C TBool FindAndConsumeL(TDes8& aText,
       
   727 			  const TRegExArg& aArg1,
       
   728 			  const TRegExArg& aArg2,
       
   729 			  const TRegExArg& aArg3,
       
   730 			  const TRegExArg& aArg4) const;
       
   731 	
       
   732 	IMPORT_C TBool ReplaceL(const TDesC8& aRewrite, TDes8& aString) const;	// Replaces the first match in aString with aRewrite; ETrue if a replacement occurred.
       
   733 	
       
   734 	IMPORT_C TInt GlobalReplaceL(const TDesC8& aRewrite, TDes8& aString) const;	// Replaces every match in aString with aRewrite; returns the number of replacements made.
       
   735 	
       
   736 	IMPORT_C TBool ExtractL(const TDesC8& aRewrite, const TDesC8& aText, TDes8& aOut) const;	// On a match, copies aRewrite (with substitutions) into aOut; non-matching parts of aText are ignored.
       
   737 
       
   738 	IMPORT_C static TInt NewlineMode(TInt aOptions);
       
   739 	
       
   740 	IMPORT_C static HBufC8* QuoteMetaL(const TDesC8& aUnquoted);	// NOTE(review): returns a heap descriptor; ownership presumably passes to the caller - confirm.
       
   741 
       
   742 	IMPORT_C TInt NumberOfCapturingGroups() const;
       
   743 	
       
   744 	IMPORT_C void Study();	
       
   745 
       
   746 private:
       
   747 	CRegEx();	// Private: construct through NewL()/NewLC().
       
   748 	CRegEx(const TRegExOptions& aOptions);
       
   749 	
       
   750 	void ConstructL(const TDesC8& aPattern);
       
   751 	void ConstructL(const TDesC16& aPattern);
       
   752 	void CommonConstructL();
       
   753 	
       
   754 	pcre* CompileL(TAnchor anchor);
       
   755 	void Cleanup();
       
   756 
       
   757 	TInt TryMatch(const TDesC8& aText,
       
   758 			   TInt aStartPos,
       
   759 			   TAnchor aAnchor,
       
   760 			   TInt* aVector,
       
   761 			   TInt aVectorSize) const;
       
   762 	
       
   763 
       
   764 	TBool Rewrite(TDes8& aOut,
       
   765 			   const TDesC8& aRewrite,
       
   766 			   const TDesC8& aText,
       
   767 			   TInt* aVector,
       
   768 			   TInt aVectorSize,
       
   769 			   TInt aMatches) const;
       
   770 	
       
   771 	TBool DoMatchImpl(const TDesC8& aText,
       
   772 				   TAnchor aAnchor,
       
   773 				   TInt& aConsumed,
       
   774 				   const RPointerArray<const TRegExArg>& aArgs,
       
   775 				   TInt* aVector,
       
   776 				   TInt aVectorSize) const;
       
   777 	
       
   778 
       
   779 	static void Panic(TRegExPanic aPanic);
       
   780 
       
   781 	mutable TInt					iErrorCode;		// Error code for the last unsuccessful operation (mutable so const member functions can update it).
       
   782 	TInt							iErrorOffset;	// Offset in pattern where error was detected
       
   783 	HBufC8*							iPattern;		// Regular expression pattern
       
   784 	TRegExOptions					iOptions;		// Options used to compile RE pattern.
       
   785 	pcre*							iReFull;		// For full matches
       
   786 	pcre*							iRePartial;		// For partial matches
       
   787 	pcre_extra*						iExtraPartial;	// Study Data for iRePartial
       
   788 	TRegExArg*						iNoArg;			// Default argument
       
   789 	};
       
   790 	
       
   791 #include <cregex.inl>
       
   792 #endif /* CREGEX_H_ */