// MCL/sf/os/fshell: libraries/spcre/inc/cregex.h@257450419d10


// Copyright (c) 2005 - 2006, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author: Sanjay Ghemawat
//

// Heavily refactored for Symbian OS by Accenture.

// Include guard. The macro defined here must be the same one tested by the
// #ifndef, otherwise the guard never takes effect and a second inclusion of
// this header redefines every constant and the CRegEx class.
#ifndef CREGEX_H_
#define CREGEX_H_

#include <e32base.h>
#include "tregexoptions.h"

// Forward declarations. This header only uses these types through pointers
// and references, so their full definitions are not required here.
class TRegExArg;
struct real_pcre;
// pcre is an alias for struct real_pcre; CRegEx holds its compiled patterns
// through pcre* pointers.
typedef struct real_pcre pcre;
struct pcre_extra;

// Errors

/** Base offset for CRegEx errors. All CRegEx error codes in this header are negative offsets from this value. */
const TInt KErrRegExBase 									= -20000;

/** Base offset for RegEx pattern compile errors (same value as KErrRegExBase) */
const TInt KErrRegExCompileBase 							= KErrRegExBase;

/** \ at end of pattern */
const TInt KErrRegExCmpBackslashAtEOP 						= KErrRegExCompileBase - 1;
/** \c at end of pattern */
const TInt KErrRegExCmpBackslashCAtEOP 						= KErrRegExCompileBase - 2;
/** unrecognized character follows \ */
const TInt KErrRegExCmpUnrecCharAftBackslash				= KErrRegExCompileBase - 3;
/** numbers out of order in {} quantifier */
const TInt KErrRegExCmpNumsOutOfOrderInBraceQuantifier		= KErrRegExCompileBase - 4;
/** number too big in {} quantifier */
const TInt KErrRegExCmpNumTooBigInBraceQuantifier			= KErrRegExCompileBase - 5;
/** missing terminating ] for character class */
const TInt KErrRegExCmpMissingTermBracketInCharClass		= KErrRegExCompileBase - 6;
/** invalid escape sequence in character class */
const TInt KErrRegExCmpInvalidEscapeSeqInCharClass			= KErrRegExCompileBase - 7;
/** range out of order in character class */
const TInt KErrRegExCmpRangeOutOfOrderInCharClass			= KErrRegExCompileBase - 8;
/** nothing to repeat */
const TInt KErrRegExCmpNothingToRepeat						= KErrRegExCompileBase - 9;
/** operand of unlimited repeat could match the empty string - no longer used */
const TInt KErrRegExCmpUnused01								= KErrRegExCompileBase - 10;
/** internal error: unexpected repeat */
const TInt KErrRegExCmpUnexpectedRepeat						= KErrRegExCompileBase - 11;
/** unrecognized character after (? or (?- */
const TInt KErrRegExCmpUnexpectedCharAftParenthQuest		= KErrRegExCompileBase - 12;
/** POSIX named classes are supported only within a class */
const TInt KErrRegExCmpPosixNamedSupportedWithinClass		= KErrRegExCompileBase - 13;
/** missing ) */
const TInt KErrRegExCmpMissingCloseParenth					= KErrRegExCompileBase - 14;
/** reference to non-existent subpattern */
const TInt KErrRegExCmpRefNonExistSubpattern				= KErrRegExCompileBase - 15;
/** internal error: erroffset passed as NULL */
const TInt KErrRegExCmpErrOffsetNull						= KErrRegExCompileBase - 16;
/** unknown option bit(s) set */
const TInt KErrRegExCmpUnknownOptionBitsSet					= KErrRegExCompileBase - 17;
/** missing ) after comment */
const TInt KErrRegExCmpMissingCloseParenthAftComment		= KErrRegExCompileBase - 18;
/** parentheses nested too deeply - no longer used */
const TInt KErrRegExCmpUnused02								= KErrRegExCompileBase - 19;
/** regular expression is too large */
const TInt KErrRegExCmpExprTooLarge							= KErrRegExCompileBase - 20;
/** failed to get memory */
const TInt KErrRegExCmpFailedGetMemory						= KErrRegExCompileBase - 21;
/** unmatched parentheses */
const TInt KErrRegExCmpUnmatchedParenth						= KErrRegExCompileBase - 22;
/** internal error: code overflow */
const TInt KErrRegExCmpCodeOverflow							= KErrRegExCompileBase - 23;
/** unrecognized character after (?< */
const TInt KErrRegExCmpUnRecogCharAftParenthQuestAngle		= KErrRegExCompileBase - 24;
/** lookbehind assertion is not fixed length */
const TInt KErrRegExCmpLookbehindAssertNotFixedLen			= KErrRegExCompileBase - 25;
/** malformed number or name after (?( */
const TInt KErrRegExCmpMalformedAftParenthQuestParenth		= KErrRegExCompileBase - 26;
/** conditional group contains more than two branches */
const TInt KErrRegExCmpCondGroupMoreThanTwoBranches			= KErrRegExCompileBase - 27;
/** assertion expected after (?( */
const TInt KErrRegExCmpAssertExpAftParenthQuestParent		= KErrRegExCompileBase - 28;
/** (?R or (?[+-]digits must be followed by ) */
const TInt KErrRegExCmpMustFollowedByCloseParenth			= KErrRegExCompileBase - 29;
/** unknown POSIX class name */
const TInt KErrRegExCmpUnknownPosixClassName				= KErrRegExCompileBase - 30;
/** POSIX collating elements are not supported */
const TInt KErrRegExCmpPosixCollElemsNotSupported			= KErrRegExCompileBase - 31;
/** this version of PCRE is not compiled with PCRE_UTF8 support */
const TInt KErrRegExCmpNotCompiledWithUtf8Support			= KErrRegExCompileBase - 32;
/** spare error - no longer used */
const TInt KErrRegExCmpUnused03								= KErrRegExCompileBase - 33;
/** character value in \x{...} sequence is too large */
const TInt KErrRegExCmpCharValueInBackslashXSeqTooLarge		= KErrRegExCompileBase - 34;
/** invalid condition (?(0) */
const TInt KErrRegExCmpInvalidCondition						= KErrRegExCompileBase - 35;
/** \C not allowed in lookbehind assertion */
const TInt KErrRegExCmpBackslashCNotAllowedinLookbehind		= KErrRegExCompileBase - 36;
/** PCRE does not support \L, \l, \N, \U, or \u */
const TInt KErrRegExCmpLNUEscapeSeqNotSupported				= KErrRegExCompileBase - 37;
/** number after (?C is > 255 */
const TInt KErrRegExCmpNumAftParenthQuestCIsGreatherThan	= KErrRegExCompileBase - 38;
/** closing ) for (?C expected */
const TInt KErrRegExCmpCloseParenthAftParenthQuestCExp		= KErrRegExCompileBase - 39;
/** recursive call could loop indefinitely */
const TInt KErrRegExCmpRecuriveCallLoopIndef				= KErrRegExCompileBase - 40;
/** unrecognized character after (?P */
const TInt KErrRegExCmpUnrecCharaftParenthQuestP			= KErrRegExCompileBase - 41;
/** syntax error in subpattern name (missing terminator) */
const TInt KErrRegExCmpSyntaxInSubpatternName				= KErrRegExCompileBase - 42;
/** two named subpatterns have the same name */
const TInt KErrRegExCmpTwoSubpatternsHaveSameName			= KErrRegExCompileBase - 43;
/** invalid UTF-8 string */
const TInt KErrRegExCmpInvalidUtf8String					= KErrRegExCompileBase - 44;
/** support for \P, \p, and \X has not been compiled */
const TInt KErrRegExCmpSupportForEscapeSeqNotCompiled		= KErrRegExCompileBase - 45;
/** malformed \P or \p sequence */
const TInt KErrRegExCmpMalformedBackslashPSeq				= KErrRegExCompileBase - 46;
/** unknown property name after \P or \p */
const TInt KErrRegExCmpUnknownPropNameAftBackslashPSeq		= KErrRegExCompileBase - 47;
/** subpattern name is too long. Default max = 32 chars. See MAX_NAME_SIZE */
const TInt KErrRegExCmpSubpatternNameTooLong				= KErrRegExCompileBase - 48;
/** too many named subpatterns. Default max = 10000. See MAX_NAME_COUNT */
const TInt KErrRegExCmpTooManyNamesSubpatterns				= KErrRegExCompileBase - 49;
/** repeated subpattern is too long - no longer used */
const TInt KErrRegExCmpUnused04								= KErrRegExCompileBase - 50;
/** octal value is greater than \377 (not in UTF-8 mode) */
const TInt KErrRegExCmpOctalValueGreatherThan377			= KErrRegExCompileBase - 51;
/** internal error: overran compiling workspace */
const TInt KErrRegExCmpOverranCompilingSpace				= KErrRegExCompileBase - 52;
/** internal error: previously-checked referenced subpattern not found */
const TInt KErrRegExCmpCheckedSubpatternNotFound			= KErrRegExCompileBase - 53;
/** DEFINE group contains more than one branch */
const TInt KErrRegExCmpDefineGroupMoreThanOneBranch			= KErrRegExCompileBase - 54;
/** repeating a DEFINE group is not allowed */
const TInt KErrRegExCmpRepeatingDefineGroupNotAllowed		= KErrRegExCompileBase - 55;
/** inconsistent NEWLINE options */
const TInt KErrRegExCmpInconsistantNewlineOpts				= KErrRegExCompileBase - 56;
/** \g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number */
const TInt KErrRegExCmpBackslashGNotFollowed				= KErrRegExCompileBase - 57;
/** a numbered reference must not be zero */
const TInt KErrRegExCmpNumberedRefIsZero					= KErrRegExCompileBase - 58;
/** (*VERB) with an argument is not supported */
const TInt KErrRegExCmpAsteriskVerbWithArgNotSupported		= KErrRegExCompileBase - 59;
/** (*VERB) not recognized */
const TInt KErrRegExCmpAsteriskVerbNotRecog					= KErrRegExCompileBase - 60;
/** number is too big */
const TInt KErrRegExCmpNumTooBig							= KErrRegExCompileBase - 61;
/** subpattern name expected */
const TInt KErrRegExCmpSubpatternNameExp					= KErrRegExCompileBase - 62;
/** digit expected after (?+ */
const TInt KErrRegExCmpDigitExpAftParenthQuestPlus			= KErrRegExCompileBase - 63;
/** ] is an invalid data character in JavaScript compatibility mode */
const TInt KErrRegExCloseBracketInvalidInJSCompatMode		= KErrRegExCompileBase - 64;

/** Base offset for RegEx pattern general errors; placed 512 below the compile error base so the two ranges do not overlap */
const TInt KErrRegExGeneralBase 							= KErrRegExCompileBase - 512;

/** Zero Matches */
const TInt KErrRegExZeroMatches								= KErrRegExGeneralBase - 1;

/** Regular expression has fewer capturing groups than number of args passed in */
const TInt KErrRegExFewerCaptureGroupsThanArgs				= KErrRegExGeneralBase - 2;

/** Failed to parse argument, e.g. the supplied argument does not have enough capacity */
const TInt KErrRegExFailedToParseArg						= KErrRegExGeneralBase - 3;

/** The modifiable descriptor supplied to receive output does not have a large enough maximum length */
const TInt KErrRegExOutputTooBig							= KErrRegExGeneralBase - 4;

/** Invalid rewrite pattern */
const TInt KErrRegExInvalidRewritePattern					= KErrRegExGeneralBase - 5;

/** Error with one of the backslash substitutions in the supplied rewrite string */
const TInt KErrRegExBadBackslashSubsitution					= KErrRegExGeneralBase - 6;

// Panic Codes
/** Literal descriptor holding the text "CRegEx". NOTE(review): presumably the panic category used by CRegEx::Panic() - confirm against the implementation. */
_LIT(KRegExPanic, "CRegEx");

/**
 * Internal CRegEx panic reasons (debug only).
 * The numeric value of each reason is spelled out so that a panic number
 * can be looked up directly in this list.
 */
enum TRegExPanic
	{
	EInvalidMatchResults		= 0,
	EUnexpectedRetValFromPcre	= 1,
	EVectorTooSmall				= 2,
	EInvalidNumArgs				= 3
	};

/**
 * Newlines are indicated by a single LF character.
 * The value 10 is the character code of LF.
 * @see KNewLineCr
 * @see KNewLineCrLf
 * @see KNewLineAnyCrLf
 * @see KNewLineAny 
 */
static const TInt KNewLineLf = 10;
/**
 * Newlines are indicated by a single CR character.
 * The value 13 is the character code of CR.
 * @see KNewLineLf
 * @see KNewLineCrLf
 * @see KNewLineAnyCrLf
 * @see KNewLineAny 
 */
static const TInt KNewLineCr = 13;

/**
 * Newlines are indicated by the two-character CRLF sequence.
 * The value 3338 is 0x0D0A, i.e. (KNewLineCr << 8) | KNewLineLf.
 * @see KNewLineCr
 * @see KNewLineLf 
 * @see KNewLineAnyCrLf
 * @see KNewLineAny
 */
static const TInt KNewLineCrLf = 3338;

/**
 * Newlines are indicated by any of the following:
 * - A single CR character.
 * - A single LF character.
 * - The two-character CRLF sequence. 
 * @see KNewLineLf
 * @see KNewLineCr
 * @see KNewLineCrLf 
 * @see KNewLineAny
 */
static const TInt KNewLineAnyCrLf = -2;

/**
 * Newlines are indicated by any Unicode sequence:
 * - A single CR character.
 * - A single LF character.
 * - The two-character CRLF sequence. 
 * - A single VT character (vertical tab, U+000B).
 * - A single FF character (formfeed, U+000C).
 * - A single NEL character (next line, U+0085).
 * - A single LS character (line separator, U+2028). 
 * - A single PS character (paragraph separator, U+2029).
 * The last two are recognized only in UTF-8 mode. 
 * @see KNewLineLf
 * @see KNewLineCr
 * @see KNewLineCrLf
 * @see KNewLineAnyCrLf
 */
static const TInt KNewLineAny = -1;

/**
 * Symbian C++ interface to the pcre regular-expression library. This class, its
 * supporting classes and most of the following documentation is largely based
 * on or taken from the C++ wrapper included with source distributions of PCRE
 * to which all credit should be given.
 * 
 * CRegEx supports
 * Perl-style regular expressions (with extensions like \d, \w, \s,
 * ...).
 *
 * NOTE: These following examples make liberal use of _L8() purely for clarity
 * and not because it is recommended. In fact, it is strongly discouraged
 * in favour of _LIT8() as per the standard Symbian coding conventions.
 * -----------------------------------------------------------------------
 * REGEXP SYNTAX:
 *
 * This module is part of the pcre library and hence supports its syntax
 * for regular expressions.
 *
 * The syntax is pretty similar to Perl's.  For those not familiar
 * with Perl's regular expressions, here are some examples of the most
 * commonly used extensions:
 *
 *   "hello (\\w+) world"  -- \w matches a "word" character
 *   "version (\\d+)"      -- \d matches a digit
 *   "hello\\s+world"      -- \s matches any whitespace character
 *   "\\b(\\w+)\\b"        -- \b matches empty string at a word boundary
 *   "(?i)hello"           -- (?i) turns on case-insensitive matching
 *
 * -----------------------------------------------------------------------
 * MATCHING INTERFACE:
 *
 * The FullMatchL() operation checks that supplied text matches a
 * supplied pattern exactly.
 *
 * Example: successful match
 * @code
 *    CRegEx* re = CRegEx::NewLC(_L8("h.*o"));
 *    re->FullMatchL(_L8("hello"));
 *    CleanupStack::PopAndDestroy(re);
 * @endcode
 * 
 * Example: unsuccessful match (requires full match):
 * @code
 *    CRegEx* re = CRegEx::NewLC(_L8("e"));
 *    !re->FullMatchL(_L8("hello"));
 *    CleanupStack::PopAndDestroy(re);
 * @endcode
 *
 * -----------------------------------------------------------------------
 * MATCHING WITH SUB-STRING EXTRACTION:
 *
 * You can supply extra pointer arguments to extract matched subpieces.
 *
 * Example: extracts "ruby" into "s" and 1234 into "i"
 * @code
 *    TInt i;
 *    TBuf<4> s;
 *    CRegEx* re = CRegEx::NewLC(_L8("(\\w+):(\\d+)"));
 *    re->FullMatchL(_L8("ruby:1234"), &s, &i);
 *    CleanupStack::PopAndDestroy(re);
 * @endcode
 *
 * Example: does not try to extract any extra sub-patterns
 * @code
 *    re->FullMatchL(_L8("ruby:1234"), &s);
 * @endcode
 * 
 * Example: does not try to extract into NULL
 * @code
 *    re->FullMatchL(_L8("ruby:1234"), NULL, &i);
 * @endcode
 * 
 * Example: integer overflow causes failure
 * @code
 *    !re->FullMatchL(_L8("ruby:1234567891234"), NULL, &i);
 * @endcode
 * 
 * Example: fails because there aren't enough sub-patterns:
 * @code
 *    TBuf<4> s;
 *    CRegEx* re = CRegEx::NewLC(_L8("\\w+:\\d+"));
 *    !re->FullMatchL(_L8("ruby:1234"), &s);
 *    CleanupStack::PopAndDestroy(re);
 * @endcode
 * 
 * Example: fails because string cannot be stored in integer
 * @code
 *    TInt i;
 *    CRegEx* re = CRegEx::NewLC(_L8("(.*)"));
 *    !re->FullMatchL(_L8("ruby"), &i);
 *    CleanupStack::PopAndDestroy(re);
 * @endcode 
 *
 * The provided pointer arguments can be pointers to any scalar numeric
 * type, or one of
 *    TDes8        (matched piece is copied to descriptor)
 *    TPtrC8	   (matched piece is pointed to by)
 *    T            (where "TBool T::ParseFrom(const TDesC8&)" exists)
 *    NULL         (the corresponding matched sub-pattern is not copied)
 *
 * CAVEAT: An optional sub-pattern that does not exist in the matched
 * string is assigned the empty string.  Therefore, the following will
 * return false (because the empty string is not a valid number):
 * @code
 *    TInt number;
 *    CRegEx* re = CRegEx::NewLC(_L8("[a-z]+(\\d+)?"));
 *    re->FullMatchL(_L8("abc"), &number);
 *    CleanupStack::PopAndDestroy(re); 
 * @endcode
 *
 * -----------------------------------------------------------------------
 * DO_MATCH
 *
 * The matching interface supports at most 4 arguments per call.
 * If you need more, consider using the more general interface
 * CRegEx::DoMatchL().
 *
 * -----------------------------------------------------------------------
 * PARTIAL MATCHES
 *
 * You can use the PartialMatchL() operation when you want the pattern
 * to match any substring of the text.
 *
 * Example: simple search for a string:
 * @code
 *    CRegEx* re = CRegEx::NewLC(_L8("ell"));
 *    re->PartialMatchL(_L8("hello"));
 *    CleanupStack::PopAndDestroy(re);
 * @endcode
 * 
 * Example: find first number in a string:
 * @code
 *    TInt number;
 *    CRegEx* re = CRegEx::NewLC(_L8("(\\d+)"));
 *    re->PartialMatchL(_L8("x*100 + 20"), &number);
 *    ASSERT(number == 100);
 *    CleanupStack::PopAndDestroy(re);
 * @endcode
 *
 * -----------------------------------------------------------------------
 * UTF-8 AND THE MATCHING INTERFACE:
 *
 * By default, pattern and text are plain text, one byte per character.
 * The UTF8 flag, passed to the constructor, causes both pattern
 * and string to be treated as UTF-8 text, still a byte stream but
 * potentially multiple bytes per character. In practice, the text
 * is likelier to be UTF-8 than the pattern, but the match returned
 * may depend on the UTF8 flag, so always use it when matching
 * UTF8 text.  E.g., "." will match one byte normally but with UTF8
 * set may match up to three bytes of a multi-byte character.
 *
 * Example:
 * @code
 *    TRegExOptions options;
 *    options.SetUtf8(ETrue);
 *    CRegEx* re = CRegEx::NewLC(utf8Pattern, options);
 *    re->FullMatchL(utf8String);
 *    CleanupStack::PopAndDestroy(re);
 * @endcode
 * NOTE: The UTF8 option is ignored if libpcre was not compiled with the
 *       SUPPORT_UTF8 macro.
 *
 * -----------------------------------------------------------------------
 * PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE
 *
 * SPCRE defines some modifiers to change the behavior of the regular
 * expression engine.
 * The C++ wrapper defines an auxiliary class, TRegExOptions, as a vehicle
 * to pass such modifiers to a CRegEx class.
 *
 * Currently, the following modifiers are supported
 *
 *    modifier              description               Perl corresponding
 *
 *    EPcreCaseless         case insensitive match    /i
 *    EPcreMultiline        multiple lines match      /m
 *    EPcreDotAll           dot matches newlines      /s
 *    EPcreDollarEndOnly    $ matches only at end     N/A
 *    EPcreExtra            strict escape parsing     N/A
 *    EPcreExtended         ignore whitespaces        /x
 *    EPcreUtf8             handles UTF8 chars        built-in
 *    EPcreUngreedy         reverses * and *?         N/A
 *    EPcreNoAutoCapture    disables matching parens  N/A (*)
 *
 * (For a full account on how each modifier works, please check the
 * PCRE API reference manual).
 *
 * (*) Both Perl and PCRE allow non matching parentheses by means of the
 * "?:" modifier within the pattern itself. e.g. (?:ab|cd) does not
 * capture, while (ab|cd) does.
 *
 * For each modifier, there are two member functions whose name is made
 * out of the modifier , without the "EPcre" prefix. For
 * instance, EPcreCaseless is handled by
 *    TBool Caseless(),
 * which returns ETrue if the modifier is set, and
 *    TRegExOptions SetCaseless(TBool),
 * which sets or unsets the modifier.
 *
 * Moreover, PCRE_EXTRA_MATCH_LIMIT can be accessed through the
 * SetMatchLimit() and MatchLimit() member functions.
 * Setting the match limit to a non-zero value will limit the execution of
 * SPCRE to keep it from doing bad things like blowing the stack or taking
 * an eternity to return a result.  A value of 5000 is good enough to stop
 * stack blowup in a 2MB thread stack.  Setting MatchLimit to zero will
 * disable match limiting.  Alternately, you can set MatchLimitRecursion()
 * which uses PCRE_EXTRA_MATCH_LIMIT_RECURSION to limit how much SPCRE
 * recurses.  MatchLimit() caps the number of matches pcre does;
 * MatchLimitRecursion() caps the depth of recursion.
 *
 * Normally, to pass one or more modifiers to a CRegEx class, you declare
 * a TRegExOptions object, set the appropriate options, and pass this
 * object to a CRegEx constructor. Example:
 *
 * @code
 *    TRegExOptions opt;
 *    opt.SetCaseless(ETrue);
 *    CRegEx* re = CRegEx::NewLC(_L8("HELLO"), opt);
 *    if(re->PartialMatchL(_L8("hello world"))) ...
 * @endcode
 *
 * -----------------------------------------------------------------------
 * SCANNING TEXT INCREMENTALLY
 *
 * The ConsumeL() operation may be useful if you want to repeatedly
 * match regular expressions at the front of a string and skip over
 * them as they match.  The text is passed as a modifiable 8-bit
 * descriptor (TDes8) so that each successful call can advance it
 * past the matched text.
 *
 * Example: read lines of the form "var = value" from a string.
 * @code
 *    TBuf8<KContentLength> contents;
 *    // fill contents somehow
 *    TBuf8<KMaxVarLength> var;
 *    TInt value;
 *    CRegEx* re = CRegEx::NewLC(_L8("(\\w+) = (\\d+)\n"));
 *    while(re->ConsumeL(contents, &var, &value))
 *    	{
 *    	...
 *    	}
 *    CleanupStack::PopAndDestroy(re);
 * @endcode
 * 
 * Each successful call to ConsumeL will set "var/value", and also
 * advance "contents" so it points past the matched text.
 *
 * The FindAndConsumeL() operation is similar to ConsumeL() but does not
 * anchor your match at the beginning of the string.  For example, you
 * could extract all words from a string by repeatedly calling
 * @code
 *    TBuf8<KContentLength> contents;
 *    // fill contents somehow
 *    TBuf8<KMaxWordLength> word;
 *    TInt value;
 *    CRegEx* re = CRegEx::NewLC(_L8("(\\w+)"));
 *    while(re->FindAndConsumeL(contents, &word))
 *    	{
 *    	...
 *    	}
 *    CleanupStack::PopAndDestroy(re);
 * @endcode
 * -----------------------------------------------------------------------
 * PARSING HEX/OCTAL NUMBERS
 *
 * By default, if you pass a pointer to a numeric value, the
 * corresponding text is interpreted as a base-10 number.  You can
 * instead wrap the pointer with a call to one of the operators Hex(),
 * or Octal() to interpret the text in another base.  
 *
 * Example:
 * @code
 *    TInt a, b;
 *    CRegEx* re = CRegEx::NewLC(_L8("(.*) (.*)"));
 *    re->FullMatchL(_L8("100 40"), Octal(&a), Hex(&b));
 *    CleanupStack::PopAndDestroy(re);
 * @endcode
 * will leave 64 in a and b.
 *
 * -----------------------------------------------------------------------
 * REPLACING PARTS OF STRINGS
 *
 * You can replace the first match of aPattern in aString with
 * aRewrite.  Within aRewrite, backslash-escaped digits (\1 to \9)
 * can be used to insert text matching corresponding parenthesized
 * group from the pattern.  \0 in aRewrite refers to the entire
 * matching text.  E.g.,
 * @code
 *    _LIT8(KYabbaDabbaDoo, "yabba dabba doo"); 
 *    TBuf8<20> s(KYabbaDabbaDoo);
 *    CRegEx* re = CRegEx::NewLC(_L8("b+"));
 *    re->ReplaceL(_L8("d"), s);
 *    CleanupStack::PopAndDestroy(re);
 * @endcode
 * will leave "s" containing "yada dabba doo".  The result is ETrue if
 * the pattern matches and a replacement occurs, or EFalse otherwise.
 *
 * GlobalReplaceL() is like ReplaceL(), except that it replaces all
 * occurrences of the pattern in the string with the rewrite.
 * Replacements are not subject to re-matching.  E.g.,
 * @code
 *    _LIT8(KYabbaDabbaDoo, "yabba dabba doo"); 
 *    TBuf8<20> s(KYabbaDabbaDoo);
 *    CRegEx* re = CRegEx::NewLC(_L8("b+"));
 *    re->GlobalReplaceL(_L8("d"), s);
 *    CleanupStack::PopAndDestroy(re);
 * @endcode
 * will leave "s" containing "yada dada doo".  It returns the number
 * of replacements made.
 *
 * ExtractL() is like ReplaceL(), except that if the pattern matches,
 * aRewrite is copied into aOut (an additional argument) with
 * substitutions.  The non-matching portions of aText are ignored.
 * Returns ETrue if a match occurred and the extraction happened
 * successfully.  If no match occurs, the string is left unaffected.
 */
class CRegEx : public CBase
	{
public:
	// Type of match (TODO: Should be restructured as part of TRegExOptions)
	enum TAnchor
		{
		EUnanchored,		/**< No anchoring */
		EAnchorStart,		/**< Anchor at start only */
		EAnchorBoth			/**< Anchor at start and end */
		};
	
public:
	// Construction. aPattern is the regular expression text and aOptions the
	// modifiers to apply to it (see TRegExOptions).
	// NOTE(review): the NewLC() variants presumably leave the new object on
	// the cleanup stack, per the usual Symbian convention - confirm in the .cpp.
	IMPORT_C static CRegEx* NewL(const TDesC8& aPattern);
	IMPORT_C static CRegEx* NewL(const TDesC8& aPattern, const TRegExOptions& aOptions);
	IMPORT_C static CRegEx* NewLC(const TDesC8& aPattern);
	IMPORT_C static CRegEx* NewLC(const TDesC8& aPattern, const TRegExOptions& aOptions);

	// 16-bit pattern variant. Note that there is no matching NewLC() overload.
	IMPORT_C static CRegEx* NewL(const TDesC16& aPattern, const TRegExOptions& aOptions);
	
	IMPORT_C ~CRegEx();
	
	// Inline accessors; their definitions are in cregex.inl, included at the
	// end of this header.
	inline const TDesC8& Pattern() const;
	inline TInt Error() const;

	// FullMatchL(): checks that aText matches the pattern exactly.
	// Up to four TRegExArg arguments may be supplied to receive the matched
	// sub-patterns; use DoMatchL() if more are needed.
	IMPORT_C TBool FullMatchL(const TDesC8& aText) const;
	
	IMPORT_C TBool FullMatchL(const TDesC8& aText,
			  const TRegExArg& aArg1) const;
	
	IMPORT_C TBool FullMatchL(const TDesC8& aText,
			  const TRegExArg& aArg1,
			  const TRegExArg& aArg2) const;
	
	IMPORT_C TBool FullMatchL(const TDesC8& aText,
			  const TRegExArg& aArg1,
			  const TRegExArg& aArg2,
			  const TRegExArg& aArg3) const;
	
	IMPORT_C TBool FullMatchL(const TDesC8& aText,
			  const TRegExArg& aArg1,
			  const TRegExArg& aArg2,
			  const TRegExArg& aArg3,
			  const TRegExArg& aArg4) const;	
	
	// PartialMatchL(): like FullMatchL(), but the pattern may match any
	// substring of aText.
	IMPORT_C TBool PartialMatchL(const TDesC8& aText) const;
	
	IMPORT_C TBool PartialMatchL(const TDesC8& aText,
			  const TRegExArg& aArg1) const;
	
	IMPORT_C TBool PartialMatchL(const TDesC8& aText,
			  const TRegExArg& aArg1,
			  const TRegExArg& aArg2) const;
	
	IMPORT_C TBool PartialMatchL(const TDesC8& aText,
			  const TRegExArg& aArg1,
			  const TRegExArg& aArg2,
			  const TRegExArg& aArg3) const;
	
	IMPORT_C TBool PartialMatchL(const TDesC8& aText,
			  const TRegExArg& aArg1,
			  const TRegExArg& aArg2,
			  const TRegExArg& aArg3,
			  const TRegExArg& aArg4) const;
	
	// DoMatchL(): the more general matching interface. The caller chooses the
	// anchoring explicitly through aAnchor, and the last overload accepts an
	// arbitrary number of arguments in an RPointerArray.
	// NOTE(review): aConsumed presumably receives the number of bytes of
	// aText consumed by the match - confirm against the implementation.
	IMPORT_C TBool DoMatchL(const TDesC8& aText,
			  TAnchor aAnchor,
			  TInt&	aConsumed) const;
	
	IMPORT_C TBool DoMatchL(const TDesC8& aText,
			  TAnchor aAnchor,
			  TInt&	aConsumed,
			  const TRegExArg& aArg1) const;
	
	IMPORT_C TBool DoMatchL(const TDesC8& aText,
			  TAnchor aAnchor,
			  TInt&	aConsumed,
			  const TRegExArg& aArg1,
			  const TRegExArg& aArg2) const;
	
	IMPORT_C TBool DoMatchL(const TDesC8& aText,
			  TAnchor aAnchor,
			  TInt&	aConsumed,
			  const TRegExArg& aArg1,
			  const TRegExArg& aArg2,
			  const TRegExArg& aArg3) const;
	
	IMPORT_C TBool DoMatchL(const TDesC8& aText,
			  TAnchor aAnchor,
			  TInt&	aConsumed,
			  const TRegExArg& aArg1,
			  const TRegExArg& aArg2,
			  const TRegExArg& aArg3,
			  const TRegExArg& aArg4) const;
	
	IMPORT_C TBool DoMatchL(const TDesC8& aText,
			  TAnchor aAnchor,
			  TInt&	aConsumed,			
			  const RPointerArray<const TRegExArg>& aArgs) const;
	
	
	// ConsumeL(): matches the pattern at the front of aText and, on success,
	// advances aText past the matched text.
	IMPORT_C TBool ConsumeL(TDes8& aText) const;
	
	IMPORT_C TBool ConsumeL(TDes8& aText,
			  const TRegExArg& aArg1) const;
	
	IMPORT_C TBool ConsumeL(TDes8& aText,
			  const TRegExArg& aArg1,
			  const TRegExArg& aArg2) const;
	
	IMPORT_C TBool ConsumeL(TDes8& aText,
			  const TRegExArg& aArg1,
			  const TRegExArg& aArg2,
			  const TRegExArg& aArg3) const;
	
	IMPORT_C TBool ConsumeL(TDes8& aText,
			  const TRegExArg& aArg1,
			  const TRegExArg& aArg2,
			  const TRegExArg& aArg3,
			  const TRegExArg& aArg4) const;
	
	// FindAndConsumeL(): like ConsumeL(), but the match is not anchored at
	// the beginning of aText.
	IMPORT_C TBool FindAndConsumeL(TDes8& aText) const;
	
	IMPORT_C TBool FindAndConsumeL(TDes8& aText,
			  const TRegExArg& aArg1) const;
	
	IMPORT_C TBool FindAndConsumeL(TDes8& aText,
			  const TRegExArg& aArg1,
			  const TRegExArg& aArg2) const;
	
	IMPORT_C TBool FindAndConsumeL(TDes8& aText,
			  const TRegExArg& aArg1,
			  const TRegExArg& aArg2,
			  const TRegExArg& aArg3) const;
	
	IMPORT_C TBool FindAndConsumeL(TDes8& aText,
			  const TRegExArg& aArg1,
			  const TRegExArg& aArg2,
			  const TRegExArg& aArg3,
			  const TRegExArg& aArg4) const;
	
	// Replaces the first match of the pattern in aString with aRewrite.
	// Returns ETrue if the pattern matched and a replacement occurred.
	IMPORT_C TBool ReplaceL(const TDesC8& aRewrite, TDes8& aString) const;
	
	// Replaces every occurrence of the pattern in aString with aRewrite.
	// Returns the number of replacements made.
	IMPORT_C TInt GlobalReplaceL(const TDesC8& aRewrite, TDes8& aString) const;
	
	// If the pattern matches aText, copies aRewrite into aOut with
	// substitutions applied. Returns ETrue if a match occurred and the
	// extraction succeeded.
	IMPORT_C TBool ExtractL(const TDesC8& aRewrite, const TDesC8& aText, TDes8& aOut) const;

	// NOTE(review): presumably returns one of the KNewLine* constants for the
	// given option bits - confirm against the implementation.
	IMPORT_C static TInt NewlineMode(TInt aOptions);
	
	// NOTE(review): presumably returns a heap copy of aUnquoted with regex
	// metacharacters escaped, owned by the caller - confirm against the
	// implementation.
	IMPORT_C static HBufC8* QuoteMetaL(const TDesC8& aUnquoted);

	IMPORT_C TInt NumberOfCapturingGroups() const;
	
	// NOTE(review): presumably populates iExtraPartial (study data for
	// iRePartial) - confirm against the implementation.
	IMPORT_C void Study();	

private:
	CRegEx();
	CRegEx(const TRegExOptions& aOptions);
	
	// Second-phase construction, one overload per pattern width.
	void ConstructL(const TDesC8& aPattern);
	void ConstructL(const TDesC16& aPattern);
	void CommonConstructL();
	
	pcre* CompileL(TAnchor anchor);
	void Cleanup();

	TInt TryMatch(const TDesC8& aText,
			   TInt aStartPos,
			   TAnchor aAnchor,
			   TInt* aVector,
			   TInt aVectorSize) const;
	

	TBool Rewrite(TDes8& aOut,
			   const TDesC8& aRewrite,
			   const TDesC8& aText,
			   TInt* aVector,
			   TInt aVectorSize,
			   TInt aMatches) const;
	
	TBool DoMatchImpl(const TDesC8& aText,
				   TAnchor aAnchor,
				   TInt& aConsumed,
				   const RPointerArray<const TRegExArg>& aArgs,
				   TInt* aVector,
				   TInt aVectorSize) const;
	

	static void Panic(TRegExPanic aPanic);

	mutable TInt					iErrorCode;		// Error code for the last unsuccessful operation.
	TInt							iErrorOffset;	// Offset in pattern where error was detected
	HBufC8*							iPattern;		// Regular expression pattern
	TRegExOptions					iOptions;		// Options used to compile RE pattern.
	pcre*							iReFull;		// For full matches
	pcre*							iRePartial;		// For partial matches
	pcre_extra*						iExtraPartial;	// Study Data for iRePartial
	TRegExArg*						iNoArg;			// Default argument
	};
	
#include <cregex.inl>
#endif /* CREGEX_H_ */
// author	Tom Sutcliffe <thomas.sutcliffe@accenture.com>
// 	Mon, 12 Jul 2010 15:23:23 +0100
// changeset 9	257450419d10
// parent 0	7f656887cf89
// permissions	-rw-r--r--