src/corelib/tools/qregexp.cpp
changeset 0 1918ee327afb
child 3 41300fa6a67c
equal deleted inserted replaced
-1:000000000000 0:1918ee327afb
       
     1 /****************************************************************************
       
     2 **
       
     3 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
       
     4 ** All rights reserved.
       
     5 ** Contact: Nokia Corporation (qt-info@nokia.com)
       
     6 **
       
     7 ** This file is part of the QtCore module of the Qt Toolkit.
       
     8 **
       
     9 ** $QT_BEGIN_LICENSE:LGPL$
       
    10 ** No Commercial Usage
       
    11 ** This file contains pre-release code and may not be distributed.
       
    12 ** You may use this file in accordance with the terms and conditions
       
    13 ** contained in the Technology Preview License Agreement accompanying
       
    14 ** this package.
       
    15 **
       
    16 ** GNU Lesser General Public License Usage
       
    17 ** Alternatively, this file may be used under the terms of the GNU Lesser
       
    18 ** General Public License version 2.1 as published by the Free Software
       
    19 ** Foundation and appearing in the file LICENSE.LGPL included in the
       
    20 ** packaging of this file.  Please review the following information to
       
    21 ** ensure the GNU Lesser General Public License version 2.1 requirements
       
    22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
       
    23 **
       
    24 ** In addition, as a special exception, Nokia gives you certain additional
       
    25 ** rights.  These rights are described in the Nokia Qt LGPL Exception
       
    26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
       
    27 **
       
    28 ** If you have questions regarding the use of this file, please contact
       
    29 ** Nokia at qt-info@nokia.com.
       
    30 **
       
    31 **
       
    32 **
       
    33 **
       
    34 **
       
    35 **
       
    36 **
       
    37 **
       
    38 ** $QT_END_LICENSE$
       
    39 **
       
    40 ****************************************************************************/
       
    41 
       
    42 #include "qregexp.h"
       
    43 
       
    44 #include "qalgorithms.h"
       
    45 #include "qbitarray.h"
       
    46 #include "qcache.h"
       
    47 #include "qdatastream.h"
       
    48 #include "qlist.h"
       
    49 #include "qmap.h"
       
    50 #include "qmutex.h"
       
    51 #include "qstring.h"
       
    52 #include "qstringlist.h"
       
    53 #include "qstringmatcher.h"
       
    54 #include "qvector.h"
       
    55 #include "private/qfunctions_p.h"
       
    56 
       
    57 #include <limits.h>
       
    58 
       
    59 QT_BEGIN_NAMESPACE
       
    60 
       
    61 int qFindString(const QChar *haystack, int haystackLen, int from,
       
    62     const QChar *needle, int needleLen, Qt::CaseSensitivity cs);
       
    63 
       
    64 // error strings for the regexp parser
       
    65 #define RXERR_OK         QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
       
    66 #define RXERR_DISABLED   QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
       
    67 #define RXERR_CHARCLASS  QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
       
    68 #define RXERR_LOOKAHEAD  QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
       
    69 #define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
       
    70 #define RXERR_OCTAL      QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
       
    71 #define RXERR_LEFTDELIM  QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
       
    72 #define RXERR_END        QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
       
    73 #define RXERR_LIMIT      QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
       
    74 #define RXERR_INTERVAL   QT_TRANSLATE_NOOP("QRegExp", "invalid interval")
       
    75 #define RXERR_CATEGORY   QT_TRANSLATE_NOOP("QRegExp", "invalid category")
       
    76 
       
    77 /*
       
    78   WARNING! Be sure to read qregexp.tex before modifying this file.
       
    79 */
       
    80 
       
    81 /*!
       
    82     \class QRegExp
       
    83     \reentrant
       
    84     \brief The QRegExp class provides pattern matching using regular expressions.
       
    85 
       
    86     \ingroup tools
       
    87     \ingroup shared
       
    88 
       
    89     \keyword regular expression
       
    90 
       
    91     A regular expression, or "regexp", is a pattern for matching
       
    92     substrings in a text. This is useful in many contexts, e.g.,
       
    93 
       
    94     \table
       
    95     \row \i Validation
       
    96          \i A regexp can test whether a substring meets some criteria,
       
    97          e.g. is an integer or contains no whitespace.
       
    98     \row \i Searching
       
    99          \i A regexp provides more powerful pattern matching than
       
   100          simple substring matching, e.g., match one of the words
       
   101          \e{mail}, \e{letter} or \e{correspondence}, but none of the
       
   102          words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.
       
   103      \row \i Search and Replace
       
   104          \i A regexp can replace all occurrences of a substring with a
       
   105          different substring, e.g., replace all occurrences of \e{&}
       
   106          with \e{\&amp;} except where the \e{&} is already followed by
       
   107          an \e{amp;}.
       
   108     \row \i String Splitting
       
   109          \i A regexp can be used to identify where a string should be
       
   110          split apart, e.g. splitting tab-delimited strings.
       
   111     \endtable
       
   112 
       
   113     A brief introduction to regexps is presented, a description of
       
   114     Qt's regexp language, some examples, and the function
       
   115     documentation itself. QRegExp is modeled on Perl's regexp
       
   116     language. It fully supports Unicode. QRegExp can also be used in a
       
   117     simpler, \e{wildcard mode} that is similar to the functionality
       
   118     found in command shells. The syntax rules used by QRegExp can be
       
   119     changed with setPatternSyntax(). In particular, the pattern syntax
       
   120     can be set to QRegExp::FixedString, which means the pattern to be
       
   121     matched is interpreted as a plain string, i.e., special characters
       
   122     (e.g., backslash) are not escaped.
       
   123 
       
   124     A good text on regexps is \e {Mastering Regular Expressions}
       
   125     (Third Edition) by Jeffrey E. F.  Friedl, ISBN 0-596-52812-4.
       
   126 
       
   127     \tableofcontents
       
   128 
       
   129     \section1 Introduction
       
   130 
       
   131     Regexps are built up from expressions, quantifiers, and
       
   132     assertions. The simplest expression is a character, e.g. \bold{x}
       
   133     or \bold{5}. An expression can also be a set of characters
       
   134     enclosed in square brackets. \bold{[ABCD]} will match an \bold{A}
       
   135     or a \bold{B} or a \bold{C} or a \bold{D}. We can write this same
       
   136     expression as \bold{[A-D]}, and an experession to match any
       
   137     captital letter in the English alphabet is written as
       
   138     \bold{[A-Z]}.
       
   139 
       
   140     A quantifier specifies the number of occurrences of an expression
       
   141     that must be matched. \bold{x{1,1}} means match one and only one
       
   142     \bold{x}. \bold{x{1,5}} means match a sequence of \bold{x}
       
   143     characters that contains at least one \bold{x} but no more than
       
   144     five.
       
   145 
       
   146     Note that in general regexps cannot be used to check for balanced
       
   147     brackets or tags. For example, a regexp can be written to match an
       
   148     opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags
       
   149     are not nested, but if the \c{<b>} tags are nested, that same
       
   150     regexp will match an opening \c{<b>} tag with the wrong closing
       
   151     \c{</b>}.  For the fragment \c{<b>bold <b>bolder</b></b>}, the
       
   152     first \c{<b>} would be matched with the first \c{</b>}, which is
       
   153     not correct. However, it is possible to write a regexp that will
       
   154     match nested brackets or tags correctly, but only if the number of
       
   155     nesting levels is fixed and known. If the number of nesting levels
       
   156     is not fixed and known, it is impossible to write a regexp that
       
   157     will not fail.
       
   158 
       
   159     Suppose we want a regexp to match integers in the range 0 to 99.
       
   160     At least one digit is required, so we start with the expression
       
   161     \bold{[0-9]{1,1}}, which matches a single digit exactly once. This
       
   162     regexp matches integers in the range 0 to 9. To match integers up
       
   163     to 99, increase the maximum number of occurrences to 2, so the
       
   164     regexp becomes \bold{[0-9]{1,2}}. This regexp satisfies the
       
   165     original requirement to match integers from 0 to 99, but it will
       
   166     also match integers that occur in the middle of strings. If we
       
   167     want the matched integer to be the whole string, we must use the
       
   168     anchor assertions, \bold{^} (caret) and \bold{$} (dollar). When
       
   169     \bold{^} is the first character in a regexp, it means the regexp
       
   170     must match from the beginning of the string. When \bold{$} is the
       
   171     last character of the regexp, it means the regexp must match to
       
   172     the end of the string. The regexp becomes \bold{^[0-9]{1,2}$}.
       
   173     Note that assertions, e.g. \bold{^} and \bold{$}, do not match
       
   174     characters but locations in the string.
       
   175 
       
   176     If you have seen regexps described elsewhere, they may have looked
       
   177     different from the ones shown here. This is because some sets of
       
   178     characters and some quantifiers are so common that they have been
       
   179     given special symbols to represent them. \bold{[0-9]} can be
       
   180     replaced with the symbol \bold{\\d}. The quantifier to match
       
   181     exactly one occurrence, \bold{{1,1}}, can be replaced with the
       
   182     expression itself, i.e. \bold{x{1,1}} is the same as \bold{x}. So
       
   183     our 0 to 99 matcher could be written as \bold{^\\d{1,2}$}. It can
       
   184     also be written \bold{^\\d\\d{0,1}$}, i.e. \e{From the start of
       
   185     the string, match a digit, followed immediately by 0 or 1 digits}.
       
   186     In practice, it would be written as \bold{^\\d\\d?$}. The \bold{?}
       
   187     is shorthand for the quantifier \bold{{0,1}}, i.e. 0 or 1
       
   188     occurrences. \bold{?} makes an expression optional. The regexp
       
   189     \bold{^\\d\\d?$} means \e{From the beginning of the string, match
       
   190     one digit, followed immediately by 0 or 1 more digit, followed
       
   191     immediately by end of string}.
       
   192 
       
   193     To write a regexp that matches one of the words 'mail' \e or
       
   194     'letter' \e or 'correspondence' but does not match words that
       
   195     contain these words, e.g., 'email', 'mailman', 'mailer', and
       
   196     'letterbox', start with a regexp that matches 'mail'. Expressed
       
   197     fully, the regexp is \bold{m{1,1}a{1,1}i{1,1}l{1,1}}, but because
       
   198     a character expression is automatically quantified by
       
   199     \bold{{1,1}}, we can simplify the regexp to \bold{mail}, i.e., an
       
   200     'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now
       
   201     we can use the vertical bar \bold{|}, which means \bold{or}, to
       
   202     include the other two words, so our regexp for matching any of the
       
   203     three words becomes \bold{mail|letter|correspondence}. Match
       
   204     'mail' \bold{or} 'letter' \bold{or} 'correspondence'. While this
       
   205     regexp will match one of the three words we want to match, it will
       
   206     also match words we don't want to match, e.g., 'email'.  To
       
   207     prevent the regexp from matching unwanted words, we must tell it
       
   208     to begin and end the match at word boundaries. First we enclose
       
   209     our regexp in parentheses, \bold{(mail|letter|correspondence)}.
       
   210     Parentheses group expressions together, and they identify a part
       
   211     of the regexp that we wish to \l{capturing text}{capture}.
       
   212     Enclosing the expression in parentheses allows us to use it as a
       
   213     component in more complex regexps. It also allows us to examine
       
   214     which of the three words was actually matched. To force the match
       
   215     to begin and end on word boundaries, we enclose the regexp in
       
   216     \bold{\\b} \e{word boundary} assertions:
       
   217     \bold{\\b(mail|letter|correspondence)\\b}.  Now the regexp means:
       
   218     \e{Match a word boundary, followed by the regexp in parentheses,
       
   219     followed by a word boundary}. The \bold{\\b} assertion matches a
       
   220     \e position in the regexp, not a \e character. A word boundary is
       
   221     any non-word character, e.g., a space, newline, or the beginning
       
   222     or ending of a string.
       
   223 
       
   224     If we want to replace ampersand characters with the HTML entity
       
   225     \bold{\&amp;}, the regexp to match is simply \bold{\&}. But this
       
   226     regexp will also match ampersands that have already been converted
       
   227     to HTML entities. We want to replace only ampersands that are not
       
   228     already followed by \bold{amp;}. For this, we need the negative
       
   229     lookahead assertion, \bold{(?!}__\bold{)}. The regexp can then be
       
   230     written as \bold{\&(?!amp;)}, i.e. \e{Match an ampersand that is}
       
   231     \bold{not} \e{followed by} \bold{amp;}.
       
   232 
       
   233     If we want to count all the occurrences of 'Eric' and 'Eirik' in a
       
   234     string, two valid solutions are \bold{\\b(Eric|Eirik)\\b} and
       
   235     \bold{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is
       
   236     required to avoid matching words that contain either name,
       
   237     e.g. 'Ericsson'. Note that the second regexp matches more
       
   238     spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.
       
   239 
       
   240     Some of the examples discussed above are implemented in the
       
   241     \link #code-examples code examples \endlink section.
       
   242 
       
   243     \target characters-and-abbreviations-for-sets-of-characters
       
   244     \section1 Characters and Abbreviations for Sets of Characters
       
   245 
       
   246     \table
       
   247     \header \i Element \i Meaning
       
   248     \row \i \bold{c}
       
   249          \i A character represents itself unless it has a special
       
   250          regexp meaning. e.g. \bold{c} matches the character \e c.
       
   251     \row \i \bold{\\c}
       
   252          \i A character that follows a backslash matches the character
       
   253          itself, except as specified below. e.g., To match a literal
       
   254          caret at the beginning of a string, write \bold{\\^}.
       
   255     \row \i \bold{\\a}
       
   256          \i Matches the ASCII bell (BEL, 0x07).
       
   257     \row \i \bold{\\f}
       
   258          \i Matches the ASCII form feed (FF, 0x0C).
       
   259     \row \i \bold{\\n}
       
   260          \i Matches the ASCII line feed (LF, 0x0A, Unix newline).
       
   261     \row \i \bold{\\r}
       
   262          \i Matches the ASCII carriage return (CR, 0x0D).
       
   263     \row \i \bold{\\t}
       
   264          \i Matches the ASCII horizontal tab (HT, 0x09).
       
   265     \row \i \bold{\\v}
       
   266          \i Matches the ASCII vertical tab (VT, 0x0B).
       
   267     \row \i \bold{\\x\e{hhhh}}
       
   268          \i Matches the Unicode character corresponding to the
       
   269          hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).
       
   270     \row \i \bold{\\0\e{ooo}} (i.e., \\zero \e{ooo})
       
   271          \i matches the ASCII/Latin1 character for the octal number
       
   272          \e{ooo} (between 0 and 0377).
       
   273     \row \i \bold{. (dot)}
       
   274          \i Matches any character (including newline).
       
   275     \row \i \bold{\\d}
       
   276          \i Matches a digit (QChar::isDigit()).
       
   277     \row \i \bold{\\D}
       
   278          \i Matches a non-digit.
       
   279     \row \i \bold{\\s}
       
   280          \i Matches a whitespace character (QChar::isSpace()).
       
   281     \row \i \bold{\\S}
       
   282          \i Matches a non-whitespace character.
       
   283     \row \i \bold{\\w}
       
   284          \i Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').
       
   285     \row \i \bold{\\W}
       
   286          \i Matches a non-word character.
       
   287     \row \i \bold{\\\e{n}}
       
   288          \i The \e{n}-th \l backreference, e.g. \\1, \\2, etc.
       
   289     \endtable
       
   290 
       
   291     \bold{Note:} The C++ compiler transforms backslashes in strings.
       
   292     To include a \bold{\\} in a regexp, enter it twice, i.e. \c{\\}.
       
   293     To match the backslash character itself, enter it four times, i.e.
       
   294     \c{\\\\}.
       
   295 
       
   296     \target sets-of-characters
       
   297     \section1 Sets of Characters
       
   298 
       
   299     Square brackets mean match any character contained in the square
       
   300     brackets. The character set abbreviations described above can
       
   301     appear in a character set in square brackets. Except for the
       
   302     character set abbreviations and the following two exceptions, 
       
   303     characters do not have special meanings in square brackets.
       
   304 
       
   305     \table
       
   306     \row \i \bold{^}
       
   307 
       
   308          \i The caret negates the character set if it occurs as the
       
   309          first character (i.e. immediately after the opening square
       
   310          bracket). \bold{[abc]} matches 'a' or 'b' or 'c', but
       
   311          \bold{[^abc]} matches anything \e but 'a' or 'b' or 'c'.
       
   312 
       
   313     \row \i \bold{-}
       
   314 
       
   315          \i The dash indicates a range of characters. \bold{[W-Z]}
       
   316          matches 'W' or 'X' or 'Y' or 'Z'.
       
   317 
       
   318     \endtable
       
   319 
       
   320     Using the predefined character set abbreviations is more portable
       
   321     than using character ranges across platforms and languages. For
       
   322     example, \bold{[0-9]} matches a digit in Western alphabets but
       
   323     \bold{\\d} matches a digit in \e any alphabet.
       
   324 
       
   325     Note: In other regexp documentation, sets of characters are often
       
   326     called "character classes".
       
   327 
       
   328     \target quantifiers
       
   329     \section1 Quantifiers
       
   330 
       
   331     By default, an expression is automatically quantified by
       
   332     \bold{{1,1}}, i.e. it should occur exactly once. In the following
       
   333     list, \bold{\e {E}} stands for expression. An expression is a
       
   334     character, or an abbreviation for a set of characters, or a set of
       
   335     characters in square brackets, or an expression in parentheses.
       
   336 
       
   337     \table
       
   338     \row \i \bold{\e {E}?}
       
   339 
       
   340          \i Matches zero or one occurrences of \e E. This quantifier
       
   341          means \e{The previous expression is optional}, because it
       
   342          will match whether or not the expression is found. \bold{\e
       
   343          {E}?} is the same as \bold{\e {E}{0,1}}. e.g., \bold{dents?}
       
   344          matches 'dent' or 'dents'.
       
   345 
       
   346     \row \i \bold{\e {E}+}
       
   347 
       
   348          \i Matches one or more occurrences of \e E. \bold{\e {E}+} is
       
   349          the same as \bold{\e {E}{1,}}. e.g., \bold{0+} matches '0',
       
   350          '00', '000', etc.
       
   351 
       
   352     \row \i \bold{\e {E}*}
       
   353 
       
   354          \i Matches zero or more occurrences of \e E. It is the same
       
   355          as \bold{\e {E}{0,}}. The \bold{*} quantifier is often used
       
   356          in error where \bold{+} should be used. For example, if
       
   357          \bold{\\s*$} is used in an expression to match strings that
       
   358          end in whitespace, it will match every string because
       
   359          \bold{\\s*$} means \e{Match zero or more whitespaces followed
       
   360          by end of string}. The correct regexp to match strings that
       
   361          have at least one trailing whitespace character is
       
   362          \bold{\\s+$}.
       
   363 
       
   364     \row \i \bold{\e {E}{n}}
       
   365 
       
   366          \i Matches exactly \e n occurrences of \e E. \bold{\e {E}{n}}
       
   367          is the same as repeating \e E \e n times. For example,
       
   368          \bold{x{5}} is the same as \bold{xxxxx}. It is also the same
       
   369          as \bold{\e {E}{n,n}}, e.g. \bold{x{5,5}}.
       
   370 
       
   371     \row \i \bold{\e {E}{n,}}
       
   372          \i Matches at least \e n occurrences of \e E.
       
   373 
       
   374     \row \i \bold{\e {E}{,m}}
       
   375          \i Matches at most \e m occurrences of \e E. \bold{\e {E}{,m}}
       
   376          is the same as \bold{\e {E}{0,m}}.
       
   377 
       
   378     \row \i \bold{\e {E}{n,m}}
       
   379          \i Matches at least \e n and at most \e m occurrences of \e E.
       
   380     \endtable
       
   381 
       
   382     To apply a quantifier to more than just the preceding character,
       
   383     use parentheses to group characters together in an expression. For
       
   384     example, \bold{tag+} matches a 't' followed by an 'a' followed by
       
   385     at least one 'g', whereas \bold{(tag)+} matches at least one
       
   386     occurrence of 'tag'.
       
   387 
       
   388     Note: Quantifiers are normally "greedy". They always match as much
       
   389     text as they can. For example, \bold{0+} matches the first zero it
       
   390     finds and all the consecutive zeros after the first zero. Applied
       
   391     to '20005', it matches'2\underline{000}5'. Quantifiers can be made
       
   392     non-greedy, see setMinimal().
       
   393 
       
   394     \target capturing parentheses
       
   395     \target backreferences
       
   396     \section1 Capturing Text
       
   397 
       
   398     Parentheses allow us to group elements together so that we can
       
   399     quantify and capture them. For example if we have the expression
       
   400     \bold{mail|letter|correspondence} that matches a string we know
       
   401     that \e one of the words matched but not which one. Using
       
   402     parentheses allows us to "capture" whatever is matched within
       
   403     their bounds, so if we used \bold{(mail|letter|correspondence)}
       
   404     and matched this regexp against the string "I sent you some email"
       
   405     we can use the cap() or capturedTexts() functions to extract the
       
   406     matched characters, in this case 'mail'.
       
   407 
       
   408     We can use captured text within the regexp itself. To refer to the
       
   409     captured text we use \e backreferences which are indexed from 1,
       
   410     the same as for cap(). For example we could search for duplicate
       
   411     words in a string using \bold{\\b(\\w+)\\W+\\1\\b} which means match a
       
   412     word boundary followed by one or more word characters followed by
       
   413     one or more non-word characters followed by the same text as the
       
   414     first parenthesized expression followed by a word boundary.
       
   415 
       
   416     If we want to use parentheses purely for grouping and not for
       
   417     capturing we can use the non-capturing syntax, e.g.
       
   418     \bold{(?:green|blue)}. Non-capturing parentheses begin '(?:' and
       
   419     end ')'. In this example we match either 'green' or 'blue' but we
       
   420     do not capture the match so we only know whether or not we matched
       
   421     but not which color we actually found. Using non-capturing
       
   422     parentheses is more efficient than using capturing parentheses
       
   423     since the regexp engine has to do less book-keeping.
       
   424 
       
   425     Both capturing and non-capturing parentheses may be nested.
       
   426 
       
   427     \target greedy quantifiers
       
   428 
       
   429     For historical reasons, quantifiers (e.g. \bold{*}) that apply to
       
   430     capturing parentheses are more "greedy" than other quantifiers.
       
   431     For example, \bold{a*(a)*} will match "aaa" with cap(1) == "aaa".
       
   432     This behavior is different from what other regexp engines do
       
   433     (notably, Perl). To obtain a more intuitive capturing behavior,
       
   434     specify QRegExp::RegExp2 to the QRegExp constructor or call
       
   435     setPatternSyntax(QRegExp::RegExp2).
       
   436 
       
   437     \target cap_in_a_loop
       
   438 
       
   439     When the number of matches cannot be determined in advance, a
       
   440     common idiom is to use cap() in a loop. For example:
       
   441 
       
   442     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 0
       
   443 
       
   444     \target assertions
       
   445     \section1 Assertions
       
   446 
       
   447     Assertions make some statement about the text at the point where
       
   448     they occur in the regexp but they do not match any characters. In
       
   449     the following list \bold{\e {E}} stands for any expression.
       
   450 
       
   451     \table
       
   452     \row \i \bold{^}
       
   453          \i The caret signifies the beginning of the string. If you
       
   454          wish to match a literal \c{^} you must escape it by
       
   455          writing \c{\\^}. For example, \bold{^#include} will only
       
   456          match strings which \e begin with the characters '#include'.
       
   457          (When the caret is the first character of a character set it
       
   458          has a special meaning, see \link #sets-of-characters Sets of
       
   459          Characters \endlink.)
       
   460 
       
   461     \row \i \bold{$}
       
   462          \i The dollar signifies the end of the string. For example
       
   463          \bold{\\d\\s*$} will match strings which end with a digit
       
   464          optionally followed by whitespace. If you wish to match a
       
   465          literal \c{$} you must escape it by writing
       
   466          \c{\\$}.
       
   467 
       
   468     \row \i \bold{\\b}
       
   469          \i A word boundary. For example the regexp
       
   470          \bold{\\bOK\\b} means match immediately after a word
       
   471          boundary (e.g. start of string or whitespace) the letter 'O'
       
   472          then the letter 'K' immediately before another word boundary
       
   473          (e.g. end of string or whitespace). But note that the
       
   474          assertion does not actually match any whitespace so if we
       
   475          write \bold{(\\bOK\\b)} and we have a match it will only
       
   476          contain 'OK' even if the string is "It's \underline{OK} now".
       
   477 
       
   478     \row \i \bold{\\B}
       
   479          \i A non-word boundary. This assertion is true wherever
       
   480          \bold{\\b} is false. For example if we searched for
       
   481          \bold{\\Bon\\B} in "Left on" the match would fail (space
       
   482          and end of string aren't non-word boundaries), but it would
       
   483          match in "t\underline{on}ne".
       
   484 
       
   485     \row \i \bold{(?=\e E)}
       
   486          \i Positive lookahead. This assertion is true if the
       
   487          expression matches at this point in the regexp. For example,
       
   488          \bold{const(?=\\s+char)} matches 'const' whenever it is
       
   489          followed by 'char', as in 'static \underline{const} char *'.
       
   490          (Compare with \bold{const\\s+char}, which matches 'static
       
   491          \underline{const char} *'.)
       
   492 
       
   493     \row \i \bold{(?!\e E)}
       
   494          \i Negative lookahead. This assertion is true if the
       
   495          expression does not match at this point in the regexp. For
       
   496          example, \bold{const(?!\\s+char)} matches 'const' \e except
       
   497          when it is followed by 'char'.
       
   498     \endtable
       
   499 
       
   500     \keyword QRegExp wildcard matching
       
   501     \section1 Wildcard Matching
       
   502 
       
   503     Most command shells such as \e bash or \e cmd.exe support "file
       
   504     globbing", the ability to identify a group of files by using
       
   505     wildcards. The setPatternSyntax() function is used to switch
       
   506     between regexp and wildcard mode. Wildcard matching is much
       
   507     simpler than full regexps and has only four features:
       
   508 
       
   509     \table
       
   510     \row \i \bold{c}
       
   511          \i Any character represents itself apart from those mentioned
       
   512          below. Thus \bold{c} matches the character \e c.
       
   513     \row \i \bold{?}
       
   514          \i Matches any single character. It is the same as
       
   515          \bold{.} in full regexps.
       
   516     \row \i \bold{*}
       
   517          \i Matches zero or more of any characters. It is the
       
   518          same as \bold{.*} in full regexps.
       
   519     \row \i \bold{[...]}
       
   520          \i Sets of characters can be represented in square brackets,
       
   521          similar to full regexps. Within the character class, like
       
   522          outside, backslash has no special meaning.
       
   523     \endtable
       
   524 
       
   525     In the mode Wildcard, the wildcard characters cannot be
       
   526     escaped. In the mode WildcardUnix, the character '\' escapes the
       
   527     wildcard.
       
   528 
       
   529     For example if we are in wildcard mode and have strings which
       
   530     contain filenames we could identify HTML files with \bold{*.html}.
       
   531     This will match zero or more characters followed by a dot followed
       
   532     by 'h', 't', 'm' and 'l'.
       
   533 
       
   534     To test a string against a wildcard expression, use exactMatch().
       
   535     For example:
       
   536 
       
   537     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 1
       
   538 
       
   539     \target perl-users
       
   540     \section1 Notes for Perl Users
       
   541 
       
   542     Most of the character class abbreviations supported by Perl are
       
   543     supported by QRegExp, see \link
       
   544     #characters-and-abbreviations-for-sets-of-characters characters
       
   545     and abbreviations for sets of characters \endlink.
       
   546 
       
   547     In QRegExp, apart from within character classes, \c{^} always
       
   548     signifies the start of the string, so carets must always be
       
   549     escaped unless used for that purpose. In Perl the meaning of caret
       
   550     varies automagically depending on where it occurs so escaping it
       
   551     is rarely necessary. The same applies to \c{$} which in
       
   552     QRegExp always signifies the end of the string.
       
   553 
       
   554     QRegExp's quantifiers are the same as Perl's greedy quantifiers
       
   555     (but see the \l{greedy quantifiers}{note above}). Non-greedy
       
   556     matching cannot be applied to individual quantifiers, but can be
       
   557     applied to all the quantifiers in the pattern. For example, to
       
   558     match the Perl regexp \bold{ro+?m} requires:
       
   559 
       
   560     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 2
       
   561 
       
   562     The equivalent of Perl's \c{/i} option is
       
   563     setCaseSensitivity(Qt::CaseInsensitive).
       
   564 
       
   565     Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.
       
   566 
       
   567     In QRegExp \bold{.} matches any character, therefore all QRegExp
       
   568     regexps have the equivalent of Perl's \c{/s} option. QRegExp
       
   569     does not have an equivalent to Perl's \c{/m} option, but this
       
   570     can be emulated in various ways for example by splitting the input
       
   571     into lines or by looping with a regexp that searches for newlines.
       
   572 
       
   573     Because QRegExp is string oriented, there are no \\A, \\Z, or \\z
       
   574     assertions. The \\G assertion is not supported but can be emulated
       
   575     in a loop.
       
   576 
       
   577     Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
       
   578     equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
       
   579     ... correspond to cap(1) or capturedTexts()[1], cap(2) or
       
   580     capturedTexts()[2], etc.
       
   581 
       
   582     To substitute a pattern use QString::replace().
       
   583 
       
   584     Perl's extended \c{/x} syntax is not supported, nor are
       
   585     directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
       
   586     the other hand, C++'s rules for literal strings can be used to
       
   587     achieve the same:
       
   588 
       
   589     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 3
       
   590 
       
   591     Both zero-width positive and zero-width negative lookahead
       
   592     assertions (?=pattern) and (?!pattern) are supported with the same
       
   593     syntax as Perl. Perl's lookbehind assertions, "independent"
       
   594     subexpressions and conditional expressions are not supported.
       
   595 
       
   596     Non-capturing parentheses are also supported, with the same
       
   597     (?:pattern) syntax.
       
   598 
       
   599     See QString::split() and QStringList::join() for equivalents
       
   600     to Perl's split and join functions.
       
   601 
       
   602     Note: because C++ transforms \\'s they must be written \e twice in
       
   603     code, e.g. \bold{\\b} must be written \bold{\\\\b}.
       
   604 
       
   605     \target code-examples
       
   606     \section1 Code Examples
       
   607 
       
   608     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 4
       
   609 
       
   610     The third string matches '\underline{6}'. This is a simple validation
       
   611     regexp for integers in the range 0 to 99.
       
   612 
       
   613     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 5
       
   614 
       
   615     The second string matches '\underline{This_is-OK}'. We've used the
       
   616     character set abbreviation '\\S' (non-whitespace) and the anchors
       
   617     to match strings which contain no whitespace.
       
   618 
       
   619     In the following example we match strings containing 'mail' or
       
   620     'letter' or 'correspondence' but only match whole words i.e. not
       
   621     'email'
       
   622 
       
   623     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 6
       
   624 
       
   625     The second string matches "Please write the \underline{letter}". The
       
   626     word 'letter' is also captured (because of the parentheses). We
       
   627     can see what text we've captured like this:
       
   628 
       
   629     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 7
       
   630 
       
   631     This will capture the text from the first set of capturing
       
   632     parentheses (counting capturing left parentheses from left to
       
   633     right). The parentheses are counted from 1 since cap(0) is the
       
   634     whole matched regexp (equivalent to '&' in most regexp engines).
       
   635 
       
   636     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 8
       
   637 
       
   638     Here we've passed the QRegExp to QString's replace() function to
       
   639     replace the matched text with new text.
       
   640 
       
   641     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 9
       
   642 
       
   643     We've used the indexIn() function to repeatedly match the regexp in
       
   644     the string. Note that instead of moving forward by one character
       
   645     at a time \c pos++ we could have written \c {pos +=
       
   646     rx.matchedLength()} to skip over the already matched string. The
       
   647     count will equal 3, matching 'One \underline{Eric} another
       
   648     \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it
       
   649     doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
       
   650     by non-word boundaries.
       
   651 
       
   652     One common use of regexps is to split lines of delimited data into
       
   653     their component fields.
       
   654 
       
   655     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 10
       
   656 
       
   657     In this example our input lines have the format company name, web
       
   658     address and country. Unfortunately the regexp is rather long and
       
   659     not very versatile -- the code will break if we add any more
       
   660     fields. A simpler and better solution is to look for the
       
   661     separator, '\\t' in this case, and take the surrounding text. The
       
   662     QString::split() function can take a separator string or regexp
       
   663     as an argument and split a string accordingly.
       
   664 
       
   665     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 11
       
   666 
       
   667     Here field[0] is the company, field[1] the web address and so on.
       
   668 
       
   669     To imitate the matching of a shell we can use wildcard mode.
       
   670 
       
   671     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 12
       
   672 
       
   673     Wildcard matching can be convenient because of its simplicity, but
       
   674     any wildcard regexp can be defined using full regexps, e.g.
       
   675     \bold{.*\.html$}. Notice that we can't match both \c .html and \c
       
   676     .htm files with a wildcard unless we use \bold{*.htm*} which will
       
   677     also match 'test.html.bak'. A full regexp gives us the precision
       
   678     we need, \bold{.*\\.html?$}.
       
   679 
       
   680     QRegExp can match case insensitively using setCaseSensitivity(),
       
   681     and can use non-greedy matching, see setMinimal(). By
       
   682     default QRegExp uses full regexps but this can be changed with
       
   683     setWildcard(). Searching can be forward with indexIn() or backward
       
   684     with lastIndexIn(). Captured text can be accessed using
       
   685     capturedTexts() which returns a string list of all captured
       
   686     strings, or using cap() which returns the captured string for the
       
   687     given index. The pos() function takes a match index and returns
       
   688     the position in the string where the match was made (or -1 if
       
   689     there was no match).
       
   690 
       
   691     \sa QString, QStringList, QRegExpValidator, QSortFilterProxyModel,
       
   692         {tools/regexp}{Regular Expression Example}
       
   693 */
       
   694 
       
   695 #if defined(Q_OS_VXWORKS) && defined(EOS)
       
   696 #  undef EOS
       
   697 #endif
       
   698 
       
   699 const int NumBadChars = 64;
       
   700 #define BadChar(ch) ((ch).unicode() % NumBadChars)
       
   701 
       
   702 const int NoOccurrence = INT_MAX;
       
   703 const int EmptyCapture = INT_MAX;
       
   704 const int InftyLen = INT_MAX;
       
   705 const int InftyRep = 1025;
       
   706 const int EOS = -1;
       
   707 
       
   708 static bool isWord(QChar ch)
       
   709 {
       
   710     return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_');
       
   711 }
       
   712 
       
   713 /*
       
   714   Merges two vectors of ints and puts the result into the first
       
   715   one.
       
   716 */
       
   717 static void mergeInto(QVector<int> *a, const QVector<int> &b)
       
   718 {
       
   719     int asize = a->size();
       
   720     int bsize = b.size();
       
   721     if (asize == 0) {
       
   722         *a = b;
       
   723 #ifndef QT_NO_REGEXP_OPTIM
       
   724     } else if (bsize == 1 && a->at(asize - 1) < b.at(0)) {
       
   725         a->resize(asize + 1);
       
   726         (*a)[asize] = b.at(0);
       
   727 #endif
       
   728     } else if (bsize >= 1) {
       
   729         int csize = asize + bsize;
       
   730         QVector<int> c(csize);
       
   731         int i = 0, j = 0, k = 0;
       
   732         while (i < asize) {
       
   733             if (j < bsize) {
       
   734                 if (a->at(i) == b.at(j)) {
       
   735                     ++i;
       
   736                     --csize;
       
   737                 } else if (a->at(i) < b.at(j)) {
       
   738                     c[k++] = a->at(i++);
       
   739                 } else {
       
   740                     c[k++] = b.at(j++);
       
   741                 }
       
   742             } else {
       
   743                 memcpy(c.data() + k, a->constData() + i, (asize - i) * sizeof(int));
       
   744                 break;
       
   745             }
       
   746         }
       
   747         c.resize(csize);
       
   748         if (j < bsize)
       
   749             memcpy(c.data() + k, b.constData() + j, (bsize - j) * sizeof(int));
       
   750         *a = c;
       
   751     }
       
   752 }
       
   753 
       
   754 #ifndef QT_NO_REGEXP_WILDCARD
       
   755 /*
       
   756   Translates a wildcard pattern to an equivalent regular expression
       
   757   pattern (e.g., *.cpp to .*\.cpp).
       
   758 
       
   759   If enableEscaping is true, it is possible to escape the wildcard
       
   760   characters with \
       
   761 */
       
   762 static QString wc2rx(const QString &wc_str, const bool enableEscaping)
       
   763 {
       
   764     const int wclen = wc_str.length();
       
   765     QString rx;
       
   766     int i = 0;
       
   767     bool isEscaping = false; // the previous character is '\'
       
   768     const QChar *wc = wc_str.unicode();
       
   769 
       
   770     while (i < wclen) {
       
   771         const QChar c = wc[i++];
       
   772         switch (c.unicode()) {
       
   773         case '\\':
       
   774             if (enableEscaping) {
       
   775                 if (isEscaping) {
       
   776                     rx += QLatin1String("\\\\");
       
   777                 } // we insert the \\ later if necessary
       
   778                 if (i+1 == wclen) { // the end
       
   779                     rx += QLatin1String("\\\\");
       
   780                 }
       
   781             } else {
       
   782                 rx += QLatin1String("\\\\");
       
   783             }
       
   784             isEscaping = true;
       
   785             break;
       
   786         case '*':
       
   787             if (isEscaping) {
       
   788                 rx += QLatin1String("\\*");
       
   789                 isEscaping = false;
       
   790             } else {
       
   791                 rx += QLatin1String(".*");
       
   792             }
       
   793             break;
       
   794         case '?':
       
   795             if (isEscaping) {
       
   796                 rx += QLatin1String("\\?");
       
   797                 isEscaping = false;
       
   798             } else {
       
   799                 rx += QLatin1Char('.');
       
   800             }
       
   801 
       
   802             break;
       
   803         case '$':
       
   804         case '(':
       
   805         case ')':
       
   806         case '+':
       
   807         case '.':
       
   808         case '^':
       
   809         case '{':
       
   810         case '|':
       
   811         case '}':
       
   812             if (isEscaping) {
       
   813                 isEscaping = false;
       
   814                 rx += QLatin1String("\\\\");
       
   815             }
       
   816             rx += QLatin1Char('\\');
       
   817             rx += c;
       
   818             break;
       
   819          case '[':
       
   820             if (isEscaping) {
       
   821                 isEscaping = false;
       
   822                 rx += QLatin1String("\\[");
       
   823             } else {
       
   824                 rx += c;
       
   825                 if (wc[i] == QLatin1Char('^'))
       
   826                     rx += wc[i++];
       
   827                 if (i < wclen) {
       
   828                     if (rx[i] == QLatin1Char(']'))
       
   829                         rx += wc[i++];
       
   830                     while (i < wclen && wc[i] != QLatin1Char(']')) {
       
   831                         if (wc[i] == QLatin1Char('\\'))
       
   832                             rx += QLatin1Char('\\');
       
   833                         rx += wc[i++];
       
   834                     }
       
   835                 }
       
   836             }
       
   837              break;
       
   838 
       
   839         case ']':
       
   840             if(isEscaping){
       
   841                 isEscaping = false;
       
   842                 rx += QLatin1String("\\");
       
   843             }
       
   844             rx += c;
       
   845             break;
       
   846 
       
   847         default:
       
   848             if(isEscaping){
       
   849                 isEscaping = false;
       
   850                 rx += QLatin1String("\\\\");
       
   851             }
       
   852             rx += c;
       
   853         }
       
   854     }
       
   855     return rx;
       
   856 }
       
   857 #endif
       
   858 
       
   859 static int caretIndex(int offset, QRegExp::CaretMode caretMode)
       
   860 {
       
   861     if (caretMode == QRegExp::CaretAtZero) {
       
   862         return 0;
       
   863     } else if (caretMode == QRegExp::CaretAtOffset) {
       
   864         return offset;
       
   865     } else { // QRegExp::CaretWontMatch
       
   866         return -1;
       
   867     }
       
   868 }
       
   869 
       
   870 /*
       
   871     The QRegExpEngineKey struct uniquely identifies an engine.
       
   872 */
       
   873 struct QRegExpEngineKey
       
   874 {
       
   875     QString pattern;
       
   876     QRegExp::PatternSyntax patternSyntax;
       
   877     Qt::CaseSensitivity cs;
       
   878 
       
   879     inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,
       
   880                             Qt::CaseSensitivity cs)
       
   881         : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {}
       
   882 
       
   883     inline void clear() {
       
   884         pattern.clear();
       
   885         patternSyntax = QRegExp::RegExp;
       
   886         cs = Qt::CaseSensitive;
       
   887     }
       
   888 };
       
   889 
       
   890 Q_STATIC_GLOBAL_OPERATOR bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)
       
   891 {
       
   892     return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax
       
   893            && key1.cs == key2.cs;
       
   894 }
       
   895 
       
   896 class QRegExpEngine;
       
   897 
       
   898 //Q_DECLARE_TYPEINFO(QVector<int>, Q_MOVABLE_TYPE);
       
   899 
       
   900 /*
       
   901   This is the engine state during matching.
       
   902 */
       
   903 struct QRegExpMatchState
       
   904 {
       
   905     const QChar *in; // a pointer to the input string data
       
   906     int pos; // the current position in the string
       
   907     int caretPos;
       
   908     int len; // the length of the input string
       
   909     bool minimal; // minimal matching?
       
   910     int *bigArray; // big array holding the data for the next pointers
       
   911     int *inNextStack; // is state is nextStack?
       
   912     int *curStack; // stack of current states
       
   913     int *nextStack; // stack of next states
       
   914     int *curCapBegin; // start of current states' captures
       
   915     int *nextCapBegin; // start of next states' captures
       
   916     int *curCapEnd; // end of current states' captures
       
   917     int *nextCapEnd; // end of next states' captures
       
   918     int *tempCapBegin; // start of temporary captures
       
   919     int *tempCapEnd; // end of temporary captures
       
   920     int *capBegin; // start of captures for a next state
       
   921     int *capEnd; // end of captures for a next state
       
   922     int *slideTab; // bump-along slide table for bad-character heuristic
       
   923     int *captured; // what match() returned last
       
   924     int slideTabSize; // size of slide table
       
   925     int capturedSize;
       
   926 #ifndef QT_NO_REGEXP_BACKREF
       
   927     QList<QVector<int> > sleeping; // list of back-reference sleepers
       
   928 #endif
       
   929     int matchLen; // length of match
       
   930     int oneTestMatchedLen; // length of partial match
       
   931 
       
   932     const QRegExpEngine *eng;
       
   933 
       
   934     inline QRegExpMatchState() : bigArray(0), captured(0) {}
       
   935     inline ~QRegExpMatchState() { free(bigArray); }
       
   936 
       
   937     void drain() { free(bigArray); bigArray = 0; captured = 0; } // to save memory
       
   938     void prepareForMatch(QRegExpEngine *eng);
       
   939     void match(const QChar *str, int len, int pos, bool minimal,
       
   940         bool oneTest, int caretIndex);
       
   941     bool matchHere();
       
   942     bool testAnchor(int i, int a, const int *capBegin);
       
   943 };
       
   944 
       
   945 /*
       
   946   The struct QRegExpAutomatonState represents one state in a modified NFA. The
       
   947   input characters matched are stored in the state instead of on
       
   948   the transitions, something possible for an automaton
       
   949   constructed from a regular expression.
       
   950 */
       
   951 struct QRegExpAutomatonState
       
   952 {
       
   953 #ifndef QT_NO_REGEXP_CAPTURE
       
   954     int atom; // which atom does this state belong to?
       
   955 #endif
       
   956     int match; // what does it match? (see CharClassBit and BackRefBit)
       
   957     QVector<int> outs; // out-transitions
       
   958     QMap<int, int> reenter; // atoms reentered when transiting out
       
   959     QMap<int, int> anchors; // anchors met when transiting out
       
   960 
       
   961     inline QRegExpAutomatonState() { }
       
   962 #ifndef QT_NO_REGEXP_CAPTURE
       
   963     inline QRegExpAutomatonState(int a, int m)
       
   964         : atom(a), match(m) { }
       
   965 #else
       
   966     inline QRegExpAutomatonState(int m)
       
   967         : match(m) { }
       
   968 #endif
       
   969 };
       
   970 
       
   971 Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE);
       
   972 
       
   973 /*
       
   974   The struct QRegExpCharClassRange represents a range of characters (e.g.,
       
   975   [0-9] denotes range 48 to 57).
       
   976 */
       
   977 struct QRegExpCharClassRange
       
   978 {
       
   979     ushort from; // 48
       
   980     ushort len; // 10
       
   981 };
       
   982 
       
   983 Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
       
   984 
       
   985 #ifndef QT_NO_REGEXP_CAPTURE
       
   986 /*
       
   987   The struct QRegExpAtom represents one node in the hierarchy of regular
       
   988   expression atoms.
       
   989 */
       
   990 struct QRegExpAtom
       
   991 {
       
   992     enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 };
       
   993 
       
   994     int parent; // index of parent in array of atoms
       
   995     int capture; // index of capture, from 1 to ncap - 1
       
   996 };
       
   997 
       
   998 Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
       
   999 #endif
       
  1000 
       
  1001 struct QRegExpLookahead;
       
  1002 
       
  1003 #ifndef QT_NO_REGEXP_ANCHOR_ALT
       
  1004 /*
       
  1005   The struct QRegExpAnchorAlternation represents a pair of anchors with
       
  1006   OR semantics.
       
  1007 */
       
  1008 struct QRegExpAnchorAlternation
       
  1009 {
       
  1010     int a; // this anchor...
       
  1011     int b; // ...or this one
       
  1012 };
       
  1013 
       
  1014 Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
       
  1015 #endif
       
  1016 
       
  1017 #ifndef QT_NO_REGEXP_CCLASS
       
  1018 /*
       
  1019   The class QRegExpCharClass represents a set of characters, such as can
       
  1020   be found in regular expressions (e.g., [a-z] denotes the set
       
  1021   {a, b, ..., z}).
       
  1022 */
       
  1023 class QRegExpCharClass
       
  1024 {
       
  1025 public:
       
  1026     QRegExpCharClass();
       
  1027     inline QRegExpCharClass(const QRegExpCharClass &cc) { operator=(cc); }
       
  1028 
       
  1029     QRegExpCharClass &operator=(const QRegExpCharClass &cc);
       
  1030 
       
  1031     void clear();
       
  1032     bool negative() const { return n; }
       
  1033     void setNegative(bool negative);
       
  1034     void addCategories(int cats);
       
  1035     void addRange(ushort from, ushort to);
       
  1036     void addSingleton(ushort ch) { addRange(ch, ch); }
       
  1037 
       
  1038     bool in(QChar ch) const;
       
  1039 #ifndef QT_NO_REGEXP_OPTIM
       
  1040     const QVector<int> &firstOccurrence() const { return occ1; }
       
  1041 #endif
       
  1042 
       
  1043 #if defined(QT_DEBUG)
       
  1044     void dump() const;
       
  1045 #endif
       
  1046 
       
  1047 private:
       
  1048     int c; // character classes
       
  1049     QVector<QRegExpCharClassRange> r; // character ranges
       
  1050     bool n; // negative?
       
  1051 #ifndef QT_NO_REGEXP_OPTIM
       
  1052     QVector<int> occ1; // first-occurrence array
       
  1053 #endif
       
  1054 };
       
  1055 #else
       
  1056 struct QRegExpCharClass
       
  1057 {
       
  1058     int dummy;
       
  1059 
       
  1060 #ifndef QT_NO_REGEXP_OPTIM
       
  1061     QRegExpCharClass() { occ1.fill(0, NumBadChars); }
       
  1062 
       
  1063     const QVector<int> &firstOccurrence() const { return occ1; }
       
  1064     QVector<int> occ1;
       
  1065 #endif
       
  1066 };
       
  1067 #endif
       
  1068 
       
  1069 Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE);
       
  1070 
       
  1071 /*
       
  1072   The QRegExpEngine class encapsulates a modified nondeterministic
       
  1073   finite automaton (NFA).
       
  1074 */
       
  1075 class QRegExpEngine
       
  1076 {
       
  1077 public:
       
  1078     QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)
       
  1079         : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }
       
  1080 
       
  1081     QRegExpEngine(const QRegExpEngineKey &key);
       
  1082     ~QRegExpEngine();
       
  1083 
       
  1084     bool isValid() const { return valid; }
       
  1085     const QString &errorString() const { return yyError; }
       
  1086     int numCaptures() const { return officialncap; }
       
  1087 
       
  1088     int createState(QChar ch);
       
  1089     int createState(const QRegExpCharClass &cc);
       
  1090 #ifndef QT_NO_REGEXP_BACKREF
       
  1091     int createState(int bref);
       
  1092 #endif
       
  1093 
       
  1094     void addCatTransitions(const QVector<int> &from, const QVector<int> &to);
       
  1095 #ifndef QT_NO_REGEXP_CAPTURE
       
  1096     void addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom);
       
  1097 #endif
       
  1098 
       
  1099 #ifndef QT_NO_REGEXP_ANCHOR_ALT
       
  1100     int anchorAlternation(int a, int b);
       
  1101     int anchorConcatenation(int a, int b);
       
  1102 #else
       
  1103     int anchorAlternation(int a, int b) { return a & b; }
       
  1104     int anchorConcatenation(int a, int b) { return a | b; }
       
  1105 #endif
       
  1106     void addAnchors(int from, int to, int a);
       
  1107 
       
  1108 #ifndef QT_NO_REGEXP_OPTIM
       
  1109     void heuristicallyChooseHeuristic();
       
  1110 #endif
       
  1111 
       
  1112 #if defined(QT_DEBUG)
       
  1113     void dump() const;
       
  1114 #endif
       
  1115 
       
  1116     QAtomicInt ref;
       
  1117 
       
  1118 private:
       
  1119     enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
       
  1120     enum { InitialState = 0, FinalState = 1 };
       
  1121 
       
  1122     void setup();
       
  1123     int setupState(int match);
       
  1124 
       
  1125     /*
       
  1126       Let's hope that 13 lookaheads and 14 back-references are
       
  1127       enough.
       
  1128      */
       
  1129     enum { MaxLookaheads = 13, MaxBackRefs = 14 };
       
  1130     enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004,
       
  1131            Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010,
       
  1132            Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
       
  1133            Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
       
  1134            Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,
       
  1135 
       
  1136            Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^
       
  1137                    ((Anchor_FirstLookahead << MaxLookaheads) - 1) };
       
  1138 #ifndef QT_NO_REGEXP_CAPTURE
       
  1139     int startAtom(bool officialCapture);
       
  1140     void finishAtom(int atom, bool needCapture);
       
  1141 #endif
       
  1142 
       
  1143 #ifndef QT_NO_REGEXP_LOOKAHEAD
       
  1144     int addLookahead(QRegExpEngine *eng, bool negative);
       
  1145 #endif
       
  1146 
       
  1147 #ifndef QT_NO_REGEXP_OPTIM
       
  1148     bool goodStringMatch(QRegExpMatchState &matchState) const;
       
  1149     bool badCharMatch(QRegExpMatchState &matchState) const;
       
  1150 #else
       
  1151     bool bruteMatch(QRegExpMatchState &matchState) const;
       
  1152 #endif
       
  1153 
       
  1154     QVector<QRegExpAutomatonState> s; // array of states
       
  1155 #ifndef QT_NO_REGEXP_CAPTURE
       
  1156     QVector<QRegExpAtom> f; // atom hierarchy
       
  1157     int nf; // number of atoms
       
  1158     int cf; // current atom
       
  1159     QVector<int> captureForOfficialCapture;
       
  1160 #endif
       
  1161     int officialncap; // number of captures, seen from the outside
       
  1162     int ncap; // number of captures, seen from the inside
       
  1163 #ifndef QT_NO_REGEXP_CCLASS
       
  1164     QVector<QRegExpCharClass> cl; // array of character classes
       
  1165 #endif
       
  1166 #ifndef QT_NO_REGEXP_LOOKAHEAD
       
  1167     QVector<QRegExpLookahead *> ahead; // array of lookaheads
       
  1168 #endif
       
  1169 #ifndef QT_NO_REGEXP_ANCHOR_ALT
       
  1170     QVector<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors
       
  1171 #endif
       
  1172 #ifndef QT_NO_REGEXP_OPTIM
       
  1173     bool caretAnchored; // does the regexp start with ^?
       
  1174     bool trivial; // is the good-string all that needs to match?
       
  1175 #endif
       
  1176     bool valid; // is the regular expression valid?
       
  1177     Qt::CaseSensitivity cs; // case sensitive?
       
  1178     bool greedyQuantifiers; // RegExp2?
       
  1179     bool xmlSchemaExtensions;
       
  1180 #ifndef QT_NO_REGEXP_BACKREF
       
  1181     int nbrefs; // number of back-references
       
  1182 #endif
       
  1183 
       
  1184 #ifndef QT_NO_REGEXP_OPTIM
       
  1185     bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
       
  1186 
       
  1187     int goodEarlyStart; // the index where goodStr can first occur in a match
       
  1188     int goodLateStart; // the index where goodStr can last occur in a match
       
  1189     QString goodStr; // the string that any match has to contain
       
  1190 
       
  1191     int minl; // the minimum length of a match
       
  1192     QVector<int> occ1; // first-occurrence array
       
  1193 #endif
       
  1194 
       
  1195     /*
       
  1196       The class Box is an abstraction for a regular expression
       
  1197       fragment. It can also be seen as one node in the syntax tree of
       
  1198       a regular expression with synthetized attributes.
       
  1199 
       
  1200       Its interface is ugly for performance reasons.
       
  1201     */
       
  1202     class Box
       
  1203     {
       
  1204     public:
       
  1205         Box(QRegExpEngine *engine);
       
  1206         Box(const Box &b) { operator=(b); }
       
  1207 
       
  1208         Box &operator=(const Box &b);
       
  1209 
       
  1210         void clear() { operator=(Box(eng)); }
       
  1211         void set(QChar ch);
       
  1212         void set(const QRegExpCharClass &cc);
       
  1213 #ifndef QT_NO_REGEXP_BACKREF
       
  1214         void set(int bref);
       
  1215 #endif
       
  1216 
       
  1217         void cat(const Box &b);
       
  1218         void orx(const Box &b);
       
  1219         void plus(int atom);
       
  1220         void opt();
       
  1221         void catAnchor(int a);
       
  1222 #ifndef QT_NO_REGEXP_OPTIM
       
  1223         void setupHeuristics();
       
  1224 #endif
       
  1225 
       
  1226 #if defined(QT_DEBUG)
       
  1227         void dump() const;
       
  1228 #endif
       
  1229 
       
  1230     private:
       
  1231         void addAnchorsToEngine(const Box &to) const;
       
  1232 
       
  1233         QRegExpEngine *eng; // the automaton under construction
       
  1234         QVector<int> ls; // the left states (firstpos)
       
  1235         QVector<int> rs; // the right states (lastpos)
       
  1236         QMap<int, int> lanchors; // the left anchors
       
  1237         QMap<int, int> ranchors; // the right anchors
       
  1238         int skipanchors; // the anchors to match if the box is skipped
       
  1239 
       
  1240 #ifndef QT_NO_REGEXP_OPTIM
       
  1241         int earlyStart; // the index where str can first occur
       
  1242         int lateStart; // the index where str can last occur
       
  1243         QString str; // a string that has to occur in any match
       
  1244         QString leftStr; // a string occurring at the left of this box
       
  1245         QString rightStr; // a string occurring at the right of this box
       
  1246         int maxl; // the maximum length of this box (possibly InftyLen)
       
  1247 #endif
       
  1248 
       
  1249         int minl; // the minimum length of this box
       
  1250 #ifndef QT_NO_REGEXP_OPTIM
       
  1251         QVector<int> occ1; // first-occurrence array
       
  1252 #endif
       
  1253     };
       
  1254 
       
  1255     friend class Box;
       
  1256 
       
  1257     void setupCategoriesRangeMap();
       
  1258 
       
  1259     /*
       
  1260       This is the lexical analyzer for regular expressions.
       
  1261     */
       
  1262     enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,
       
  1263            Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,
       
  1264            Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
       
  1265     int getChar();
       
  1266     int getEscape();
       
  1267 #ifndef QT_NO_REGEXP_INTERVAL
       
  1268     int getRep(int def);
       
  1269 #endif
       
  1270 #ifndef QT_NO_REGEXP_LOOKAHEAD
       
  1271     void skipChars(int n);
       
  1272 #endif
       
  1273     void error(const char *msg);
       
  1274     void startTokenizer(const QChar *rx, int len);
       
  1275     int getToken();
       
  1276 
       
  1277     const QChar *yyIn; // a pointer to the input regular expression pattern
       
  1278     int yyPos0; // the position of yyTok in the input pattern
       
  1279     int yyPos; // the position of the next character to read
       
  1280     int yyLen; // the length of yyIn
       
  1281     int yyCh; // the last character read
       
  1282     QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens
       
  1283     int yyMinRep; // attribute for Tok_Quantifier
       
  1284     int yyMaxRep; // ditto
       
  1285     QString yyError; // syntax error or overflow during parsing?
       
  1286 
       
  1287     /*
       
  1288       This is the syntactic analyzer for regular expressions.
       
  1289     */
       
  1290     int parse(const QChar *rx, int len);
       
  1291     void parseAtom(Box *box);
       
  1292     void parseFactor(Box *box);
       
  1293     void parseTerm(Box *box);
       
  1294     void parseExpression(Box *box);
       
  1295 
       
  1296     int yyTok; // the last token read
       
  1297     bool yyMayCapture; // set this to false to disable capturing
       
  1298     QHash<QByteArray, QPair<int, int> > categoriesRangeMap; // fast lookup hash for xml schema extensions
       
  1299 
       
  1300     friend struct QRegExpMatchState;
       
  1301 };
       
  1302 
       
  1303 #ifndef QT_NO_REGEXP_LOOKAHEAD
       
  1304 /*
       
  1305   The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
       
  1306   (?=foo) and (?!bar)).
       
  1307 */
       
  1308 struct QRegExpLookahead
       
  1309 {
       
  1310     QRegExpEngine *eng; // NFA representing the embedded regular expression
       
  1311     bool neg; // negative lookahead?
       
  1312 
       
  1313     inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)
       
  1314         : eng(eng0), neg(neg0) { }
       
  1315     inline ~QRegExpLookahead() { delete eng; }
       
  1316 };
       
  1317 #endif
       
  1318 
       
  1319 /*! \internal
       
  1320     convert the pattern string to the RegExp syntax.
       
  1321 
       
  1322     This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan
       
  1323  */
       
  1324 Q_CORE_EXPORT QString qt_regexp_toCanonical(const QString &pattern, QRegExp::PatternSyntax patternSyntax)
       
  1325 {
       
  1326     switch (patternSyntax) {
       
  1327 #ifndef QT_NO_REGEXP_WILDCARD
       
  1328     case QRegExp::Wildcard:
       
  1329         return wc2rx(pattern, false);
       
  1330         break;
       
  1331     case QRegExp::WildcardUnix:
       
  1332         return wc2rx(pattern, true);
       
  1333         break;
       
  1334 #endif
       
  1335     case QRegExp::FixedString:
       
  1336         return QRegExp::escape(pattern);
       
  1337         break;
       
  1338     case QRegExp::W3CXmlSchema11:
       
  1339     default:
       
  1340         return pattern;
       
  1341     }
       
  1342 }
       
  1343 
       
  1344 QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)
       
  1345     : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2),
       
  1346       xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11)
       
  1347 {
       
  1348     setup();
       
  1349 
       
  1350     QString rx = qt_regexp_toCanonical(key.pattern, key.patternSyntax);
       
  1351 
       
  1352     valid = (parse(rx.unicode(), rx.length()) == rx.length());
       
  1353     if (!valid) {
       
  1354 #ifndef QT_NO_REGEXP_OPTIM
       
  1355         trivial = false;
       
  1356 #endif
       
  1357         error(RXERR_LEFTDELIM);
       
  1358     }
       
  1359 }
       
  1360 
       
  1361 QRegExpEngine::~QRegExpEngine()
       
  1362 {
       
  1363 #ifndef QT_NO_REGEXP_LOOKAHEAD
       
  1364     qDeleteAll(ahead);
       
  1365 #endif
       
  1366 }
       
  1367 
       
  1368 void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)
       
  1369 {
       
  1370     /*
       
  1371       We use one QVector<int> for all the big data used a lot in
       
  1372       matchHere() and friends.
       
  1373     */
       
  1374     int ns = eng->s.size(); // number of states
       
  1375     int ncap = eng->ncap;
       
  1376 #ifndef QT_NO_REGEXP_OPTIM
       
  1377     int newSlideTabSize = qMax(eng->minl + 1, 16);
       
  1378 #else
       
  1379     int newSlideTabSize = 0;
       
  1380 #endif
       
  1381     int numCaptures = eng->numCaptures();
       
  1382     int newCapturedSize = 2 + 2 * numCaptures;
       
  1383     bigArray = q_check_ptr((int *)realloc(bigArray, ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int)));
       
  1384 
       
  1385     // set all internal variables only _after_ bigArray is realloc'ed
       
  1386     // to prevent a broken regexp in oom case
       
  1387 
       
  1388     slideTabSize = newSlideTabSize;
       
  1389     capturedSize = newCapturedSize;
       
  1390     inNextStack = bigArray;
       
  1391     memset(inNextStack, -1, ns * sizeof(int));
       
  1392     curStack = inNextStack + ns;
       
  1393     nextStack = inNextStack + 2 * ns;
       
  1394 
       
  1395     curCapBegin = inNextStack + 3 * ns;
       
  1396     nextCapBegin = curCapBegin + ncap * ns;
       
  1397     curCapEnd = curCapBegin + 2 * ncap * ns;
       
  1398     nextCapEnd = curCapBegin + 3 * ncap * ns;
       
  1399 
       
  1400     tempCapBegin = curCapBegin + 4 * ncap * ns;
       
  1401     tempCapEnd = tempCapBegin + ncap;
       
  1402     capBegin = tempCapBegin + 2 * ncap;
       
  1403     capEnd = tempCapBegin + 3 * ncap;
       
  1404 
       
  1405     slideTab = tempCapBegin + 4 * ncap;
       
  1406     captured = slideTab + slideTabSize;
       
  1407     memset(captured, -1, capturedSize*sizeof(int));
       
  1408     this->eng = eng;
       
  1409 }
       
  1410 
       
  1411 /*
       
  1412   Tries to match in str and returns an array of (begin, length) pairs
       
  1413   for captured text. If there is no match, all pairs are (-1, -1).
       
  1414 */
       
  1415 void QRegExpMatchState::match(const QChar *str0, int len0, int pos0,
       
  1416     bool minimal0, bool oneTest, int caretIndex)
       
  1417 {
       
  1418     bool matched = false;
       
  1419     QChar char_null;
       
  1420 
       
  1421 #ifndef QT_NO_REGEXP_OPTIM
       
  1422     if (eng->trivial && !oneTest) {
       
  1423         pos = qFindString(str0, len0, pos0, eng->goodStr.unicode(), eng->goodStr.length(), eng->cs);
       
  1424         matchLen = eng->goodStr.length();
       
  1425         matched = (pos != -1);
       
  1426     } else
       
  1427 #endif
       
  1428     {
       
  1429         in = str0;
       
  1430         if (in == 0)
       
  1431             in = &char_null;
       
  1432         pos = pos0;
       
  1433         caretPos = caretIndex;
       
  1434         len = len0;
       
  1435         minimal = minimal0;
       
  1436         matchLen = 0;
       
  1437         oneTestMatchedLen = 0;
       
  1438 
       
  1439         if (eng->valid && pos >= 0 && pos <= len) {
       
  1440 #ifndef QT_NO_REGEXP_OPTIM
       
  1441             if (oneTest) {
       
  1442                 matched = matchHere();
       
  1443             } else {
       
  1444                 if (pos <= len - eng->minl) {
       
  1445                     if (eng->caretAnchored) {
       
  1446                         matched = matchHere();
       
  1447                     } else if (eng->useGoodStringHeuristic) {
       
  1448                         matched = eng->goodStringMatch(*this);
       
  1449                     } else {
       
  1450                         matched = eng->badCharMatch(*this);
       
  1451                     }
       
  1452                 }
       
  1453             }
       
  1454 #else
       
  1455             matched = oneTest ? matchHere() : eng->bruteMatch(*this);
       
  1456 #endif
       
  1457         }
       
  1458     }
       
  1459 
       
  1460     if (matched) {
       
  1461         int *c = captured;
       
  1462         *c++ = pos;
       
  1463         *c++ = matchLen;
       
  1464 
       
  1465         int numCaptures = (capturedSize - 2) >> 1;
       
  1466 #ifndef QT_NO_REGEXP_CAPTURE
       
  1467         for (int i = 0; i < numCaptures; ++i) {
       
  1468             int j = eng->captureForOfficialCapture.at(i);
       
  1469             int len = capEnd[j] - capBegin[j];
       
  1470             *c++ = (len > 0) ? pos + capBegin[j] : 0;
       
  1471             *c++ = len;
       
  1472         }
       
  1473 #endif
       
  1474     } else {
       
  1475         // we rely on 2's complement here
       
  1476         memset(captured, -1, capturedSize * sizeof(int));
       
  1477     }
       
  1478 }
       
  1479 
       
  1480 /*
       
  1481   The three following functions add one state to the automaton and
       
  1482   return the number of the state.
       
  1483 */
       
  1484 
       
  1485 int QRegExpEngine::createState(QChar ch)
       
  1486 {
       
  1487     return setupState(ch.unicode());
       
  1488 }
       
  1489 
       
  1490 int QRegExpEngine::createState(const QRegExpCharClass &cc)
       
  1491 {
       
  1492 #ifndef QT_NO_REGEXP_CCLASS
       
  1493     int n = cl.size();
       
  1494     cl += QRegExpCharClass(cc);
       
  1495     return setupState(CharClassBit | n);
       
  1496 #else
       
  1497     Q_UNUSED(cc);
       
  1498     return setupState(CharClassBit);
       
  1499 #endif
       
  1500 }
       
  1501 
       
  1502 #ifndef QT_NO_REGEXP_BACKREF
       
  1503 int QRegExpEngine::createState(int bref)
       
  1504 {
       
  1505     if (bref > nbrefs) {
       
  1506         nbrefs = bref;
       
  1507         if (nbrefs > MaxBackRefs) {
       
  1508             error(RXERR_LIMIT);
       
  1509             return 0;
       
  1510         }
       
  1511     }
       
  1512     return setupState(BackRefBit | bref);
       
  1513 }
       
  1514 #endif
       
  1515 
       
  1516 /*
       
  1517   The two following functions add a transition between all pairs of
       
  1518   states (i, j) where i is found in from, and j is found in to.
       
  1519 
       
  1520   Cat-transitions are distinguished from plus-transitions for
       
  1521   capturing.
       
  1522 */
       
  1523 
       
  1524 void QRegExpEngine::addCatTransitions(const QVector<int> &from, const QVector<int> &to)
       
  1525 {
       
  1526     for (int i = 0; i < from.size(); i++)
       
  1527         mergeInto(&s[from.at(i)].outs, to);
       
  1528 }
       
  1529 
       
  1530 #ifndef QT_NO_REGEXP_CAPTURE
       
  1531 void QRegExpEngine::addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom)
       
  1532 {
       
  1533     for (int i = 0; i < from.size(); i++) {
       
  1534         QRegExpAutomatonState &st = s[from.at(i)];
       
  1535         const QVector<int> oldOuts = st.outs;
       
  1536         mergeInto(&st.outs, to);
       
  1537         if (f.at(atom).capture != QRegExpAtom::NoCapture) {
       
  1538             for (int j = 0; j < to.size(); j++) {
       
  1539                 // ### st.reenter.contains(to.at(j)) check looks suspicious
       
  1540                 if (!st.reenter.contains(to.at(j)) &&
       
  1541                      qBinaryFind(oldOuts.constBegin(), oldOuts.constEnd(), to.at(j)) == oldOuts.end())
       
  1542                     st.reenter.insert(to.at(j), atom);
       
  1543             }
       
  1544         }
       
  1545     }
       
  1546 }
       
  1547 #endif
       
  1548 
       
  1549 #ifndef QT_NO_REGEXP_ANCHOR_ALT
       
  1550 /*
       
  1551   Returns an anchor that means a OR b.
       
  1552 */
       
  1553 int QRegExpEngine::anchorAlternation(int a, int b)
       
  1554 {
       
  1555     if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0)
       
  1556         return a & b;
       
  1557 
       
  1558     int n = aa.size();
       
  1559 #ifndef QT_NO_REGEXP_OPTIM
       
  1560     if (n > 0 && aa.at(n - 1).a == a && aa.at(n - 1).b == b)
       
  1561         return Anchor_Alternation | (n - 1);
       
  1562 #endif
       
  1563 
       
  1564     QRegExpAnchorAlternation element = {a, b};
       
  1565     aa.append(element);
       
  1566     return Anchor_Alternation | n;
       
  1567 }
       
  1568 
       
  1569 /*
       
  1570   Returns an anchor that means a AND b.
       
  1571 */
       
  1572 int QRegExpEngine::anchorConcatenation(int a, int b)
       
  1573 {
       
  1574     if (((a | b) & Anchor_Alternation) == 0)
       
  1575         return a | b;
       
  1576     if ((b & Anchor_Alternation) != 0)
       
  1577         qSwap(a, b);
       
  1578 
       
  1579     int aprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).a, b);
       
  1580     int bprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).b, b);
       
  1581     return anchorAlternation(aprime, bprime);
       
  1582 }
       
  1583 #endif
       
  1584 
       
  1585 /*
       
  1586   Adds anchor a on a transition caracterised by its from state and
       
  1587   its to state.
       
  1588 */
       
  1589 void QRegExpEngine::addAnchors(int from, int to, int a)
       
  1590 {
       
  1591     QRegExpAutomatonState &st = s[from];
       
  1592     if (st.anchors.contains(to))
       
  1593         a = anchorAlternation(st.anchors.value(to), a);
       
  1594     st.anchors.insert(to, a);
       
  1595 }
       
  1596 
       
  1597 #ifndef QT_NO_REGEXP_OPTIM
       
  1598 /*
       
  1599   This function chooses between the good-string and the bad-character
       
  1600   heuristics. It computes two scores and chooses the heuristic with
       
  1601   the highest score.
       
  1602 
       
  1603   Here are some common-sense constraints on the scores that should be
       
  1604   respected if the formulas are ever modified: (1) If goodStr is
       
  1605   empty, the good-string heuristic scores 0. (2) If the regular
       
  1606   expression is trivial, the good-string heuristic should be used.
       
  1607   (3) If the search is case insensitive, the good-string heuristic
       
  1608   should be used, unless it scores 0. (Case insensitivity turns all
       
  1609   entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
       
  1610   big, the good-string heuristic should score less.
       
  1611 */
       
  1612 void QRegExpEngine::heuristicallyChooseHeuristic()
       
  1613 {
       
  1614     if (minl == 0) {
       
  1615         useGoodStringHeuristic = false;
       
  1616     } else if (trivial) {
       
  1617         useGoodStringHeuristic = true;
       
  1618     } else {
       
  1619         /*
       
  1620           Magic formula: The good string has to constitute a good
       
  1621           proportion of the minimum-length string, and appear at a
       
  1622           more-or-less known index.
       
  1623         */
       
  1624         int goodStringScore = (64 * goodStr.length() / minl) -
       
  1625                               (goodLateStart - goodEarlyStart);
       
  1626         /*
       
  1627           Less magic formula: We pick some characters at random, and
       
  1628           check whether they are good or bad.
       
  1629         */
       
  1630         int badCharScore = 0;
       
  1631         int step = qMax(1, NumBadChars / 32);
       
  1632         for (int i = 1; i < NumBadChars; i += step) {
       
  1633             if (occ1.at(i) == NoOccurrence)
       
  1634                 badCharScore += minl;
       
  1635             else
       
  1636                 badCharScore += occ1.at(i);
       
  1637         }
       
  1638         badCharScore /= minl;
       
  1639         useGoodStringHeuristic = (goodStringScore > badCharScore);
       
  1640     }
       
  1641 }
       
  1642 #endif
       
  1643 
       
  1644 #if defined(QT_DEBUG)
       
  1645 void QRegExpEngine::dump() const
       
  1646 {
       
  1647     int i, j;
       
  1648     qDebug("Case %ssensitive engine", cs ? "" : "in");
       
  1649     qDebug("  States");
       
  1650     for (i = 0; i < s.size(); i++) {
       
  1651         qDebug("  %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");
       
  1652 #ifndef QT_NO_REGEXP_CAPTURE
       
  1653         if (nf > 0)
       
  1654             qDebug("    in atom %d", s[i].atom);
       
  1655 #endif
       
  1656         int m = s[i].match;
       
  1657         if ((m & CharClassBit) != 0) {
       
  1658             qDebug("    match character class %d", m ^ CharClassBit);
       
  1659 #ifndef QT_NO_REGEXP_CCLASS
       
  1660             cl[m ^ CharClassBit].dump();
       
  1661 #else
       
  1662             qDebug("    negative character class");
       
  1663 #endif
       
  1664         } else if ((m & BackRefBit) != 0) {
       
  1665             qDebug("    match back-reference %d", m ^ BackRefBit);
       
  1666         } else if (m >= 0x20 && m <= 0x7e) {
       
  1667             qDebug("    match 0x%.4x (%c)", m, m);
       
  1668         } else {
       
  1669             qDebug("    match 0x%.4x", m);
       
  1670         }
       
  1671         for (j = 0; j < s[i].outs.size(); j++) {
       
  1672             int next = s[i].outs[j];
       
  1673             qDebug("    -> %d", next);
       
  1674             if (s[i].reenter.contains(next))
       
  1675                 qDebug("       [reenter %d]", s[i].reenter[next]);
       
  1676             if (s[i].anchors.value(next) != 0)
       
  1677                 qDebug("       [anchors 0x%.8x]", s[i].anchors[next]);
       
  1678         }
       
  1679     }
       
  1680 #ifndef QT_NO_REGEXP_CAPTURE
       
  1681     if (nf > 0) {
       
  1682         qDebug("  Atom    Parent  Capture");
       
  1683         for (i = 0; i < nf; i++) {
       
  1684             if (f[i].capture == QRegExpAtom::NoCapture) {
       
  1685                 qDebug("  %6d  %6d     nil", i, f[i].parent);
       
  1686             } else {
       
  1687                 int cap = f[i].capture;
       
  1688                 bool official = captureForOfficialCapture.contains(cap);
       
  1689                 qDebug("  %6d  %6d  %6d  %s", i, f[i].parent, f[i].capture,
       
  1690                        official ? "official" : "");
       
  1691             }
       
  1692         }
       
  1693     }
       
  1694 #endif
       
  1695 #ifndef QT_NO_REGEXP_ANCHOR_ALT
       
  1696     for (i = 0; i < aa.size(); i++)
       
  1697         qDebug("  Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b);
       
  1698 #endif
       
  1699 }
       
  1700 #endif
       
  1701 
       
  1702 void QRegExpEngine::setup()
       
  1703 {
       
  1704     ref = 1;
       
  1705 #ifndef QT_NO_REGEXP_CAPTURE
       
  1706     f.resize(32);
       
  1707     nf = 0;
       
  1708     cf = -1;
       
  1709 #endif
       
  1710     officialncap = 0;
       
  1711     ncap = 0;
       
  1712 #ifndef QT_NO_REGEXP_OPTIM
       
  1713     caretAnchored = true;
       
  1714     trivial = true;
       
  1715 #endif
       
  1716     valid = false;
       
  1717 #ifndef QT_NO_REGEXP_BACKREF
       
  1718     nbrefs = 0;
       
  1719 #endif
       
  1720 #ifndef QT_NO_REGEXP_OPTIM
       
  1721     useGoodStringHeuristic = true;
       
  1722     minl = 0;
       
  1723     occ1.fill(0, NumBadChars);
       
  1724 #endif
       
  1725 }
       
  1726 
       
  1727 int QRegExpEngine::setupState(int match)
       
  1728 {
       
  1729 #ifndef QT_NO_REGEXP_CAPTURE
       
  1730     s += QRegExpAutomatonState(cf, match);
       
  1731 #else
       
  1732     s += QRegExpAutomatonState(match);
       
  1733 #endif
       
  1734     return s.size() - 1;
       
  1735 }
       
  1736 
       
  1737 #ifndef QT_NO_REGEXP_CAPTURE
       
  1738 /*
       
  1739   Functions startAtom() and finishAtom() should be called to delimit
       
  1740   atoms. When a state is created, it is assigned to the current atom.
       
  1741   The information is later used for capturing.
       
  1742 */
       
  1743 int QRegExpEngine::startAtom(bool officialCapture)
       
  1744 {
       
  1745     if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size())
       
  1746         f.resize((nf + 1) << 1);
       
  1747     f[nf].parent = cf;
       
  1748     cf = nf++;
       
  1749     f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;
       
  1750     return cf;
       
  1751 }
       
  1752 
       
  1753 void QRegExpEngine::finishAtom(int atom, bool needCapture)
       
  1754 {
       
  1755     if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture)
       
  1756         f[atom].capture = QRegExpAtom::UnofficialCapture;
       
  1757     cf = f.at(atom).parent;
       
  1758 }
       
  1759 #endif
       
  1760 
       
  1761 #ifndef QT_NO_REGEXP_LOOKAHEAD
       
  1762 /*
       
  1763   Creates a lookahead anchor.
       
  1764 */
       
  1765 int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative)
       
  1766 {
       
  1767     int n = ahead.size();
       
  1768     if (n == MaxLookaheads) {
       
  1769         error(RXERR_LIMIT);
       
  1770         return 0;
       
  1771     }
       
  1772     ahead += new QRegExpLookahead(eng, negative);
       
  1773     return Anchor_FirstLookahead << n;
       
  1774 }
       
  1775 #endif
       
  1776 
       
  1777 #ifndef QT_NO_REGEXP_CAPTURE
       
  1778 /*
       
  1779   We want the longest leftmost captures.
       
  1780 */
       
  1781 static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
       
  1782                             const int *end2)
       
  1783 {
       
  1784     for (int i = 0; i < ncap; i++) {
       
  1785         int delta = begin2[i] - begin1[i]; // it has to start early...
       
  1786         if (delta == 0)
       
  1787             delta = end1[i] - end2[i]; // ...and end late
       
  1788 
       
  1789         if (delta != 0)
       
  1790             return delta > 0;
       
  1791     }
       
  1792     return false;
       
  1793 }
       
  1794 #endif
       
  1795 
       
  1796 /*
       
  1797   Returns true if anchor a matches at position pos + i in the input
       
  1798   string, otherwise false.
       
  1799 */
       
  1800 bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)
       
  1801 {
       
  1802     int j;
       
  1803 
       
  1804 #ifndef QT_NO_REGEXP_ANCHOR_ALT
       
  1805     if ((a & QRegExpEngine::Anchor_Alternation) != 0)
       
  1806         return testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)
       
  1807                || testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);
       
  1808 #endif
       
  1809 
       
  1810     if ((a & QRegExpEngine::Anchor_Caret) != 0) {
       
  1811         if (pos + i != caretPos)
       
  1812             return false;
       
  1813     }
       
  1814     if ((a & QRegExpEngine::Anchor_Dollar) != 0) {
       
  1815         if (pos + i != len)
       
  1816             return false;
       
  1817     }
       
  1818 #ifndef QT_NO_REGEXP_ESCAPE
       
  1819     if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) {
       
  1820         bool before = false;
       
  1821         bool after = false;
       
  1822         if (pos + i != 0)
       
  1823             before = isWord(in[pos + i - 1]);
       
  1824         if (pos + i != len)
       
  1825             after = isWord(in[pos + i]);
       
  1826         if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after))
       
  1827             return false;
       
  1828         if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after))
       
  1829             return false;
       
  1830     }
       
  1831 #endif
       
  1832 #ifndef QT_NO_REGEXP_LOOKAHEAD
       
  1833     if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) {
       
  1834         const QVector<QRegExpLookahead *> &ahead = eng->ahead;
       
  1835         for (j = 0; j < ahead.size(); j++) {
       
  1836             if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) {
       
  1837                 QRegExpMatchState matchState;
       
  1838                 matchState.prepareForMatch(ahead[j]->eng);
       
  1839                 matchState.match(in + pos + i, len - pos - i, 0,
       
  1840                     true, true, matchState.caretPos - matchState.pos - i);
       
  1841                 if ((matchState.captured[0] == 0) == ahead[j]->neg)
       
  1842                     return false;
       
  1843             }
       
  1844         }
       
  1845     }
       
  1846 #endif
       
  1847 #ifndef QT_NO_REGEXP_CAPTURE
       
  1848 #ifndef QT_NO_REGEXP_BACKREF
       
  1849     for (j = 0; j < eng->nbrefs; j++) {
       
  1850         if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) {
       
  1851             int i = eng->captureForOfficialCapture.at(j);
       
  1852             if (capBegin[i] != EmptyCapture)
       
  1853                 return false;
       
  1854         }
       
  1855     }
       
  1856 #endif
       
  1857 #endif
       
  1858     return true;
       
  1859 }
       
  1860 
       
  1861 #ifndef QT_NO_REGEXP_OPTIM
       
  1862 /*
       
  1863   The three following functions are what Jeffrey Friedl would call
       
  1864   transmissions (or bump-alongs). Using one or the other should make
       
  1865   no difference except in performance.
       
  1866 */
       
  1867 
       
  1868 bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const
       
  1869 {
       
  1870     int k = matchState.pos + goodEarlyStart;
       
  1871     QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs);
       
  1872     while ((k = matcher.indexIn(matchState.in, matchState.len, k)) != -1) {
       
  1873         int from = k - goodLateStart;
       
  1874         int to = k - goodEarlyStart;
       
  1875         if (from > matchState.pos)
       
  1876             matchState.pos = from;
       
  1877 
       
  1878         while (matchState.pos <= to) {
       
  1879             if (matchState.matchHere())
       
  1880                 return true;
       
  1881             ++matchState.pos;
       
  1882         }
       
  1883         ++k;
       
  1884     }
       
  1885     return false;
       
  1886 }
       
  1887 
       
  1888 bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
       
  1889 {
       
  1890     int slideHead = 0;
       
  1891     int slideNext = 0;
       
  1892     int i;
       
  1893     int lastPos = matchState.len - minl;
       
  1894     memset(matchState.slideTab, 0, matchState.slideTabSize * sizeof(int));
       
  1895 
       
  1896     /*
       
  1897       Set up the slide table, used for the bad-character heuristic,
       
  1898       using the table of first occurrence of each character.
       
  1899     */
       
  1900     for (i = 0; i < minl; i++) {
       
  1901         int sk = occ1[BadChar(matchState.in[matchState.pos + i])];
       
  1902         if (sk == NoOccurrence)
       
  1903             sk = i + 1;
       
  1904         if (sk > 0) {
       
  1905             int k = i + 1 - sk;
       
  1906             if (k < 0) {
       
  1907                 sk = i + 1;
       
  1908                 k = 0;
       
  1909             }
       
  1910             if (sk > matchState.slideTab[k])
       
  1911                 matchState.slideTab[k] = sk;
       
  1912         }
       
  1913     }
       
  1914 
       
  1915     if (matchState.pos > lastPos)
       
  1916         return false;
       
  1917 
       
  1918     for (;;) {
       
  1919         if (++slideNext >= matchState.slideTabSize)
       
  1920             slideNext = 0;
       
  1921         if (matchState.slideTab[slideHead] > 0) {
       
  1922             if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext])
       
  1923                 matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1;
       
  1924             matchState.slideTab[slideHead] = 0;
       
  1925         } else {
       
  1926             if (matchState.matchHere())
       
  1927                 return true;
       
  1928         }
       
  1929 
       
  1930         if (matchState.pos == lastPos)
       
  1931             break;
       
  1932 
       
  1933         /*
       
  1934           Update the slide table. This code has much in common with
       
  1935           the initialization code.
       
  1936         */
       
  1937         int sk = occ1[BadChar(matchState.in[matchState.pos + minl])];
       
  1938         if (sk == NoOccurrence) {
       
  1939             matchState.slideTab[slideNext] = minl;
       
  1940         } else if (sk > 0) {
       
  1941             int k = slideNext + minl - sk;
       
  1942             if (k >= matchState.slideTabSize)
       
  1943                 k -= matchState.slideTabSize;
       
  1944             if (sk > matchState.slideTab[k])
       
  1945                 matchState.slideTab[k] = sk;
       
  1946         }
       
  1947         slideHead = slideNext;
       
  1948         ++matchState.pos;
       
  1949     }
       
  1950     return false;
       
  1951 }
       
  1952 #else
       
  1953 bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const
       
  1954 {
       
  1955     while (matchState.pos <= matchState.len) {
       
  1956         if (matchState.matchHere())
       
  1957             return true;
       
  1958         ++matchState.pos;
       
  1959     }
       
  1960     return false;
       
  1961 }
       
  1962 #endif
       
  1963 
       
  1964 /*
       
  1965   Here's the core of the engine. It tries to do a match here and now.
       
  1966 */
       
  1967 bool QRegExpMatchState::matchHere()
       
  1968 {
       
  1969     int ncur = 1, nnext = 0;
       
  1970     int i = 0, j, k, m;
       
  1971     bool stop = false;
       
  1972 
       
  1973     matchLen = -1;
       
  1974     oneTestMatchedLen = -1;
       
  1975     curStack[0] = QRegExpEngine::InitialState;
       
  1976 
       
  1977     int ncap = eng->ncap;
       
  1978 #ifndef QT_NO_REGEXP_CAPTURE
       
  1979     if (ncap > 0) {
       
  1980         for (j = 0; j < ncap; j++) {
       
  1981             curCapBegin[j] = EmptyCapture;
       
  1982             curCapEnd[j] = EmptyCapture;
       
  1983         }
       
  1984     }
       
  1985 #endif
       
  1986 
       
  1987 #ifndef QT_NO_REGEXP_BACKREF
       
  1988     while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop)
       
  1989 #else
       
  1990     while (ncur > 0 && i <= len - pos && !stop)
       
  1991 #endif
       
  1992     {
       
  1993         int ch = (i < len - pos) ? in[pos + i].unicode() : 0;
       
  1994         for (j = 0; j < ncur; j++) {
       
  1995             int cur = curStack[j];
       
  1996             const QRegExpAutomatonState &scur = eng->s.at(cur);
       
  1997             const QVector<int> &outs = scur.outs;
       
  1998             for (k = 0; k < outs.size(); k++) {
       
  1999                 int next = outs.at(k);
       
  2000                 const QRegExpAutomatonState &snext = eng->s.at(next);
       
  2001                 bool inside = true;
       
  2002 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
       
  2003                 int needSomeSleep = 0;
       
  2004 #endif
       
  2005 
       
  2006                 /*
       
  2007                   First, check if the anchors are anchored properly.
       
  2008                 */
       
  2009                 int a = scur.anchors.value(next);
       
  2010                 if (a != 0 && !testAnchor(i, a, curCapBegin + j * ncap))
       
  2011                     inside = false;
       
  2012 
       
  2013                 /*
       
  2014                   If indeed they are, check if the input character is
       
  2015                   correct for this transition.
       
  2016                 */
       
  2017                 if (inside) {
       
  2018                     m = snext.match;
       
  2019                     if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) {
       
  2020                         if (eng->cs)
       
  2021                             inside = (m == ch);
       
  2022                         else
       
  2023                             inside = (QChar(m).toLower() == QChar(ch).toLower());
       
  2024                     } else if (next == QRegExpEngine::FinalState) {
       
  2025                         matchLen = i;
       
  2026                         stop = minimal;
       
  2027                         inside = true;
       
  2028                     } else if ((m & QRegExpEngine::CharClassBit) != 0) {
       
  2029 #ifndef QT_NO_REGEXP_CCLASS
       
  2030                         const QRegExpCharClass &cc = eng->cl.at(m ^ QRegExpEngine::CharClassBit);
       
  2031                         if (eng->cs)
       
  2032                             inside = cc.in(ch);
       
  2033                         else if (cc.negative())
       
  2034                             inside = cc.in(QChar(ch).toLower()) &&
       
  2035                                      cc.in(QChar(ch).toUpper());
       
  2036                         else
       
  2037                             inside = cc.in(QChar(ch).toLower()) ||
       
  2038                                      cc.in(QChar(ch).toUpper());
       
  2039 #endif
       
  2040 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
       
  2041                     } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */
       
  2042                         int bref = m ^ QRegExpEngine::BackRefBit;
       
  2043                         int ell = j * ncap + eng->captureForOfficialCapture.at(bref - 1);
       
  2044 
       
  2045                         inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;
       
  2046                         if (inside) {
       
  2047                             if (eng->cs)
       
  2048                                 inside = (in[pos + curCapBegin[ell]] == QChar(ch));
       
  2049                             else
       
  2050                                 inside = (in[pos + curCapBegin[ell]].toLower()
       
  2051                                        == QChar(ch).toLower());
       
  2052                         }
       
  2053 
       
  2054                         if (inside) {
       
  2055                             int delta;
       
  2056                             if (curCapEnd[ell] == EmptyCapture)
       
  2057                                 delta = i - curCapBegin[ell];
       
  2058                             else
       
  2059                                 delta = curCapEnd[ell] - curCapBegin[ell];
       
  2060 
       
  2061                             inside = (delta <= len - (pos + i));
       
  2062                             if (inside && delta > 1) {
       
  2063                                 int n = 1;
       
  2064                                 if (eng->cs) {
       
  2065                                     while (n < delta) {
       
  2066                                         if (in[pos + curCapBegin[ell] + n]
       
  2067                                             != in[pos + i + n])
       
  2068                                             break;
       
  2069                                         ++n;
       
  2070                                     }
       
  2071                                 } else {
       
  2072                                     while (n < delta) {
       
  2073                                         QChar a = in[pos + curCapBegin[ell] + n];
       
  2074                                         QChar b = in[pos + i + n];
       
  2075                                         if (a.toLower() != b.toLower())
       
  2076                                             break;
       
  2077                                         ++n;
       
  2078                                     }
       
  2079                                 }
       
  2080                                 inside = (n == delta);
       
  2081                                 if (inside)
       
  2082                                     needSomeSleep = delta - 1;
       
  2083                             }
       
  2084                         }
       
  2085 #endif
       
  2086                     }
       
  2087                 }
       
  2088 
       
  2089                 /*
       
  2090                   We must now update our data structures.
       
  2091                 */
       
  2092                 if (inside) {
       
  2093 #ifndef QT_NO_REGEXP_CAPTURE
       
  2094                     int *capBegin, *capEnd;
       
  2095 #endif
       
  2096                     /*
       
  2097                       If the next state was not encountered yet, all
       
  2098                       is fine.
       
  2099                     */
       
  2100                     if ((m = inNextStack[next]) == -1) {
       
  2101                         m = nnext++;
       
  2102                         nextStack[m] = next;
       
  2103                         inNextStack[next] = m;
       
  2104 #ifndef QT_NO_REGEXP_CAPTURE
       
  2105                         capBegin = nextCapBegin + m * ncap;
       
  2106                         capEnd = nextCapEnd + m * ncap;
       
  2107 
       
  2108                     /*
       
  2109                       Otherwise, we'll first maintain captures in
       
  2110                       temporary arrays, and decide at the end whether
       
  2111                       it's best to keep the previous capture zones or
       
  2112                       the new ones.
       
  2113                     */
       
  2114                     } else {
       
  2115                         capBegin = tempCapBegin;
       
  2116                         capEnd = tempCapEnd;
       
  2117 #endif
       
  2118                     }
       
  2119 
       
  2120 #ifndef QT_NO_REGEXP_CAPTURE
       
  2121                     /*
       
  2122                       Updating the capture zones is much of a task.
       
  2123                     */
       
  2124                     if (ncap > 0) {
       
  2125                         memcpy(capBegin, curCapBegin + j * ncap, ncap * sizeof(int));
       
  2126                         memcpy(capEnd, curCapEnd + j * ncap, ncap * sizeof(int));
       
  2127                         int c = scur.atom, n = snext.atom;
       
  2128                         int p = -1, q = -1;
       
  2129                         int cap;
       
  2130 
       
  2131                         /*
       
  2132                           Lemma 1. For any x in the range [0..nf), we
       
  2133                           have f[x].parent < x.
       
  2134 
       
  2135                           Proof. By looking at startAtom(), it is
       
  2136                           clear that cf < nf holds all the time, and
       
  2137                           thus that f[nf].parent < nf.
       
  2138                         */
       
  2139 
       
  2140                         /*
       
  2141                           If we are reentering an atom, we empty all
       
  2142                           capture zones inside it.
       
  2143                         */
       
  2144                         if ((q = scur.reenter.value(next)) != 0) {
       
  2145                             QBitArray b(eng->nf, false);
       
  2146                             b.setBit(q, true);
       
  2147                             for (int ell = q + 1; ell < eng->nf; ell++) {
       
  2148                                 if (b.testBit(eng->f.at(ell).parent)) {
       
  2149                                     b.setBit(ell, true);
       
  2150                                     cap = eng->f.at(ell).capture;
       
  2151                                     if (cap >= 0) {
       
  2152                                         capBegin[cap] = EmptyCapture;
       
  2153                                         capEnd[cap] = EmptyCapture;
       
  2154                                     }
       
  2155                                 }
       
  2156                             }
       
  2157                             p = eng->f.at(q).parent;
       
  2158 
       
  2159                         /*
       
  2160                           Otherwise, close the capture zones we are
       
  2161                           leaving. We are leaving f[c].capture,
       
  2162                           f[f[c].parent].capture,
       
  2163                           f[f[f[c].parent].parent].capture, ...,
       
  2164                           until f[x].capture, with x such that
       
  2165                           f[x].parent is the youngest common ancestor
       
  2166                           for c and n.
       
  2167 
       
  2168                           We go up along c's and n's ancestry until
       
  2169                           we find x.
       
  2170                         */
       
  2171                         } else {
       
  2172                             p = c;
       
  2173                             q = n;
       
  2174                             while (p != q) {
       
  2175                                 if (p > q) {
       
  2176                                     cap = eng->f.at(p).capture;
       
  2177                                     if (cap >= 0) {
       
  2178                                         if (capBegin[cap] == i) {
       
  2179                                             capBegin[cap] = EmptyCapture;
       
  2180                                             capEnd[cap] = EmptyCapture;
       
  2181                                         } else {
       
  2182                                             capEnd[cap] = i;
       
  2183                                         }
       
  2184                                     }
       
  2185                                     p = eng->f.at(p).parent;
       
  2186                                 } else {
       
  2187                                     q = eng->f.at(q).parent;
       
  2188                                 }
       
  2189                             }
       
  2190                         }
       
  2191 
       
  2192                         /*
       
  2193                           In any case, we now open the capture zones
       
  2194                           we are entering. We work upwards from n
       
  2195                           until we reach p (the parent of the atom we
       
  2196                           reenter or the youngest common ancestor).
       
  2197                         */
       
  2198                         while (n > p) {
       
  2199                             cap = eng->f.at(n).capture;
       
  2200                             if (cap >= 0) {
       
  2201                                 capBegin[cap] = i;
       
  2202                                 capEnd[cap] = EmptyCapture;
       
  2203                             }
       
  2204                             n = eng->f.at(n).parent;
       
  2205                         }
       
  2206                         /*
       
  2207                           If the next state was already in
       
  2208                           nextStack, we must choose carefully which
       
  2209                           capture zones we want to keep.
       
  2210                         */
       
  2211                         if (capBegin == tempCapBegin &&
       
  2212                                 isBetterCapture(ncap, capBegin, capEnd, nextCapBegin + m * ncap,
       
  2213                                                 nextCapEnd + m * ncap)) {
       
  2214                             memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
       
  2215                             memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
       
  2216                         }
       
  2217                     }
       
  2218 #ifndef QT_NO_REGEXP_BACKREF
       
  2219                     /*
       
  2220                       We are done with updating the capture zones.
       
  2221                       It's now time to put the next state to sleep,
       
  2222                       if it needs to, and to remove it from
       
  2223                       nextStack.
       
  2224                     */
       
  2225                     if (needSomeSleep > 0) {
       
  2226                         QVector<int> zzZ(2 + 2 * ncap);
       
  2227                         zzZ[0] = i + needSomeSleep;
       
  2228                         zzZ[1] = next;
       
  2229                         if (ncap > 0) {
       
  2230                             memcpy(zzZ.data() + 2, capBegin, ncap * sizeof(int));
       
  2231                             memcpy(zzZ.data() + 2 + ncap, capEnd, ncap * sizeof(int));
       
  2232                         }
       
  2233                         inNextStack[nextStack[--nnext]] = -1;
       
  2234                         sleeping.append(zzZ);
       
  2235                     }
       
  2236 #endif
       
  2237 #endif
       
  2238                 }
       
  2239             }
       
  2240         }
       
  2241 #ifndef QT_NO_REGEXP_CAPTURE
       
  2242         /*
       
  2243           If we reached the final state, hurray! Copy the captured
       
  2244           zone.
       
  2245         */
       
  2246         if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) {
       
  2247             memcpy(capBegin, nextCapBegin + m * ncap, ncap * sizeof(int));
       
  2248             memcpy(capEnd, nextCapEnd + m * ncap, ncap * sizeof(int));
       
  2249         }
       
  2250 #ifndef QT_NO_REGEXP_BACKREF
       
  2251         /*
       
  2252           It's time to wake up the sleepers.
       
  2253         */
       
  2254         j = 0;
       
  2255         while (j < sleeping.count()) {
       
  2256             if (sleeping.at(j)[0] == i) {
       
  2257                 const QVector<int> &zzZ = sleeping.at(j);
       
  2258                 int next = zzZ[1];
       
  2259                 const int *capBegin = zzZ.data() + 2;
       
  2260                 const int *capEnd = zzZ.data() + 2 + ncap;
       
  2261                 bool copyOver = true;
       
  2262 
       
  2263                 if ((m = inNextStack[next]) == -1) {
       
  2264                     m = nnext++;
       
  2265                     nextStack[m] = next;
       
  2266                     inNextStack[next] = m;
       
  2267                 } else {
       
  2268                     copyOver = isBetterCapture(ncap, nextCapBegin + m * ncap, nextCapEnd + m * ncap,
       
  2269                                                capBegin, capEnd);
       
  2270                 }
       
  2271                 if (copyOver) {
       
  2272                     memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
       
  2273                     memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
       
  2274                 }
       
  2275 
       
  2276                 sleeping.removeAt(j);
       
  2277             } else {
       
  2278                 ++j;
       
  2279             }
       
  2280         }
       
  2281 #endif
       
  2282 #endif
       
  2283         for (j = 0; j < nnext; j++)
       
  2284             inNextStack[nextStack[j]] = -1;
       
  2285 
       
  2286         // avoid needless iteration that confuses oneTestMatchedLen
       
  2287         if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState
       
  2288 #ifndef QT_NO_REGEXP_BACKREF
       
  2289              && sleeping.isEmpty()
       
  2290 #endif
       
  2291            )
       
  2292             stop = true;
       
  2293 
       
  2294         qSwap(curStack, nextStack);
       
  2295 #ifndef QT_NO_REGEXP_CAPTURE
       
  2296         qSwap(curCapBegin, nextCapBegin);
       
  2297         qSwap(curCapEnd, nextCapEnd);
       
  2298 #endif
       
  2299         ncur = nnext;
       
  2300         nnext = 0;
       
  2301         ++i;
       
  2302     }
       
  2303 
       
  2304 #ifndef QT_NO_REGEXP_BACKREF
       
  2305     /*
       
  2306       If minimal matching is enabled, we might have some sleepers
       
  2307       left.
       
  2308     */
       
  2309     if (!sleeping.isEmpty())
       
  2310         sleeping.clear();
       
  2311 #endif
       
  2312 
       
  2313     oneTestMatchedLen = i - 1;
       
  2314     return (matchLen >= 0);
       
  2315 }
       
  2316 
       
  2317 #ifndef QT_NO_REGEXP_CCLASS
       
  2318 
       
  2319 QRegExpCharClass::QRegExpCharClass()
       
  2320     : c(0), n(false)
       
  2321 {
       
  2322 #ifndef QT_NO_REGEXP_OPTIM
       
  2323     occ1.fill(NoOccurrence, NumBadChars);
       
  2324 #endif
       
  2325 }
       
  2326 
       
  2327 QRegExpCharClass &QRegExpCharClass::operator=(const QRegExpCharClass &cc)
       
  2328 {
       
  2329     c = cc.c;
       
  2330     r = cc.r;
       
  2331     n = cc.n;
       
  2332 #ifndef QT_NO_REGEXP_OPTIM
       
  2333     occ1 = cc.occ1;
       
  2334 #endif
       
  2335     return *this;
       
  2336 }
       
  2337 
       
  2338 void QRegExpCharClass::clear()
       
  2339 {
       
  2340     c = 0;
       
  2341     r.resize(0);
       
  2342     n = false;
       
  2343 }
       
  2344 
       
  2345 void QRegExpCharClass::setNegative(bool negative)
       
  2346 {
       
  2347     n = negative;
       
  2348 #ifndef QT_NO_REGEXP_OPTIM
       
  2349     occ1.fill(0, NumBadChars);
       
  2350 #endif
       
  2351 }
       
  2352 
       
  2353 void QRegExpCharClass::addCategories(int cats)
       
  2354 {
       
  2355     c |= cats;
       
  2356 #ifndef QT_NO_REGEXP_OPTIM
       
  2357     occ1.fill(0, NumBadChars);
       
  2358 #endif
       
  2359 }
       
  2360 
       
  2361 void QRegExpCharClass::addRange(ushort from, ushort to)
       
  2362 {
       
  2363     if (from > to)
       
  2364         qSwap(from, to);
       
  2365     int m = r.size();
       
  2366     r.resize(m + 1);
       
  2367     r[m].from = from;
       
  2368     r[m].len = to - from + 1;
       
  2369 
       
  2370 #ifndef QT_NO_REGEXP_OPTIM
       
  2371     int i;
       
  2372 
       
  2373     if (to - from < NumBadChars) {
       
  2374         if (from % NumBadChars <= to % NumBadChars) {
       
  2375             for (i = from % NumBadChars; i <= to % NumBadChars; i++)
       
  2376                 occ1[i] = 0;
       
  2377         } else {
       
  2378             for (i = 0; i <= to % NumBadChars; i++)
       
  2379                 occ1[i] = 0;
       
  2380             for (i = from % NumBadChars; i < NumBadChars; i++)
       
  2381                 occ1[i] = 0;
       
  2382         }
       
  2383     } else {
       
  2384         occ1.fill(0, NumBadChars);
       
  2385     }
       
  2386 #endif
       
  2387 }
       
  2388 
       
  2389 bool QRegExpCharClass::in(QChar ch) const
       
  2390 {
       
  2391 #ifndef QT_NO_REGEXP_OPTIM
       
  2392     if (occ1.at(BadChar(ch)) == NoOccurrence)
       
  2393         return n;
       
  2394 #endif
       
  2395 
       
  2396     if (c != 0 && (c & (1 << (int)ch.category())) != 0)
       
  2397         return !n;
       
  2398 
       
  2399     const int uc = ch.unicode();
       
  2400     int size = r.size();
       
  2401 
       
  2402     for (int i = 0; i < size; ++i) {
       
  2403         const QRegExpCharClassRange &range = r.at(i);
       
  2404         if (uint(uc - range.from) < uint(r.at(i).len))
       
  2405             return !n;
       
  2406     }
       
  2407     return n;
       
  2408 }
       
  2409 
       
  2410 #if defined(QT_DEBUG)
       
  2411 void QRegExpCharClass::dump() const
       
  2412 {
       
  2413     int i;
       
  2414     qDebug("    %stive character class", n ? "nega" : "posi");
       
  2415 #ifndef QT_NO_REGEXP_CCLASS
       
  2416     if (c != 0)
       
  2417         qDebug("      categories 0x%.8x", c);
       
  2418 #endif
       
  2419     for (i = 0; i < r.size(); i++)
       
  2420         qDebug("      0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1);
       
  2421 }
       
  2422 #endif
       
  2423 #endif
       
  2424 
       
  2425 QRegExpEngine::Box::Box(QRegExpEngine *engine)
       
  2426     : eng(engine), skipanchors(0)
       
  2427 #ifndef QT_NO_REGEXP_OPTIM
       
  2428       , earlyStart(0), lateStart(0), maxl(0)
       
  2429 #endif
       
  2430 {
       
  2431 #ifndef QT_NO_REGEXP_OPTIM
       
  2432     occ1.fill(NoOccurrence, NumBadChars);
       
  2433 #endif
       
  2434     minl = 0;
       
  2435 }
       
  2436 
       
  2437 QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)
       
  2438 {
       
  2439     eng = b.eng;
       
  2440     ls = b.ls;
       
  2441     rs = b.rs;
       
  2442     lanchors = b.lanchors;
       
  2443     ranchors = b.ranchors;
       
  2444     skipanchors = b.skipanchors;
       
  2445 #ifndef QT_NO_REGEXP_OPTIM
       
  2446     earlyStart = b.earlyStart;
       
  2447     lateStart = b.lateStart;
       
  2448     str = b.str;
       
  2449     leftStr = b.leftStr;
       
  2450     rightStr = b.rightStr;
       
  2451     maxl = b.maxl;
       
  2452     occ1 = b.occ1;
       
  2453 #endif
       
  2454     minl = b.minl;
       
  2455     return *this;
       
  2456 }
       
  2457 
       
  2458 void QRegExpEngine::Box::set(QChar ch)
       
  2459 {
       
  2460     ls.resize(1);
       
  2461     ls[0] = eng->createState(ch);
       
  2462     rs = ls;
       
  2463 #ifndef QT_NO_REGEXP_OPTIM
       
  2464     str = ch;
       
  2465     leftStr = ch;
       
  2466     rightStr = ch;
       
  2467     maxl = 1;
       
  2468     occ1[BadChar(ch)] = 0;
       
  2469 #endif
       
  2470     minl = 1;
       
  2471 }
       
  2472 
       
  2473 void QRegExpEngine::Box::set(const QRegExpCharClass &cc)
       
  2474 {
       
  2475     ls.resize(1);
       
  2476     ls[0] = eng->createState(cc);
       
  2477     rs = ls;
       
  2478 #ifndef QT_NO_REGEXP_OPTIM
       
  2479     maxl = 1;
       
  2480     occ1 = cc.firstOccurrence();
       
  2481 #endif
       
  2482     minl = 1;
       
  2483 }
       
  2484 
       
  2485 #ifndef QT_NO_REGEXP_BACKREF
       
  2486 void QRegExpEngine::Box::set(int bref)
       
  2487 {
       
  2488     ls.resize(1);
       
  2489     ls[0] = eng->createState(bref);
       
  2490     rs = ls;
       
  2491     if (bref >= 1 && bref <= MaxBackRefs)
       
  2492         skipanchors = Anchor_BackRef0Empty << bref;
       
  2493 #ifndef QT_NO_REGEXP_OPTIM
       
  2494     maxl = InftyLen;
       
  2495 #endif
       
  2496     minl = 0;
       
  2497 }
       
  2498 #endif
       
  2499 
       
  2500 void QRegExpEngine::Box::cat(const Box &b)
       
  2501 {
       
  2502     eng->addCatTransitions(rs, b.ls);
       
  2503     addAnchorsToEngine(b);
       
  2504     if (minl == 0) {
       
  2505         lanchors.unite(b.lanchors);
       
  2506         if (skipanchors != 0) {
       
  2507             for (int i = 0; i < b.ls.size(); i++) {
       
  2508                 int a = eng->anchorConcatenation(lanchors.value(b.ls.at(i), 0), skipanchors);
       
  2509                 lanchors.insert(b.ls.at(i), a);
       
  2510             }
       
  2511         }
       
  2512         mergeInto(&ls, b.ls);
       
  2513     }
       
  2514     if (b.minl == 0) {
       
  2515         ranchors.unite(b.ranchors);
       
  2516         if (b.skipanchors != 0) {
       
  2517             for (int i = 0; i < rs.size(); i++) {
       
  2518                 int a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), b.skipanchors);
       
  2519                 ranchors.insert(rs.at(i), a);
       
  2520             }
       
  2521         }
       
  2522         mergeInto(&rs, b.rs);
       
  2523     } else {
       
  2524         ranchors = b.ranchors;
       
  2525         rs = b.rs;
       
  2526     }
       
  2527 
       
  2528 #ifndef QT_NO_REGEXP_OPTIM
       
  2529     if (maxl != InftyLen) {
       
  2530         if (rightStr.length() + b.leftStr.length() >
       
  2531              qMax(str.length(), b.str.length())) {
       
  2532             earlyStart = minl - rightStr.length();
       
  2533             lateStart = maxl - rightStr.length();
       
  2534             str = rightStr + b.leftStr;
       
  2535         } else if (b.str.length() > str.length()) {
       
  2536             earlyStart = minl + b.earlyStart;
       
  2537             lateStart = maxl + b.lateStart;
       
  2538             str = b.str;
       
  2539         }
       
  2540     }
       
  2541 
       
  2542     if (leftStr.length() == maxl)
       
  2543         leftStr += b.leftStr;
       
  2544 
       
  2545     if (b.rightStr.length() == b.maxl) {
       
  2546         rightStr += b.rightStr;
       
  2547     } else {
       
  2548         rightStr = b.rightStr;
       
  2549     }
       
  2550 
       
  2551     if (maxl == InftyLen || b.maxl == InftyLen) {
       
  2552         maxl = InftyLen;
       
  2553     } else {
       
  2554         maxl += b.maxl;
       
  2555     }
       
  2556 
       
  2557     for (int i = 0; i < NumBadChars; i++) {
       
  2558         if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))
       
  2559             occ1[i] = minl + b.occ1.at(i);
       
  2560     }
       
  2561 #endif
       
  2562 
       
  2563     minl += b.minl;
       
  2564     if (minl == 0)
       
  2565         skipanchors = eng->anchorConcatenation(skipanchors, b.skipanchors);
       
  2566     else
       
  2567         skipanchors = 0;
       
  2568 }
       
  2569 
       
  2570 void QRegExpEngine::Box::orx(const Box &b)
       
  2571 {
       
  2572     mergeInto(&ls, b.ls);
       
  2573     lanchors.unite(b.lanchors);
       
  2574     mergeInto(&rs, b.rs);
       
  2575     ranchors.unite(b.ranchors);
       
  2576 
       
  2577     if (b.minl == 0) {
       
  2578         if (minl == 0)
       
  2579             skipanchors = eng->anchorAlternation(skipanchors, b.skipanchors);
       
  2580         else
       
  2581             skipanchors = b.skipanchors;
       
  2582     }
       
  2583 
       
  2584 #ifndef QT_NO_REGEXP_OPTIM
       
  2585     for (int i = 0; i < NumBadChars; i++) {
       
  2586         if (occ1.at(i) > b.occ1.at(i))
       
  2587             occ1[i] = b.occ1.at(i);
       
  2588     }
       
  2589     earlyStart = 0;
       
  2590     lateStart = 0;
       
  2591     str = QString();
       
  2592     leftStr = QString();
       
  2593     rightStr = QString();
       
  2594     if (b.maxl > maxl)
       
  2595         maxl = b.maxl;
       
  2596 #endif
       
  2597     if (b.minl < minl)
       
  2598         minl = b.minl;
       
  2599 }
       
  2600 
       
  2601 void QRegExpEngine::Box::plus(int atom)
       
  2602 {
       
  2603 #ifndef QT_NO_REGEXP_CAPTURE
       
  2604     eng->addPlusTransitions(rs, ls, atom);
       
  2605 #else
       
  2606     Q_UNUSED(atom);
       
  2607     eng->addCatTransitions(rs, ls);
       
  2608 #endif
       
  2609     addAnchorsToEngine(*this);
       
  2610 #ifndef QT_NO_REGEXP_OPTIM
       
  2611     maxl = InftyLen;
       
  2612 #endif
       
  2613 }
       
  2614 
       
  2615 void QRegExpEngine::Box::opt()
       
  2616 {
       
  2617 #ifndef QT_NO_REGEXP_OPTIM
       
  2618     earlyStart = 0;
       
  2619     lateStart = 0;
       
  2620     str = QString();
       
  2621     leftStr = QString();
       
  2622     rightStr = QString();
       
  2623 #endif
       
  2624     skipanchors = 0;
       
  2625     minl = 0;
       
  2626 }
       
  2627 
       
  2628 void QRegExpEngine::Box::catAnchor(int a)
       
  2629 {
       
  2630     if (a != 0) {
       
  2631         for (int i = 0; i < rs.size(); i++) {
       
  2632             a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), a);
       
  2633             ranchors.insert(rs.at(i), a);
       
  2634         }
       
  2635         if (minl == 0)
       
  2636             skipanchors = eng->anchorConcatenation(skipanchors, a);
       
  2637     }
       
  2638 }
       
  2639 
       
  2640 #ifndef QT_NO_REGEXP_OPTIM
       
  2641 void QRegExpEngine::Box::setupHeuristics()
       
  2642 {
       
  2643     eng->goodEarlyStart = earlyStart;
       
  2644     eng->goodLateStart = lateStart;
       
  2645     eng->goodStr = eng->cs ? str : str.toLower();
       
  2646 
       
  2647     eng->minl = minl;
       
  2648     if (eng->cs) {
       
  2649         /*
       
  2650           A regular expression such as 112|1 has occ1['2'] = 2 and minl =
       
  2651           1 at this point. An entry of occ1 has to be at most minl or
       
  2652           infinity for the rest of the algorithm to go well.
       
  2653 
       
  2654           We waited until here before normalizing these cases (instead of
       
  2655           doing it in Box::orx()) because sometimes things improve by
       
  2656           themselves. Consider for example (112|1)34.
       
  2657         */
       
  2658         for (int i = 0; i < NumBadChars; i++) {
       
  2659             if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)
       
  2660                 occ1[i] = minl;
       
  2661         }
       
  2662         eng->occ1 = occ1;
       
  2663     } else {
       
  2664         eng->occ1.fill(0, NumBadChars);
       
  2665     }
       
  2666 
       
  2667     eng->heuristicallyChooseHeuristic();
       
  2668 }
       
  2669 #endif
       
  2670 
       
  2671 #if defined(QT_DEBUG)
       
  2672 void QRegExpEngine::Box::dump() const
       
  2673 {
       
  2674     int i;
       
  2675     qDebug("Box of at least %d character%s", minl, minl == 1 ? "" : "s");
       
  2676     qDebug("  Left states:");
       
  2677     for (i = 0; i < ls.size(); i++) {
       
  2678         if (lanchors.value(ls[i], 0) == 0)
       
  2679             qDebug("    %d", ls[i]);
       
  2680         else
       
  2681             qDebug("    %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]);
       
  2682     }
       
  2683     qDebug("  Right states:");
       
  2684     for (i = 0; i < rs.size(); i++) {
       
  2685         if (ranchors.value(rs[i], 0) == 0)
       
  2686             qDebug("    %d", rs[i]);
       
  2687         else
       
  2688             qDebug("    %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]);
       
  2689     }
       
  2690     qDebug("  Skip anchors: 0x%.8x", skipanchors);
       
  2691 }
       
  2692 #endif
       
  2693 
       
  2694 void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const
       
  2695 {
       
  2696     for (int i = 0; i < to.ls.size(); i++) {
       
  2697         for (int j = 0; j < rs.size(); j++) {
       
  2698             int a = eng->anchorConcatenation(ranchors.value(rs.at(j), 0),
       
  2699                                              to.lanchors.value(to.ls.at(i), 0));
       
  2700             eng->addAnchors(rs[j], to.ls[i], a);
       
  2701         }
       
  2702     }
       
  2703 }
       
  2704 
       
  2705 void QRegExpEngine::setupCategoriesRangeMap()
       
  2706 {
       
  2707    categoriesRangeMap.insert("IsBasicLatin",                           qMakePair(0x0000, 0x007F));
       
  2708    categoriesRangeMap.insert("IsLatin-1Supplement",                    qMakePair(0x0080, 0x00FF));
       
  2709    categoriesRangeMap.insert("IsLatinExtended-A",                      qMakePair(0x0100, 0x017F));
       
  2710    categoriesRangeMap.insert("IsLatinExtended-B",                      qMakePair(0x0180, 0x024F));
       
  2711    categoriesRangeMap.insert("IsIPAExtensions",                        qMakePair(0x0250, 0x02AF));
       
  2712    categoriesRangeMap.insert("IsSpacingModifierLetters",               qMakePair(0x02B0, 0x02FF));
       
  2713    categoriesRangeMap.insert("IsCombiningDiacriticalMarks",            qMakePair(0x0300, 0x036F));
       
  2714    categoriesRangeMap.insert("IsGreek",                                qMakePair(0x0370, 0x03FF));
       
  2715    categoriesRangeMap.insert("IsCyrillic",                             qMakePair(0x0400, 0x04FF));
       
  2716    categoriesRangeMap.insert("IsCyrillicSupplement",                   qMakePair(0x0500, 0x052F));
       
  2717    categoriesRangeMap.insert("IsArmenian",                             qMakePair(0x0530, 0x058F));
       
  2718    categoriesRangeMap.insert("IsHebrew",                               qMakePair(0x0590, 0x05FF));
       
  2719    categoriesRangeMap.insert("IsArabic",                               qMakePair(0x0600, 0x06FF));
       
  2720    categoriesRangeMap.insert("IsSyriac",                               qMakePair(0x0700, 0x074F));
       
  2721    categoriesRangeMap.insert("IsArabicSupplement",                     qMakePair(0x0750, 0x077F));
       
  2722    categoriesRangeMap.insert("IsThaana",                               qMakePair(0x0780, 0x07BF));
       
  2723    categoriesRangeMap.insert("IsDevanagari",                           qMakePair(0x0900, 0x097F));
       
  2724    categoriesRangeMap.insert("IsBengali",                              qMakePair(0x0980, 0x09FF));
       
  2725    categoriesRangeMap.insert("IsGurmukhi",                             qMakePair(0x0A00, 0x0A7F));
       
  2726    categoriesRangeMap.insert("IsGujarati",                             qMakePair(0x0A80, 0x0AFF));
       
  2727    categoriesRangeMap.insert("IsOriya",                                qMakePair(0x0B00, 0x0B7F));
       
  2728    categoriesRangeMap.insert("IsTamil",                                qMakePair(0x0B80, 0x0BFF));
       
  2729    categoriesRangeMap.insert("IsTelugu",                               qMakePair(0x0C00, 0x0C7F));
       
  2730    categoriesRangeMap.insert("IsKannada",                              qMakePair(0x0C80, 0x0CFF));
       
  2731    categoriesRangeMap.insert("IsMalayalam",                            qMakePair(0x0D00, 0x0D7F));
       
  2732    categoriesRangeMap.insert("IsSinhala",                              qMakePair(0x0D80, 0x0DFF));
       
  2733    categoriesRangeMap.insert("IsThai",                                 qMakePair(0x0E00, 0x0E7F));
       
  2734    categoriesRangeMap.insert("IsLao",                                  qMakePair(0x0E80, 0x0EFF));
       
  2735    categoriesRangeMap.insert("IsTibetan",                              qMakePair(0x0F00, 0x0FFF));
       
  2736    categoriesRangeMap.insert("IsMyanmar",                              qMakePair(0x1000, 0x109F));
       
  2737    categoriesRangeMap.insert("IsGeorgian",                             qMakePair(0x10A0, 0x10FF));
       
  2738    categoriesRangeMap.insert("IsHangulJamo",                           qMakePair(0x1100, 0x11FF));
       
  2739    categoriesRangeMap.insert("IsEthiopic",                             qMakePair(0x1200, 0x137F));
       
  2740    categoriesRangeMap.insert("IsEthiopicSupplement",                   qMakePair(0x1380, 0x139F));
       
  2741    categoriesRangeMap.insert("IsCherokee",                             qMakePair(0x13A0, 0x13FF));
       
  2742    categoriesRangeMap.insert("IsUnifiedCanadianAboriginalSyllabics",   qMakePair(0x1400, 0x167F));
       
  2743    categoriesRangeMap.insert("IsOgham",                                qMakePair(0x1680, 0x169F));
       
  2744    categoriesRangeMap.insert("IsRunic",                                qMakePair(0x16A0, 0x16FF));
       
  2745    categoriesRangeMap.insert("IsTagalog",                              qMakePair(0x1700, 0x171F));
       
  2746    categoriesRangeMap.insert("IsHanunoo",                              qMakePair(0x1720, 0x173F));
       
  2747    categoriesRangeMap.insert("IsBuhid",                                qMakePair(0x1740, 0x175F));
       
  2748    categoriesRangeMap.insert("IsTagbanwa",                             qMakePair(0x1760, 0x177F));
       
  2749    categoriesRangeMap.insert("IsKhmer",                                qMakePair(0x1780, 0x17FF));
       
  2750    categoriesRangeMap.insert("IsMongolian",                            qMakePair(0x1800, 0x18AF));
       
  2751    categoriesRangeMap.insert("IsLimbu",                                qMakePair(0x1900, 0x194F));
       
  2752    categoriesRangeMap.insert("IsTaiLe",                                qMakePair(0x1950, 0x197F));
       
  2753    categoriesRangeMap.insert("IsNewTaiLue",                            qMakePair(0x1980, 0x19DF));
       
  2754    categoriesRangeMap.insert("IsKhmerSymbols",                         qMakePair(0x19E0, 0x19FF));
       
  2755    categoriesRangeMap.insert("IsBuginese",                             qMakePair(0x1A00, 0x1A1F));
       
  2756    categoriesRangeMap.insert("IsPhoneticExtensions",                   qMakePair(0x1D00, 0x1D7F));
       
  2757    categoriesRangeMap.insert("IsPhoneticExtensionsSupplement",         qMakePair(0x1D80, 0x1DBF));
       
  2758    categoriesRangeMap.insert("IsCombiningDiacriticalMarksSupplement",  qMakePair(0x1DC0, 0x1DFF));
       
  2759    categoriesRangeMap.insert("IsLatinExtendedAdditional",              qMakePair(0x1E00, 0x1EFF));
       
  2760    categoriesRangeMap.insert("IsGreekExtended",                        qMakePair(0x1F00, 0x1FFF));
       
  2761    categoriesRangeMap.insert("IsGeneralPunctuation",                   qMakePair(0x2000, 0x206F));
       
  2762    categoriesRangeMap.insert("IsSuperscriptsandSubscripts",            qMakePair(0x2070, 0x209F));
       
  2763    categoriesRangeMap.insert("IsCurrencySymbols",                      qMakePair(0x20A0, 0x20CF));
       
  2764    categoriesRangeMap.insert("IsCombiningMarksforSymbols",             qMakePair(0x20D0, 0x20FF));
       
  2765    categoriesRangeMap.insert("IsLetterlikeSymbols",                    qMakePair(0x2100, 0x214F));
       
  2766    categoriesRangeMap.insert("IsNumberForms",                          qMakePair(0x2150, 0x218F));
       
  2767    categoriesRangeMap.insert("IsArrows",                               qMakePair(0x2190, 0x21FF));
       
  2768    categoriesRangeMap.insert("IsMathematicalOperators",                qMakePair(0x2200, 0x22FF));
       
  2769    categoriesRangeMap.insert("IsMiscellaneousTechnical",               qMakePair(0x2300, 0x23FF));
       
  2770    categoriesRangeMap.insert("IsControlPictures",                      qMakePair(0x2400, 0x243F));
       
  2771    categoriesRangeMap.insert("IsOpticalCharacterRecognition",          qMakePair(0x2440, 0x245F));
       
  2772    categoriesRangeMap.insert("IsEnclosedAlphanumerics",                qMakePair(0x2460, 0x24FF));
       
  2773    categoriesRangeMap.insert("IsBoxDrawing",                           qMakePair(0x2500, 0x257F));
       
  2774    categoriesRangeMap.insert("IsBlockElements",                        qMakePair(0x2580, 0x259F));
       
  2775    categoriesRangeMap.insert("IsGeometricShapes",                      qMakePair(0x25A0, 0x25FF));
       
  2776    categoriesRangeMap.insert("IsMiscellaneousSymbols",                 qMakePair(0x2600, 0x26FF));
       
  2777    categoriesRangeMap.insert("IsDingbats",                             qMakePair(0x2700, 0x27BF));
       
  2778    categoriesRangeMap.insert("IsMiscellaneousMathematicalSymbols-A",   qMakePair(0x27C0, 0x27EF));
       
  2779    categoriesRangeMap.insert("IsSupplementalArrows-A",                 qMakePair(0x27F0, 0x27FF));
       
  2780    categoriesRangeMap.insert("IsBraillePatterns",                      qMakePair(0x2800, 0x28FF));
       
  2781    categoriesRangeMap.insert("IsSupplementalArrows-B",                 qMakePair(0x2900, 0x297F));
       
  2782    categoriesRangeMap.insert("IsMiscellaneousMathematicalSymbols-B",   qMakePair(0x2980, 0x29FF));
       
  2783    categoriesRangeMap.insert("IsSupplementalMathematicalOperators",    qMakePair(0x2A00, 0x2AFF));
       
  2784    categoriesRangeMap.insert("IsMiscellaneousSymbolsandArrows",        qMakePair(0x2B00, 0x2BFF));
       
  2785    categoriesRangeMap.insert("IsGlagolitic",                           qMakePair(0x2C00, 0x2C5F));
       
  2786    categoriesRangeMap.insert("IsCoptic",                               qMakePair(0x2C80, 0x2CFF));
       
  2787    categoriesRangeMap.insert("IsGeorgianSupplement",                   qMakePair(0x2D00, 0x2D2F));
       
  2788    categoriesRangeMap.insert("IsTifinagh",                             qMakePair(0x2D30, 0x2D7F));
       
  2789    categoriesRangeMap.insert("IsEthiopicExtended",                     qMakePair(0x2D80, 0x2DDF));
       
  2790    categoriesRangeMap.insert("IsSupplementalPunctuation",              qMakePair(0x2E00, 0x2E7F));
       
  2791    categoriesRangeMap.insert("IsCJKRadicalsSupplement",                qMakePair(0x2E80, 0x2EFF));
       
  2792    categoriesRangeMap.insert("IsKangxiRadicals",                       qMakePair(0x2F00, 0x2FDF));
       
  2793    categoriesRangeMap.insert("IsIdeographicDescriptionCharacters",     qMakePair(0x2FF0, 0x2FFF));
       
  2794    categoriesRangeMap.insert("IsCJKSymbolsandPunctuation",             qMakePair(0x3000, 0x303F));
       
  2795    categoriesRangeMap.insert("IsHiragana",                             qMakePair(0x3040, 0x309F));
       
  2796    categoriesRangeMap.insert("IsKatakana",                             qMakePair(0x30A0, 0x30FF));
       
  2797    categoriesRangeMap.insert("IsBopomofo",                             qMakePair(0x3100, 0x312F));
       
  2798    categoriesRangeMap.insert("IsHangulCompatibilityJamo",              qMakePair(0x3130, 0x318F));
       
  2799    categoriesRangeMap.insert("IsKanbun",                               qMakePair(0x3190, 0x319F));
       
  2800    categoriesRangeMap.insert("IsBopomofoExtended",                     qMakePair(0x31A0, 0x31BF));
       
  2801    categoriesRangeMap.insert("IsCJKStrokes",                           qMakePair(0x31C0, 0x31EF));
       
  2802    categoriesRangeMap.insert("IsKatakanaPhoneticExtensions",           qMakePair(0x31F0, 0x31FF));
       
  2803    categoriesRangeMap.insert("IsEnclosedCJKLettersandMonths",          qMakePair(0x3200, 0x32FF));
       
  2804    categoriesRangeMap.insert("IsCJKCompatibility",                     qMakePair(0x3300, 0x33FF));
       
  2805    categoriesRangeMap.insert("IsCJKUnifiedIdeographsExtensionA",       qMakePair(0x3400, 0x4DB5));
       
  2806    categoriesRangeMap.insert("IsYijingHexagramSymbols",                qMakePair(0x4DC0, 0x4DFF));
       
  2807    categoriesRangeMap.insert("IsCJKUnifiedIdeographs",                 qMakePair(0x4E00, 0x9FFF));
       
  2808    categoriesRangeMap.insert("IsYiSyllables",                          qMakePair(0xA000, 0xA48F));
       
  2809    categoriesRangeMap.insert("IsYiRadicals",                           qMakePair(0xA490, 0xA4CF));
       
  2810    categoriesRangeMap.insert("IsModifierToneLetters",                  qMakePair(0xA700, 0xA71F));
       
  2811    categoriesRangeMap.insert("IsSylotiNagri",                          qMakePair(0xA800, 0xA82F));
       
  2812    categoriesRangeMap.insert("IsHangulSyllables",                      qMakePair(0xAC00, 0xD7A3));
       
  2813    categoriesRangeMap.insert("IsPrivateUse",                           qMakePair(0xE000, 0xF8FF));
       
  2814    categoriesRangeMap.insert("IsCJKCompatibilityIdeographs",           qMakePair(0xF900, 0xFAFF));
       
  2815    categoriesRangeMap.insert("IsAlphabeticPresentationForms",          qMakePair(0xFB00, 0xFB4F));
       
  2816    categoriesRangeMap.insert("IsArabicPresentationForms-A",            qMakePair(0xFB50, 0xFDFF));
       
  2817    categoriesRangeMap.insert("IsVariationSelectors",                   qMakePair(0xFE00, 0xFE0F));
       
  2818    categoriesRangeMap.insert("IsVerticalForms",                        qMakePair(0xFE10, 0xFE1F));
       
  2819    categoriesRangeMap.insert("IsCombiningHalfMarks",                   qMakePair(0xFE20, 0xFE2F));
       
  2820    categoriesRangeMap.insert("IsCJKCompatibilityForms",                qMakePair(0xFE30, 0xFE4F));
       
  2821    categoriesRangeMap.insert("IsSmallFormVariants",                    qMakePair(0xFE50, 0xFE6F));
       
  2822    categoriesRangeMap.insert("IsArabicPresentationForms-B",            qMakePair(0xFE70, 0xFEFF));
       
  2823    categoriesRangeMap.insert("IsHalfwidthandFullwidthForms",           qMakePair(0xFF00, 0xFFEF));
       
  2824    categoriesRangeMap.insert("IsSpecials",                             qMakePair(0xFFF0, 0xFFFF));
       
  2825    categoriesRangeMap.insert("IsLinearBSyllabary",                     qMakePair(0x10000, 0x1007F));
       
  2826    categoriesRangeMap.insert("IsLinearBIdeograms",                     qMakePair(0x10080, 0x100FF));
       
  2827    categoriesRangeMap.insert("IsAegeanNumbers",                        qMakePair(0x10100, 0x1013F));
       
  2828    categoriesRangeMap.insert("IsAncientGreekNumbers",                  qMakePair(0x10140, 0x1018F));
       
  2829    categoriesRangeMap.insert("IsOldItalic",                            qMakePair(0x10300, 0x1032F));
       
  2830    categoriesRangeMap.insert("IsGothic",                               qMakePair(0x10330, 0x1034F));
       
  2831    categoriesRangeMap.insert("IsUgaritic",                             qMakePair(0x10380, 0x1039F));
       
  2832    categoriesRangeMap.insert("IsOldPersian",                           qMakePair(0x103A0, 0x103DF));
       
  2833    categoriesRangeMap.insert("IsDeseret",                              qMakePair(0x10400, 0x1044F));
       
  2834    categoriesRangeMap.insert("IsShavian",                              qMakePair(0x10450, 0x1047F));
       
  2835    categoriesRangeMap.insert("IsOsmanya",                              qMakePair(0x10480, 0x104AF));
       
  2836    categoriesRangeMap.insert("IsCypriotSyllabary",                     qMakePair(0x10800, 0x1083F));
       
  2837    categoriesRangeMap.insert("IsKharoshthi",                           qMakePair(0x10A00, 0x10A5F));
       
  2838    categoriesRangeMap.insert("IsByzantineMusicalSymbols",              qMakePair(0x1D000, 0x1D0FF));
       
  2839    categoriesRangeMap.insert("IsMusicalSymbols",                       qMakePair(0x1D100, 0x1D1FF));
       
  2840    categoriesRangeMap.insert("IsAncientGreekMusicalNotation",          qMakePair(0x1D200, 0x1D24F));
       
  2841    categoriesRangeMap.insert("IsTaiXuanJingSymbols",                   qMakePair(0x1D300, 0x1D35F));
       
  2842    categoriesRangeMap.insert("IsMathematicalAlphanumericSymbols",      qMakePair(0x1D400, 0x1D7FF));
       
  2843    categoriesRangeMap.insert("IsCJKUnifiedIdeographsExtensionB",       qMakePair(0x20000, 0x2A6DF));
       
  2844    categoriesRangeMap.insert("IsCJKCompatibilityIdeographsSupplement", qMakePair(0x2F800, 0x2FA1F));
       
  2845    categoriesRangeMap.insert("IsTags",                                 qMakePair(0xE0000, 0xE007F));
       
  2846    categoriesRangeMap.insert("IsVariationSelectorsSupplement",         qMakePair(0xE0100, 0xE01EF));
       
  2847    categoriesRangeMap.insert("IsSupplementaryPrivateUseArea-A",        qMakePair(0xF0000, 0xFFFFF));
       
  2848    categoriesRangeMap.insert("IsSupplementaryPrivateUseArea-B",        qMakePair(0x100000, 0x10FFFF));
       
  2849 }
       
  2850 
       
  2851 int QRegExpEngine::getChar()
       
  2852 {
       
  2853     return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();
       
  2854 }
       
  2855 
       
  2856 int QRegExpEngine::getEscape()
       
  2857 {
       
  2858 #ifndef QT_NO_REGEXP_ESCAPE
       
  2859     const char tab[] = "afnrtv"; // no b, as \b means word boundary
       
  2860     const char backTab[] = "\a\f\n\r\t\v";
       
  2861     ushort low;
       
  2862     int i;
       
  2863 #endif
       
  2864     ushort val;
       
  2865     int prevCh = yyCh;
       
  2866 
       
  2867     if (prevCh == EOS) {
       
  2868         error(RXERR_END);
       
  2869         return Tok_Char | '\\';
       
  2870     }
       
  2871     yyCh = getChar();
       
  2872 #ifndef QT_NO_REGEXP_ESCAPE
       
  2873     if ((prevCh & ~0xff) == 0) {
       
  2874         const char *p = strchr(tab, prevCh);
       
  2875         if (p != 0)
       
  2876             return Tok_Char | backTab[p - tab];
       
  2877     }
       
  2878 #endif
       
  2879 
       
  2880     switch (prevCh) {
       
  2881 #ifndef QT_NO_REGEXP_ESCAPE
       
  2882     case '0':
       
  2883         val = 0;
       
  2884         for (i = 0; i < 3; i++) {
       
  2885             if (yyCh >= '0' && yyCh <= '7')
       
  2886                 val = (val << 3) | (yyCh - '0');
       
  2887             else
       
  2888                 break;
       
  2889             yyCh = getChar();
       
  2890         }
       
  2891         if ((val & ~0377) != 0)
       
  2892             error(RXERR_OCTAL);
       
  2893         return Tok_Char | val;
       
  2894 #endif
       
  2895 #ifndef QT_NO_REGEXP_ESCAPE
       
  2896     case 'B':
       
  2897         return Tok_NonWord;
       
  2898 #endif
       
  2899 #ifndef QT_NO_REGEXP_CCLASS
       
  2900     case 'D':
       
  2901         // see QChar::isDigit()
       
  2902         yyCharClass->addCategories(0x7fffffef);
       
  2903         return Tok_CharClass;
       
  2904     case 'S':
       
  2905         // see QChar::isSpace()
       
  2906         yyCharClass->addCategories(0x7ffff87f);
       
  2907         yyCharClass->addRange(0x0000, 0x0008);
       
  2908         yyCharClass->addRange(0x000e, 0x001f);
       
  2909         yyCharClass->addRange(0x007f, 0x009f);
       
  2910         return Tok_CharClass;
       
  2911     case 'W':
       
  2912         // see QChar::isLetterOrNumber() and QChar::isMark()
       
  2913         yyCharClass->addCategories(0x7fe07f81);
       
  2914         yyCharClass->addRange(0x203f, 0x2040);
       
  2915         yyCharClass->addSingleton(0x2040);
       
  2916         yyCharClass->addSingleton(0x2054);
       
  2917         yyCharClass->addSingleton(0x30fb);
       
  2918         yyCharClass->addRange(0xfe33, 0xfe34);
       
  2919         yyCharClass->addRange(0xfe4d, 0xfe4f);
       
  2920         yyCharClass->addSingleton(0xff3f);
       
  2921         yyCharClass->addSingleton(0xff65);
       
  2922         return Tok_CharClass;
       
  2923 #endif
       
  2924 #ifndef QT_NO_REGEXP_ESCAPE
       
  2925     case 'b':
       
  2926         return Tok_Word;
       
  2927 #endif
       
  2928 #ifndef QT_NO_REGEXP_CCLASS
       
  2929     case 'd':
       
  2930         // see QChar::isDigit()
       
  2931         yyCharClass->addCategories(0x00000010);
       
  2932         return Tok_CharClass;
       
  2933     case 's':
       
  2934         // see QChar::isSpace()
       
  2935         yyCharClass->addCategories(0x00000380);
       
  2936         yyCharClass->addRange(0x0009, 0x000d);
       
  2937         return Tok_CharClass;
       
  2938     case 'w':
       
  2939         // see QChar::isLetterOrNumber() and QChar::isMark()
       
  2940         yyCharClass->addCategories(0x000f807e);
       
  2941         yyCharClass->addSingleton(0x005f); // '_'
       
  2942         return Tok_CharClass;
       
  2943     case 'I':
       
  2944         if (xmlSchemaExtensions) {
       
  2945             yyCharClass->setNegative(!yyCharClass->negative());
       
  2946             // fall through
       
  2947         }
       
  2948     case 'i':
       
  2949         if (xmlSchemaExtensions) {
       
  2950             yyCharClass->addCategories(0x000f807e);
       
  2951             yyCharClass->addSingleton(0x003a); // ':'
       
  2952             yyCharClass->addSingleton(0x005f); // '_'
       
  2953             yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
       
  2954             yyCharClass->addRange(0x0061, 0x007a); // [a-z]
       
  2955             yyCharClass->addRange(0xc0, 0xd6);
       
  2956             yyCharClass->addRange(0xd8, 0xf6);
       
  2957             yyCharClass->addRange(0xf8, 0x2ff);
       
  2958             yyCharClass->addRange(0x370, 0x37d);
       
  2959             yyCharClass->addRange(0x37f, 0x1fff);
       
  2960             yyCharClass->addRange(0x200c, 0x200d);
       
  2961             yyCharClass->addRange(0x2070, 0x218f);
       
  2962             yyCharClass->addRange(0x2c00, 0x2fef);
       
  2963             yyCharClass->addRange(0x3001, 0xd7ff);
       
  2964             yyCharClass->addRange(0xf900, 0xfdcf);
       
  2965             yyCharClass->addRange(0xfdf0, 0xfffd);
       
  2966             yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
       
  2967         }
       
  2968         return Tok_CharClass;
       
  2969     case 'C':
       
  2970         if (xmlSchemaExtensions) {
       
  2971             yyCharClass->setNegative(!yyCharClass->negative());
       
  2972             // fall through
       
  2973         }
       
  2974     case 'c':
       
  2975         if (xmlSchemaExtensions) {
       
  2976             yyCharClass->addCategories(0x000f807e);
       
  2977             yyCharClass->addSingleton(0x002d); // '-'
       
  2978             yyCharClass->addSingleton(0x002e); // '.'
       
  2979             yyCharClass->addSingleton(0x003a); // ':'
       
  2980             yyCharClass->addSingleton(0x005f); // '_'
       
  2981             yyCharClass->addSingleton(0xb7);
       
  2982             yyCharClass->addRange(0x0030, 0x0039); // [0-9]
       
  2983             yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
       
  2984             yyCharClass->addRange(0x0061, 0x007a); // [a-z]
       
  2985             yyCharClass->addRange(0xc0, 0xd6);
       
  2986             yyCharClass->addRange(0xd8, 0xf6);
       
  2987             yyCharClass->addRange(0xf8, 0x2ff);
       
  2988             yyCharClass->addRange(0x370, 0x37d);
       
  2989             yyCharClass->addRange(0x37f, 0x1fff);
       
  2990             yyCharClass->addRange(0x200c, 0x200d);
       
  2991             yyCharClass->addRange(0x2070, 0x218f);
       
  2992             yyCharClass->addRange(0x2c00, 0x2fef);
       
  2993             yyCharClass->addRange(0x3001, 0xd7ff);
       
  2994             yyCharClass->addRange(0xf900, 0xfdcf);
       
  2995             yyCharClass->addRange(0xfdf0, 0xfffd);
       
  2996             yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
       
  2997             yyCharClass->addRange(0x0300, 0x036f);
       
  2998             yyCharClass->addRange(0x203f, 0x2040);
       
  2999         }
       
  3000         return Tok_CharClass;
       
  3001     case 'P':
       
  3002         if (xmlSchemaExtensions) {
       
  3003             yyCharClass->setNegative(!yyCharClass->negative());
       
  3004             // fall through
       
  3005         }
       
  3006     case 'p':
       
  3007         if (xmlSchemaExtensions) {
       
  3008             if (yyCh != '{') {
       
  3009                 error(RXERR_CHARCLASS);
       
  3010                 return Tok_CharClass;
       
  3011             }
       
  3012 
       
  3013             QByteArray category;
       
  3014             yyCh = getChar();
       
  3015             while (yyCh != '}') {
       
  3016                 if (yyCh == EOS) {
       
  3017                     error(RXERR_END);
       
  3018                     return Tok_CharClass;
       
  3019                 }
       
  3020                 category.append(yyCh);
       
  3021                 yyCh = getChar();
       
  3022             }
       
  3023             yyCh = getChar(); // skip closing '}'
       
  3024 
       
  3025             if (category == "M") {
       
  3026                 yyCharClass->addCategories(0x0000000e);
       
  3027             } else if (category == "Mn") {
       
  3028                 yyCharClass->addCategories(0x00000002);
       
  3029             } else if (category == "Mc") {
       
  3030                 yyCharClass->addCategories(0x00000004);
       
  3031             } else if (category == "Me") {
       
  3032                 yyCharClass->addCategories(0x00000008);
       
  3033             } else if (category == "N") {
       
  3034                 yyCharClass->addCategories(0x00000070);
       
  3035             } else if (category == "Nd") {
       
  3036                 yyCharClass->addCategories(0x00000010);
       
  3037             } else if (category == "Nl") {
       
  3038                 yyCharClass->addCategories(0x00000020);
       
  3039             } else if (category == "No") {
       
  3040                 yyCharClass->addCategories(0x00000040);
       
  3041             } else if (category == "Z") {
       
  3042                 yyCharClass->addCategories(0x00000380);
       
  3043             } else if (category == "Zs") {
       
  3044                 yyCharClass->addCategories(0x00000080);
       
  3045             } else if (category == "Zl") {
       
  3046                 yyCharClass->addCategories(0x00000100);
       
  3047             } else if (category == "Zp") {
       
  3048                 yyCharClass->addCategories(0x00000200);
       
  3049             } else if (category == "C") {
       
  3050                 yyCharClass->addCategories(0x00006c00);
       
  3051             } else if (category == "Cc") {
       
  3052                 yyCharClass->addCategories(0x00000400);
       
  3053             } else if (category == "Cf") {
       
  3054                 yyCharClass->addCategories(0x00000800);
       
  3055             } else if (category == "Cs") {
       
  3056                 yyCharClass->addCategories(0x00001000);
       
  3057             } else if (category == "Co") {
       
  3058                 yyCharClass->addCategories(0x00002000);
       
  3059             } else if (category == "Cn") {
       
  3060                 yyCharClass->addCategories(0x00004000);
       
  3061             } else if (category == "L") {
       
  3062                 yyCharClass->addCategories(0x000f8000);
       
  3063             } else if (category == "Lu") {
       
  3064                 yyCharClass->addCategories(0x00008000);
       
  3065             } else if (category == "Ll") {
       
  3066                 yyCharClass->addCategories(0x00010000);
       
  3067             } else if (category == "Lt") {
       
  3068                 yyCharClass->addCategories(0x00020000);
       
  3069             } else if (category == "Lm") {
       
  3070                 yyCharClass->addCategories(0x00040000);
       
  3071             } else if (category == "Lo") {
       
  3072                 yyCharClass->addCategories(0x00080000);
       
  3073             } else if (category == "P") {
       
  3074                 yyCharClass->addCategories(0x4f580780);
       
  3075             } else if (category == "Pc") {
       
  3076                 yyCharClass->addCategories(0x00100000);
       
  3077             } else if (category == "Pd") {
       
  3078                 yyCharClass->addCategories(0x00200000);
       
  3079             } else if (category == "Ps") {
       
  3080                 yyCharClass->addCategories(0x00400000);
       
  3081             } else if (category == "Pe") {
       
  3082                 yyCharClass->addCategories(0x00800000);
       
  3083             } else if (category == "Pi") {
       
  3084                 yyCharClass->addCategories(0x01000000);
       
  3085             } else if (category == "Pf") {
       
  3086                 yyCharClass->addCategories(0x02000000);
       
  3087             } else if (category == "Po") {
       
  3088                 yyCharClass->addCategories(0x04000000);
       
  3089             } else if (category == "S") {
       
  3090                 yyCharClass->addCategories(0x78000000);
       
  3091             } else if (category == "Sm") {
       
  3092                 yyCharClass->addCategories(0x08000000);
       
  3093             } else if (category == "Sc") {
       
  3094                 yyCharClass->addCategories(0x10000000);
       
  3095             } else if (category == "Sk") {
       
  3096                 yyCharClass->addCategories(0x20000000);
       
  3097             } else if (category == "So") {
       
  3098                 yyCharClass->addCategories(0x40000000);
       
  3099             } else if (category.startsWith("Is")) {
       
  3100                 if (categoriesRangeMap.isEmpty())
       
  3101                     setupCategoriesRangeMap();
       
  3102 
       
  3103                 if (categoriesRangeMap.contains(category)) {
       
  3104                     const QPair<int, int> range = categoriesRangeMap.value(category);
       
  3105                     yyCharClass->addRange(range.first, range.second);
       
  3106                 } else {
       
  3107                     error(RXERR_CATEGORY);
       
  3108                 }
       
  3109             } else {
       
  3110                 error(RXERR_CATEGORY);
       
  3111             }
       
  3112         }
       
  3113         return Tok_CharClass;
       
  3114 #endif
       
  3115 #ifndef QT_NO_REGEXP_ESCAPE
       
  3116     case 'x':
       
  3117         val = 0;
       
  3118         for (i = 0; i < 4; i++) {
       
  3119             low = QChar(yyCh).toLower().unicode();
       
  3120             if (low >= '0' && low <= '9')
       
  3121                 val = (val << 4) | (low - '0');
       
  3122             else if (low >= 'a' && low <= 'f')
       
  3123                 val = (val << 4) | (low - 'a' + 10);
       
  3124             else
       
  3125                 break;
       
  3126             yyCh = getChar();
       
  3127         }
       
  3128         return Tok_Char | val;
       
  3129 #endif
       
  3130     default:
       
  3131         if (prevCh >= '1' && prevCh <= '9') {
       
  3132 #ifndef QT_NO_REGEXP_BACKREF
       
  3133             val = prevCh - '0';
       
  3134             while (yyCh >= '0' && yyCh <= '9') {
       
  3135                 val = (val * 10) + (yyCh - '0');
       
  3136                 yyCh = getChar();
       
  3137             }
       
  3138             return Tok_BackRef | val;
       
  3139 #else
       
  3140             error(RXERR_DISABLED);
       
  3141 #endif
       
  3142         }
       
  3143         return Tok_Char | prevCh;
       
  3144     }
       
  3145 }
       
  3146 
       
  3147 #ifndef QT_NO_REGEXP_INTERVAL
       
  3148 int QRegExpEngine::getRep(int def)
       
  3149 {
       
  3150     if (yyCh >= '0' && yyCh <= '9') {
       
  3151         int rep = 0;
       
  3152         do {
       
  3153             rep = 10 * rep + yyCh - '0';
       
  3154             if (rep >= InftyRep) {
       
  3155                 error(RXERR_REPETITION);
       
  3156                 rep = def;
       
  3157             }
       
  3158             yyCh = getChar();
       
  3159         } while (yyCh >= '0' && yyCh <= '9');
       
  3160         return rep;
       
  3161     } else {
       
  3162         return def;
       
  3163     }
       
  3164 }
       
  3165 #endif
       
  3166 
       
  3167 #ifndef QT_NO_REGEXP_LOOKAHEAD
       
  3168 void QRegExpEngine::skipChars(int n)
       
  3169 {
       
  3170     if (n > 0) {
       
  3171         yyPos += n - 1;
       
  3172         yyCh = getChar();
       
  3173     }
       
  3174 }
       
  3175 #endif
       
  3176 
       
  3177 void QRegExpEngine::error(const char *msg)
       
  3178 {
       
  3179     if (yyError.isEmpty())
       
  3180         yyError = QLatin1String(msg);
       
  3181 }
       
  3182 
       
  3183 void QRegExpEngine::startTokenizer(const QChar *rx, int len)
       
  3184 {
       
  3185     yyIn = rx;
       
  3186     yyPos0 = 0;
       
  3187     yyPos = 0;
       
  3188     yyLen = len;
       
  3189     yyCh = getChar();
       
  3190     yyCharClass.reset(new QRegExpCharClass);
       
  3191     yyMinRep = 0;
       
  3192     yyMaxRep = 0;
       
  3193     yyError = QString();
       
  3194 }
       
  3195 
       
  3196 int QRegExpEngine::getToken()
       
  3197 {
       
  3198 #ifndef QT_NO_REGEXP_CCLASS
       
  3199     ushort pendingCh = 0;
       
  3200     bool charPending;
       
  3201     bool rangePending;
       
  3202     int tok;
       
  3203 #endif
       
  3204     int prevCh = yyCh;
       
  3205 
       
  3206     yyPos0 = yyPos - 1;
       
  3207 #ifndef QT_NO_REGEXP_CCLASS
       
  3208     yyCharClass->clear();
       
  3209 #endif
       
  3210     yyMinRep = 0;
       
  3211     yyMaxRep = 0;
       
  3212     yyCh = getChar();
       
  3213 
       
  3214     switch (prevCh) {
       
  3215     case EOS:
       
  3216         yyPos0 = yyPos;
       
  3217         return Tok_Eos;
       
  3218     case '$':
       
  3219         return Tok_Dollar;
       
  3220     case '(':
       
  3221         if (yyCh == '?') {
       
  3222             prevCh = getChar();
       
  3223             yyCh = getChar();
       
  3224             switch (prevCh) {
       
  3225 #ifndef QT_NO_REGEXP_LOOKAHEAD
       
  3226             case '!':
       
  3227                 return Tok_NegLookahead;
       
  3228             case '=':
       
  3229                 return Tok_PosLookahead;
       
  3230 #endif
       
  3231             case ':':
       
  3232                 return Tok_MagicLeftParen;
       
  3233             default:
       
  3234                 error(RXERR_LOOKAHEAD);
       
  3235                 return Tok_MagicLeftParen;
       
  3236             }
       
  3237         } else {
       
  3238             return Tok_LeftParen;
       
  3239         }
       
  3240     case ')':
       
  3241         return Tok_RightParen;
       
  3242     case '*':
       
  3243         yyMinRep = 0;
       
  3244         yyMaxRep = InftyRep;
       
  3245         return Tok_Quantifier;
       
  3246     case '+':
       
  3247         yyMinRep = 1;
       
  3248         yyMaxRep = InftyRep;
       
  3249         return Tok_Quantifier;
       
  3250     case '.':
       
  3251 #ifndef QT_NO_REGEXP_CCLASS
       
  3252         yyCharClass->setNegative(true);
       
  3253 #endif
       
  3254         return Tok_CharClass;
       
  3255     case '?':
       
  3256         yyMinRep = 0;
       
  3257         yyMaxRep = 1;
       
  3258         return Tok_Quantifier;
       
  3259     case '[':
       
  3260 #ifndef QT_NO_REGEXP_CCLASS
       
  3261         if (yyCh == '^') {
       
  3262             yyCharClass->setNegative(true);
       
  3263             yyCh = getChar();
       
  3264         }
       
  3265         charPending = false;
       
  3266         rangePending = false;
       
  3267         do {
       
  3268             if (yyCh == '-' && charPending && !rangePending) {
       
  3269                 rangePending = true;
       
  3270                 yyCh = getChar();
       
  3271             } else {
       
  3272                 if (charPending && !rangePending) {
       
  3273                     yyCharClass->addSingleton(pendingCh);
       
  3274                     charPending = false;
       
  3275                 }
       
  3276                 if (yyCh == '\\') {
       
  3277                     yyCh = getChar();
       
  3278                     tok = getEscape();
       
  3279                     if (tok == Tok_Word)
       
  3280                         tok = '\b';
       
  3281                 } else {
       
  3282                     tok = Tok_Char | yyCh;
       
  3283                     yyCh = getChar();
       
  3284                 }
       
  3285                 if (tok == Tok_CharClass) {
       
  3286                     if (rangePending) {
       
  3287                         yyCharClass->addSingleton('-');
       
  3288                         yyCharClass->addSingleton(pendingCh);
       
  3289                         charPending = false;
       
  3290                         rangePending = false;
       
  3291                     }
       
  3292                 } else if ((tok & Tok_Char) != 0) {
       
  3293                     if (rangePending) {
       
  3294                         yyCharClass->addRange(pendingCh, tok ^ Tok_Char);
       
  3295                         charPending = false;
       
  3296                         rangePending = false;
       
  3297                     } else {
       
  3298                         pendingCh = tok ^ Tok_Char;
       
  3299                         charPending = true;
       
  3300                     }
       
  3301                 } else {
       
  3302                     error(RXERR_CHARCLASS);
       
  3303                 }
       
  3304             }
       
  3305         }  while (yyCh != ']' && yyCh != EOS);
       
  3306         if (rangePending)
       
  3307             yyCharClass->addSingleton('-');
       
  3308         if (charPending)
       
  3309             yyCharClass->addSingleton(pendingCh);
       
  3310         if (yyCh == EOS)
       
  3311             error(RXERR_END);
       
  3312         else
       
  3313             yyCh = getChar();
       
  3314         return Tok_CharClass;
       
  3315 #else
       
  3316         error(RXERR_END);
       
  3317         return Tok_Char | '[';
       
  3318 #endif
       
  3319     case '\\':
       
  3320         return getEscape();
       
  3321     case ']':
       
  3322         error(RXERR_LEFTDELIM);
       
  3323         return Tok_Char | ']';
       
  3324     case '^':
       
  3325         return Tok_Caret;
       
  3326     case '{':
       
  3327 #ifndef QT_NO_REGEXP_INTERVAL
       
  3328         yyMinRep = getRep(0);
       
  3329         yyMaxRep = yyMinRep;
       
  3330         if (yyCh == ',') {
       
  3331             yyCh = getChar();
       
  3332             yyMaxRep = getRep(InftyRep);
       
  3333         }
       
  3334         if (yyMaxRep < yyMinRep)
       
  3335             error(RXERR_INTERVAL);
       
  3336         if (yyCh != '}')
       
  3337             error(RXERR_REPETITION);
       
  3338         yyCh = getChar();
       
  3339         return Tok_Quantifier;
       
  3340 #else
       
  3341         error(RXERR_DISABLED);
       
  3342         return Tok_Char | '{';
       
  3343 #endif
       
  3344     case '|':
       
  3345         return Tok_Bar;
       
  3346     case '}':
       
  3347         error(RXERR_LEFTDELIM);
       
  3348         return Tok_Char | '}';
       
  3349     default:
       
  3350         return Tok_Char | prevCh;
       
  3351     }
       
  3352 }
       
  3353 
       
  3354 int QRegExpEngine::parse(const QChar *pattern, int len)
       
  3355 {
       
  3356     valid = true;
       
  3357     startTokenizer(pattern, len);
       
  3358     yyTok = getToken();
       
  3359 #ifndef QT_NO_REGEXP_CAPTURE
       
  3360     yyMayCapture = true;
       
  3361 #else
       
  3362     yyMayCapture = false;
       
  3363 #endif
       
  3364 
       
  3365 #ifndef QT_NO_REGEXP_CAPTURE
       
  3366     int atom = startAtom(false);
       
  3367 #endif
       
  3368     QRegExpCharClass anything;
       
  3369     Box box(this); // create InitialState
       
  3370     box.set(anything);
       
  3371     Box rightBox(this); // create FinalState
       
  3372     rightBox.set(anything);
       
  3373 
       
  3374     Box middleBox(this);
       
  3375     parseExpression(&middleBox);
       
  3376 #ifndef QT_NO_REGEXP_CAPTURE
       
  3377     finishAtom(atom, false);
       
  3378 #endif
       
  3379 #ifndef QT_NO_REGEXP_OPTIM
       
  3380     middleBox.setupHeuristics();
       
  3381 #endif
       
  3382     box.cat(middleBox);
       
  3383     box.cat(rightBox);
       
  3384     yyCharClass.reset(0);
       
  3385 
       
  3386 #ifndef QT_NO_REGEXP_CAPTURE
       
  3387     for (int i = 0; i < nf; ++i) {
       
  3388         switch (f[i].capture) {
       
  3389         case QRegExpAtom::NoCapture:
       
  3390             break;
       
  3391         case QRegExpAtom::OfficialCapture:
       
  3392             f[i].capture = ncap;
       
  3393             captureForOfficialCapture.append(ncap);
       
  3394             ++ncap;
       
  3395             ++officialncap;
       
  3396             break;
       
  3397         case QRegExpAtom::UnofficialCapture:
       
  3398             f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;
       
  3399         }
       
  3400     }
       
  3401 
       
  3402 #ifndef QT_NO_REGEXP_BACKREF
       
  3403 #ifndef QT_NO_REGEXP_OPTIM
       
  3404     if (officialncap == 0 && nbrefs == 0) {
       
  3405         ncap = nf = 0;
       
  3406         f.clear();
       
  3407     }
       
  3408 #endif
       
  3409     // handle the case where there's a \5 with no corresponding capture
       
  3410     // (captureForOfficialCapture.size() != officialncap)
       
  3411     for (int i = 0; i < nbrefs - officialncap; ++i) {
       
  3412         captureForOfficialCapture.append(ncap);
       
  3413         ++ncap;
       
  3414     }
       
  3415 #endif
       
  3416 #endif
       
  3417 
       
  3418     if (!yyError.isEmpty())
       
  3419         return -1;
       
  3420 
       
  3421 #ifndef QT_NO_REGEXP_OPTIM
       
  3422     const QRegExpAutomatonState &sinit = s.at(InitialState);
       
  3423     caretAnchored = !sinit.anchors.isEmpty();
       
  3424     if (caretAnchored) {
       
  3425         const QMap<int, int> &anchors = sinit.anchors;
       
  3426         QMap<int, int>::const_iterator a;
       
  3427         for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {
       
  3428             if (
       
  3429 #ifndef QT_NO_REGEXP_ANCHOR_ALT
       
  3430                 (*a & Anchor_Alternation) != 0 ||
       
  3431 #endif
       
  3432                 (*a & Anchor_Caret) == 0)
       
  3433             {
       
  3434                 caretAnchored = false;
       
  3435                 break;
       
  3436             }
       
  3437         }
       
  3438     }
       
  3439 #endif
       
  3440 
       
  3441     // cleanup anchors
       
  3442     int numStates = s.count();
       
  3443     for (int i = 0; i < numStates; ++i) {
       
  3444         QRegExpAutomatonState &state = s[i];
       
  3445         if (!state.anchors.isEmpty()) {
       
  3446             QMap<int, int>::iterator a = state.anchors.begin();
       
  3447             while (a != state.anchors.end()) {
       
  3448                 if (a.value() == 0)
       
  3449                     a = state.anchors.erase(a);
       
  3450                 else
       
  3451                     ++a;
       
  3452             }
       
  3453         }
       
  3454     }
       
  3455 
       
  3456     return yyPos0;
       
  3457 }
       
  3458 
       
  3459 void QRegExpEngine::parseAtom(Box *box)
       
  3460 {
       
  3461 #ifndef QT_NO_REGEXP_LOOKAHEAD
       
  3462     QRegExpEngine *eng = 0;
       
  3463     bool neg;
       
  3464     int len;
       
  3465 #endif
       
  3466 
       
  3467     if ((yyTok & Tok_Char) != 0) {
       
  3468         box->set(QChar(yyTok ^ Tok_Char));
       
  3469     } else {
       
  3470 #ifndef QT_NO_REGEXP_OPTIM
       
  3471         trivial = false;
       
  3472 #endif
       
  3473         switch (yyTok) {
       
  3474         case Tok_Dollar:
       
  3475             box->catAnchor(Anchor_Dollar);
       
  3476             break;
       
  3477         case Tok_Caret:
       
  3478             box->catAnchor(Anchor_Caret);
       
  3479             break;
       
  3480 #ifndef QT_NO_REGEXP_LOOKAHEAD
       
  3481         case Tok_PosLookahead:
       
  3482         case Tok_NegLookahead:
       
  3483             neg = (yyTok == Tok_NegLookahead);
       
  3484             eng = new QRegExpEngine(cs, greedyQuantifiers);
       
  3485             len = eng->parse(yyIn + yyPos - 1, yyLen - yyPos + 1);
       
  3486             if (len >= 0)
       
  3487                 skipChars(len);
       
  3488             else
       
  3489                 error(RXERR_LOOKAHEAD);
       
  3490             box->catAnchor(addLookahead(eng, neg));
       
  3491             yyTok = getToken();
       
  3492             if (yyTok != Tok_RightParen)
       
  3493                 error(RXERR_LOOKAHEAD);
       
  3494             break;
       
  3495 #endif
       
  3496 #ifndef QT_NO_REGEXP_ESCAPE
       
  3497         case Tok_Word:
       
  3498             box->catAnchor(Anchor_Word);
       
  3499             break;
       
  3500         case Tok_NonWord:
       
  3501             box->catAnchor(Anchor_NonWord);
       
  3502             break;
       
  3503 #endif
       
  3504         case Tok_LeftParen:
       
  3505         case Tok_MagicLeftParen:
       
  3506             yyTok = getToken();
       
  3507             parseExpression(box);
       
  3508             if (yyTok != Tok_RightParen)
       
  3509                 error(RXERR_END);
       
  3510             break;
       
  3511         case Tok_CharClass:
       
  3512             box->set(*yyCharClass);
       
  3513             break;
       
  3514         case Tok_Quantifier:
       
  3515             error(RXERR_REPETITION);
       
  3516             break;
       
  3517         default:
       
  3518 #ifndef QT_NO_REGEXP_BACKREF
       
  3519             if ((yyTok & Tok_BackRef) != 0)
       
  3520                 box->set(yyTok ^ Tok_BackRef);
       
  3521             else
       
  3522 #endif
       
  3523                 error(RXERR_DISABLED);
       
  3524         }
       
  3525     }
       
  3526     yyTok = getToken();
       
  3527 }
       
  3528 
       
  3529 void QRegExpEngine::parseFactor(Box *box)
       
  3530 {
       
  3531 #ifndef QT_NO_REGEXP_CAPTURE
       
  3532     int outerAtom = greedyQuantifiers ? startAtom(false) : -1;
       
  3533     int innerAtom = startAtom(yyMayCapture && yyTok == Tok_LeftParen);
       
  3534     bool magicLeftParen = (yyTok == Tok_MagicLeftParen);
       
  3535 #else
       
  3536     const int innerAtom = -1;
       
  3537 #endif
       
  3538 
       
  3539 #ifndef QT_NO_REGEXP_INTERVAL
       
  3540 #define YYREDO() \
       
  3541         yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
       
  3542         *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
       
  3543 
       
  3544     const QChar *in = yyIn;
       
  3545     int pos0 = yyPos0;
       
  3546     int pos = yyPos;
       
  3547     int len = yyLen;
       
  3548     int ch = yyCh;
       
  3549     QRegExpCharClass charClass;
       
  3550     if (yyTok == Tok_CharClass)
       
  3551         charClass = *yyCharClass;
       
  3552     int tok = yyTok;
       
  3553     bool mayCapture = yyMayCapture;
       
  3554 #endif
       
  3555 
       
  3556     parseAtom(box);
       
  3557 #ifndef QT_NO_REGEXP_CAPTURE
       
  3558     finishAtom(innerAtom, magicLeftParen);
       
  3559 #endif
       
  3560 
       
  3561     bool hasQuantifier = (yyTok == Tok_Quantifier);
       
  3562     if (hasQuantifier) {
       
  3563 #ifndef QT_NO_REGEXP_OPTIM
       
  3564         trivial = false;
       
  3565 #endif
       
  3566         if (yyMaxRep == InftyRep) {
       
  3567             box->plus(innerAtom);
       
  3568 #ifndef QT_NO_REGEXP_INTERVAL
       
  3569         } else if (yyMaxRep == 0) {
       
  3570             box->clear();
       
  3571 #endif
       
  3572         }
       
  3573         if (yyMinRep == 0)
       
  3574             box->opt();
       
  3575 
       
  3576 #ifndef QT_NO_REGEXP_INTERVAL
       
  3577         yyMayCapture = false;
       
  3578         int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1;
       
  3579         int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1);
       
  3580 
       
  3581         Box rightBox(this);
       
  3582         int i;
       
  3583 
       
  3584         for (i = 0; i < beta; i++) {
       
  3585             YYREDO();
       
  3586             Box leftBox(this);
       
  3587             parseAtom(&leftBox);
       
  3588             leftBox.cat(rightBox);
       
  3589             leftBox.opt();
       
  3590             rightBox = leftBox;
       
  3591         }
       
  3592         for (i = 0; i < alpha; i++) {
       
  3593             YYREDO();
       
  3594             Box leftBox(this);
       
  3595             parseAtom(&leftBox);
       
  3596             leftBox.cat(rightBox);
       
  3597             rightBox = leftBox;
       
  3598         }
       
  3599         rightBox.cat(*box);
       
  3600         *box = rightBox;
       
  3601 #endif
       
  3602         yyTok = getToken();
       
  3603 #ifndef QT_NO_REGEXP_INTERVAL
       
  3604         yyMayCapture = mayCapture;
       
  3605 #endif
       
  3606     }
       
  3607 #undef YYREDO
       
  3608 #ifndef QT_NO_REGEXP_CAPTURE
       
  3609     if (greedyQuantifiers)
       
  3610         finishAtom(outerAtom, hasQuantifier);
       
  3611 #endif
       
  3612 }
       
  3613 
       
  3614 void QRegExpEngine::parseTerm(Box *box)
       
  3615 {
       
  3616 #ifndef QT_NO_REGEXP_OPTIM
       
  3617     if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar)
       
  3618         parseFactor(box);
       
  3619 #endif
       
  3620     while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) {
       
  3621         Box rightBox(this);
       
  3622         parseFactor(&rightBox);
       
  3623         box->cat(rightBox);
       
  3624     }
       
  3625 }
       
  3626 
       
  3627 void QRegExpEngine::parseExpression(Box *box)
       
  3628 {
       
  3629     parseTerm(box);
       
  3630     while (yyTok == Tok_Bar) {
       
  3631 #ifndef QT_NO_REGEXP_OPTIM
       
  3632         trivial = false;
       
  3633 #endif
       
  3634         Box rightBox(this);
       
  3635         yyTok = getToken();
       
  3636         parseTerm(&rightBox);
       
  3637         box->orx(rightBox);
       
  3638     }
       
  3639 }
       
  3640 
       
  3641 /*
       
  3642   The struct QRegExpPrivate contains the private data of a regular
       
  3643   expression other than the automaton. It makes it possible for many
       
  3644   QRegExp objects to use the same QRegExpEngine object with different
       
  3645   QRegExpPrivate objects.
       
  3646 */
       
  3647 struct QRegExpPrivate
       
  3648 {
       
  3649     QRegExpEngine *eng;
       
  3650     QRegExpEngineKey engineKey;
       
  3651     bool minimal;
       
  3652 #ifndef QT_NO_REGEXP_CAPTURE
       
  3653     QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()
       
  3654     QStringList capturedCache; // what QRegExp::capturedTexts() returned last
       
  3655 #endif
       
  3656     QRegExpMatchState matchState;
       
  3657 
       
  3658     inline QRegExpPrivate()
       
  3659         : eng(0), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }
       
  3660     inline QRegExpPrivate(const QRegExpEngineKey &key)
       
  3661         : eng(0), engineKey(key), minimal(false) {}
       
  3662 };
       
  3663 
       
  3664 #if !defined(QT_NO_REGEXP_OPTIM)
       
  3665 uint qHash(const QRegExpEngineKey &key)
       
  3666 {
       
  3667     return qHash(key.pattern);
       
  3668 }
       
  3669 
       
  3670 typedef QCache<QRegExpEngineKey, QRegExpEngine> EngineCache;
       
  3671 Q_GLOBAL_STATIC(EngineCache, globalEngineCache)
       
  3672 Q_GLOBAL_STATIC(QMutex, mutex)
       
  3673 #endif // QT_NO_REGEXP_OPTIM
       
  3674 
       
  3675 static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key)
       
  3676 {
       
  3677     if (!eng->ref.deref()) {
       
  3678 #if !defined(QT_NO_REGEXP_OPTIM)
       
  3679         if (globalEngineCache()) {
       
  3680             QMutexLocker locker(mutex());
       
  3681             QT_TRY {
       
  3682                 globalEngineCache()->insert(key, eng, 4 + key.pattern.length() / 4);
       
  3683             } QT_CATCH(const std::bad_alloc &) {
       
  3684                 // in case of an exception (e.g. oom), just delete the engine
       
  3685                 delete eng;
       
  3686             }
       
  3687         } else {
       
  3688             delete eng;
       
  3689         }
       
  3690 #else
       
  3691         Q_UNUSED(key);
       
  3692         delete eng;
       
  3693 #endif
       
  3694     }
       
  3695 }
       
  3696 
       
  3697 static void prepareEngine_helper(QRegExpPrivate *priv)
       
  3698 {
       
  3699     bool initMatchState = !priv->eng;
       
  3700 #if !defined(QT_NO_REGEXP_OPTIM)
       
  3701     if (!priv->eng && globalEngineCache()) {
       
  3702         QMutexLocker locker(mutex());
       
  3703         priv->eng = globalEngineCache()->take(priv->engineKey);
       
  3704         if (priv->eng != 0)
       
  3705             priv->eng->ref.ref();
       
  3706     }
       
  3707 #endif // QT_NO_REGEXP_OPTIM
       
  3708 
       
  3709     if (!priv->eng)
       
  3710         priv->eng = new QRegExpEngine(priv->engineKey);
       
  3711 
       
  3712     if (initMatchState)
       
  3713         priv->matchState.prepareForMatch(priv->eng);
       
  3714 }
       
  3715 
       
  3716 inline static void prepareEngine(QRegExpPrivate *priv)
       
  3717 {
       
  3718     if (priv->eng)
       
  3719         return;
       
  3720     prepareEngine_helper(priv);
       
  3721 }
       
  3722 
       
  3723 static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str)
       
  3724 {
       
  3725     prepareEngine(priv);
       
  3726     priv->matchState.prepareForMatch(priv->eng);
       
  3727 #ifndef QT_NO_REGEXP_CAPTURE
       
  3728     priv->t = str;
       
  3729     priv->capturedCache.clear();
       
  3730 #else
       
  3731     Q_UNUSED(str);
       
  3732 #endif
       
  3733 }
       
  3734 
       
  3735 static void invalidateEngine(QRegExpPrivate *priv)
       
  3736 {
       
  3737     if (priv->eng != 0) {
       
  3738         derefEngine(priv->eng, priv->engineKey);
       
  3739         priv->eng = 0;
       
  3740         priv->matchState.drain();
       
  3741     }
       
  3742 }
       
  3743 
       
  3744 /*!
       
  3745     \enum QRegExp::CaretMode
       
  3746 
       
  3747     The CaretMode enum defines the different meanings of the caret
       
  3748     (\bold{^}) in a regular expression. The possible values are:
       
  3749 
       
  3750     \value CaretAtZero
       
  3751            The caret corresponds to index 0 in the searched string.
       
  3752 
       
  3753     \value CaretAtOffset
       
  3754            The caret corresponds to the start offset of the search.
       
  3755 
       
  3756     \value CaretWontMatch
       
  3757            The caret never matches.
       
  3758 */
       
  3759 
       
  3760 /*!
       
  3761     \enum QRegExp::PatternSyntax
       
  3762 
       
  3763     The syntax used to interpret the meaning of the pattern.
       
  3764 
       
  3765     \value RegExp A rich Perl-like pattern matching syntax. This is
       
  3766     the default.
       
  3767 
       
  3768     \value RegExp2 Like RegExp, but with \l{greedy quantifiers}. This
       
  3769     will be the default in Qt 5. (Introduced in Qt 4.2.)
       
  3770 
       
  3771     \value Wildcard This provides a simple pattern matching syntax
       
  3772     similar to that used by shells (command interpreters) for "file
       
  3773     globbing". See \l{Wildcard Matching}.
       
  3774 
       
  3775     \value WildcardUnix This is similar to Wildcard but with the
       
  3776     behavior of a Unix shell. The wildcard characters can be escaped
       
  3777     with the character "\".
       
  3778 
       
  3779     \value FixedString The pattern is a fixed string. This is
       
  3780     equivalent to using the RegExp pattern on a string in
       
  3781     which all metacharacters are escaped using escape().
       
  3782 
       
  3783     \value W3CXmlSchema11 The pattern is a regular expression as
       
  3784     defined by the W3C XML Schema 1.1 specification.
       
  3785 
       
  3786     \sa setPatternSyntax()
       
  3787 */
       
  3788 
       
  3789 /*!
       
  3790     Constructs an empty regexp.
       
  3791 
       
  3792     \sa isValid(), errorString()
       
  3793 */
       
  3794 QRegExp::QRegExp()
       
  3795 {
       
  3796     priv = new QRegExpPrivate;
       
  3797 }
       
  3798 
       
  3799 /*!
       
  3800     Constructs a regular expression object for the given \a pattern
       
  3801     string. The pattern must be given using wildcard notation if \a
       
  3802     syntax is \l Wildcard; the default is \l RegExp. The pattern is
       
  3803     case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is
       
  3804     greedy (maximal), but can be changed by calling
       
  3805     setMinimal().
       
  3806 
       
  3807     \sa setPattern(), setCaseSensitivity(), setPatternSyntax()
       
  3808 */
       
  3809 QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)
       
  3810 {
       
  3811     priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs));
       
  3812 }
       
  3813 
       
  3814 /*!
       
  3815     Constructs a regular expression as a copy of \a rx.
       
  3816 
       
  3817     \sa operator=()
       
  3818 */
       
  3819 QRegExp::QRegExp(const QRegExp &rx)
       
  3820 {
       
  3821     priv = new QRegExpPrivate;
       
  3822     operator=(rx);
       
  3823 }
       
  3824 
       
  3825 /*!
       
  3826     Destroys the regular expression and cleans up its internal data.
       
  3827 */
       
  3828 QRegExp::~QRegExp()
       
  3829 {
       
  3830     invalidateEngine(priv);
       
  3831     delete priv;
       
  3832 }
       
  3833 
       
  3834 /*!
       
  3835     Copies the regular expression \a rx and returns a reference to the
       
  3836     copy. The case sensitivity, wildcard, and minimal matching options
       
  3837     are also copied.
       
  3838 */
       
  3839 QRegExp &QRegExp::operator=(const QRegExp &rx)
       
  3840 {
       
  3841     prepareEngine(rx.priv); // to allow sharing
       
  3842     QRegExpEngine *otherEng = rx.priv->eng;
       
  3843     if (otherEng)
       
  3844         otherEng->ref.ref();
       
  3845     invalidateEngine(priv);
       
  3846     priv->eng = otherEng;
       
  3847     priv->engineKey = rx.priv->engineKey;
       
  3848     priv->minimal = rx.priv->minimal;
       
  3849 #ifndef QT_NO_REGEXP_CAPTURE
       
  3850     priv->t = rx.priv->t;
       
  3851     priv->capturedCache = rx.priv->capturedCache;
       
  3852 #endif
       
  3853     if (priv->eng)
       
  3854         priv->matchState.prepareForMatch(priv->eng);
       
  3855     priv->matchState.captured = rx.priv->matchState.captured;
       
  3856     return *this;
       
  3857 }
       
  3858 
       
  3859 /*!
       
  3860     Returns true if this regular expression is equal to \a rx;
       
  3861     otherwise returns false.
       
  3862 
       
  3863     Two QRegExp objects are equal if they have the same pattern
       
  3864     strings and the same settings for case sensitivity, wildcard and
       
  3865     minimal matching.
       
  3866 */
       
  3867 bool QRegExp::operator==(const QRegExp &rx) const
       
  3868 {
       
  3869     return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;
       
  3870 }
       
  3871 
       
  3872 /*!
       
  3873     \fn bool QRegExp::operator!=(const QRegExp &rx) const
       
  3874 
       
  3875     Returns true if this regular expression is not equal to \a rx;
       
  3876     otherwise returns false.
       
  3877 
       
  3878     \sa operator==()
       
  3879 */
       
  3880 
       
  3881 /*!
       
  3882     Returns true if the pattern string is empty; otherwise returns
       
  3883     false.
       
  3884 
       
  3885     If you call exactMatch() with an empty pattern on an empty string
       
  3886     it will return true; otherwise it returns false since it operates
       
  3887     over the whole string. If you call indexIn() with an empty pattern
       
  3888     on \e any string it will return the start offset (0 by default)
       
  3889     because the empty pattern matches the 'emptiness' at the start of
       
  3890     the string. In this case the length of the match returned by
       
  3891     matchedLength() will be 0.
       
  3892 
       
  3893     See QString::isEmpty().
       
  3894 */
       
  3895 
       
  3896 bool QRegExp::isEmpty() const
       
  3897 {
       
  3898     return priv->engineKey.pattern.isEmpty();
       
  3899 }
       
  3900 
       
  3901 /*!
       
  3902     Returns true if the regular expression is valid; otherwise returns
       
  3903     false. An invalid regular expression never matches.
       
  3904 
       
  3905     The pattern \bold{[a-z} is an example of an invalid pattern, since
       
  3906     it lacks a closing square bracket.
       
  3907 
       
  3908     Note that the validity of a regexp may also depend on the setting
       
  3909     of the wildcard flag, for example \bold{*.html} is a valid
       
  3910     wildcard regexp but an invalid full regexp.
       
  3911 
       
  3912     \sa errorString()
       
  3913 */
       
  3914 bool QRegExp::isValid() const
       
  3915 {
       
  3916     if (priv->engineKey.pattern.isEmpty()) {
       
  3917         return true;
       
  3918     } else {
       
  3919         prepareEngine(priv);
       
  3920         return priv->eng->isValid();
       
  3921     }
       
  3922 }
       
  3923 
       
  3924 /*!
       
  3925     Returns the pattern string of the regular expression. The pattern
       
  3926     has either regular expression syntax or wildcard syntax, depending
       
  3927     on patternSyntax().
       
  3928 
       
  3929     \sa patternSyntax(), caseSensitivity()
       
  3930 */
       
  3931 QString QRegExp::pattern() const
       
  3932 {
       
  3933     return priv->engineKey.pattern;
       
  3934 }
       
  3935 
       
  3936 /*!
       
  3937     Sets the pattern string to \a pattern. The case sensitivity,
       
  3938     wildcard, and minimal matching options are not changed.
       
  3939 
       
  3940     \sa setPatternSyntax(), setCaseSensitivity()
       
  3941 */
       
  3942 void QRegExp::setPattern(const QString &pattern)
       
  3943 {
       
  3944     if (priv->engineKey.pattern != pattern) {
       
  3945         invalidateEngine(priv);
       
  3946         priv->engineKey.pattern = pattern;
       
  3947     }
       
  3948 }
       
  3949 
       
  3950 /*!
       
  3951     Returns Qt::CaseSensitive if the regexp is matched case
       
  3952     sensitively; otherwise returns Qt::CaseInsensitive.
       
  3953 
       
  3954     \sa patternSyntax(), pattern(), isMinimal()
       
  3955 */
       
  3956 Qt::CaseSensitivity QRegExp::caseSensitivity() const
       
  3957 {
       
  3958     return priv->engineKey.cs;
       
  3959 }
       
  3960 
       
  3961 /*!
       
  3962     Sets case sensitive matching to \a cs.
       
  3963 
       
  3964     If \a cs is Qt::CaseSensitive, \bold{\\.txt$} matches
       
  3965     \c{readme.txt} but not \c{README.TXT}.
       
  3966 
       
  3967     \sa setPatternSyntax(), setPattern(), setMinimal()
       
  3968 */
       
  3969 void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)
       
  3970 {
       
  3971     if ((bool)cs != (bool)priv->engineKey.cs) {
       
  3972         invalidateEngine(priv);
       
  3973         priv->engineKey.cs = cs;
       
  3974     }
       
  3975 }
       
  3976 
       
  3977 /*!
       
  3978     Returns the syntax used by the regular expression. The default is
       
  3979     QRegExp::RegExp.
       
  3980 
       
  3981     \sa pattern(), caseSensitivity()
       
  3982 */
       
  3983 QRegExp::PatternSyntax QRegExp::patternSyntax() const
       
  3984 {
       
  3985     return priv->engineKey.patternSyntax;
       
  3986 }
       
  3987 
       
  3988 /*!
       
  3989     Sets the syntax mode for the regular expression. The default is
       
  3990     QRegExp::RegExp.
       
  3991 
       
  3992     Setting \a syntax to QRegExp::Wildcard enables simple shell-like
       
  3993     \l{wildcard matching}. For example, \bold{r*.txt} matches the
       
  3994     string \c{readme.txt} in wildcard mode, but does not match
       
  3995     \c{readme}.
       
  3996 
       
  3997     Setting \a syntax to QRegExp::FixedString means that the pattern
       
  3998     is interpreted as a plain string. Special characters (e.g.,
       
  3999     backslash) don't need to be escaped then.
       
  4000 
       
  4001     \sa setPattern(), setCaseSensitivity(), escape()
       
  4002 */
       
  4003 void QRegExp::setPatternSyntax(PatternSyntax syntax)
       
  4004 {
       
  4005     if (syntax != priv->engineKey.patternSyntax) {
       
  4006         invalidateEngine(priv);
       
  4007         priv->engineKey.patternSyntax = syntax;
       
  4008     }
       
  4009 }
       
  4010 
       
  4011 /*!
       
  4012     Returns true if minimal (non-greedy) matching is enabled;
       
  4013     otherwise returns false.
       
  4014 
       
  4015     \sa caseSensitivity(), setMinimal()
       
  4016 */
       
  4017 bool QRegExp::isMinimal() const
       
  4018 {
       
  4019     return priv->minimal;
       
  4020 }
       
  4021 
       
  4022 /*!
       
  4023     Enables or disables minimal matching. If \a minimal is false,
       
  4024     matching is greedy (maximal) which is the default.
       
  4025 
       
  4026     For example, suppose we have the input string "We must be
       
  4027     <b>bold</b>, very <b>bold</b>!" and the pattern
       
  4028     \bold{<b>.*</b>}. With the default greedy (maximal) matching,
       
  4029     the match is "We must be \underline{<b>bold</b>, very
       
  4030     <b>bold</b>}!". But with minimal (non-greedy) matching, the
       
  4031     first match is: "We must be \underline{<b>bold</b>}, very
       
  4032     <b>bold</b>!" and the second match is "We must be <b>bold</b>,
       
  4033     very \underline{<b>bold</b>}!". In practice we might use the pattern
       
  4034     \bold{<b>[^<]*\</b>} instead, although this will still fail for
       
  4035     nested tags.
       
  4036 
       
  4037     \sa setCaseSensitivity()
       
  4038 */
       
  4039 void QRegExp::setMinimal(bool minimal)
       
  4040 {
       
  4041     priv->minimal = minimal;
       
  4042 }
       
  4043 
       
  4044 // ### Qt 5: make non-const
       
  4045 /*!
       
  4046     Returns true if \a str is matched exactly by this regular
       
  4047     expression; otherwise returns false. You can determine how much of
       
  4048     the string was matched by calling matchedLength().
       
  4049 
       
  4050     For a given regexp string R, exactMatch("R") is the equivalent of
       
  4051     indexIn("^R$") since exactMatch() effectively encloses the regexp
       
  4052     in the start of string and end of string anchors, except that it
       
  4053     sets matchedLength() differently.
       
  4054 
       
  4055     For example, if the regular expression is \bold{blue}, then
       
  4056     exactMatch() returns true only for input \c blue. For inputs \c
       
  4057     bluebell, \c blutak and \c lightblue, exactMatch() returns false
       
  4058     and matchedLength() will return 4, 3 and 0 respectively.
       
  4059 
       
  4060     Although const, this function sets matchedLength(),
       
  4061     capturedTexts(), and pos().
       
  4062 
       
  4063     \sa indexIn(), lastIndexIn()
       
  4064 */
       
  4065 bool QRegExp::exactMatch(const QString &str) const
       
  4066 {
       
  4067     prepareEngineForMatch(priv, str);
       
  4068     priv->matchState.match(str.unicode(), str.length(), 0, priv->minimal, true, 0);
       
  4069     if (priv->matchState.captured[1] == str.length()) {
       
  4070         return true;
       
  4071     } else {
       
  4072         priv->matchState.captured[0] = 0;
       
  4073         priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen;
       
  4074         return false;
       
  4075     }
       
  4076 }
       
  4077 
       
  4078 // ### Qt 5: make non-const
       
  4079 /*!
       
  4080     Attempts to find a match in \a str from position \a offset (0 by
       
  4081     default). If \a offset is -1, the search starts at the last
       
  4082     character; if -2, at the next to last character; etc.
       
  4083 
       
  4084     Returns the position of the first match, or -1 if there was no
       
  4085     match.
       
  4086 
       
  4087     The \a caretMode parameter can be used to instruct whether \bold{^}
       
  4088     should match at index 0 or at \a offset.
       
  4089 
       
  4090     You might prefer to use QString::indexOf(), QString::contains(),
       
  4091     or even QStringList::filter(). To replace matches use
       
  4092     QString::replace().
       
  4093 
       
  4094     Example:
       
  4095     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 13
       
  4096 
       
  4097     Although const, this function sets matchedLength(),
       
  4098     capturedTexts() and pos().
       
  4099 
       
  4100     If the QRegExp is a wildcard expression (see setPatternSyntax())
       
  4101     and want to test a string against the whole wildcard expression,
       
  4102     use exactMatch() instead of this function.
       
  4103 
       
  4104     \sa lastIndexIn(), exactMatch()
       
  4105 */
       
  4106 
       
  4107 int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const
       
  4108 {
       
  4109     prepareEngineForMatch(priv, str);
       
  4110     if (offset < 0)
       
  4111         offset += str.length();
       
  4112     priv->matchState.match(str.unicode(), str.length(), offset,
       
  4113         priv->minimal, false, caretIndex(offset, caretMode));
       
  4114     return priv->matchState.captured[0];
       
  4115 }
       
  4116 
       
  4117 // ### Qt 5: make non-const
       
  4118 /*!
       
  4119     Attempts to find a match backwards in \a str from position \a
       
  4120     offset. If \a offset is -1 (the default), the search starts at the
       
  4121     last character; if -2, at the next to last character; etc.
       
  4122 
       
  4123     Returns the position of the first match, or -1 if there was no
       
  4124     match.
       
  4125 
       
  4126     The \a caretMode parameter can be used to instruct whether \bold{^}
       
  4127     should match at index 0 or at \a offset.
       
  4128 
       
  4129     Although const, this function sets matchedLength(),
       
  4130     capturedTexts() and pos().
       
  4131 
       
  4132     \warning Searching backwards is much slower than searching
       
  4133     forwards.
       
  4134 
       
  4135     \sa indexIn(), exactMatch()
       
  4136 */
       
  4137 
       
  4138 int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const
       
  4139 {
       
  4140     prepareEngineForMatch(priv, str);
       
  4141     if (offset < 0)
       
  4142         offset += str.length();
       
  4143     if (offset < 0 || offset > str.length()) {
       
  4144         memset(priv->matchState.captured, -1, priv->matchState.capturedSize*sizeof(int));
       
  4145         return -1;
       
  4146     }
       
  4147 
       
  4148     while (offset >= 0) {
       
  4149         priv->matchState.match(str.unicode(), str.length(), offset,
       
  4150             priv->minimal, true, caretIndex(offset, caretMode));
       
  4151         if (priv->matchState.captured[0] == offset)
       
  4152             return offset;
       
  4153         --offset;
       
  4154     }
       
  4155     return -1;
       
  4156 }
       
  4157 
       
  4158 /*!
       
  4159     Returns the length of the last matched string, or -1 if there was
       
  4160     no match.
       
  4161 
       
  4162     \sa exactMatch(), indexIn(), lastIndexIn()
       
  4163 */
       
  4164 int QRegExp::matchedLength() const
       
  4165 {
       
  4166     return priv->matchState.captured[1];
       
  4167 }
       
  4168 
       
  4169 #ifndef QT_NO_REGEXP_CAPTURE
       
  4170 /*!
       
  4171   Returns the number of captures contained in the regular expression.
       
  4172  */
       
  4173 int QRegExp::numCaptures() const
       
  4174 {
       
  4175     prepareEngine(priv);
       
  4176     return priv->eng->numCaptures();
       
  4177 }
       
  4178 
       
  4179 /*!
       
  4180     Returns a list of the captured text strings.
       
  4181 
       
  4182     The first string in the list is the entire matched string. Each
       
  4183     subsequent list element contains a string that matched a
       
  4184     (capturing) subexpression of the regexp.
       
  4185 
       
  4186     For example:
       
  4187     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 14
       
  4188 
       
  4189     The above example also captures elements that may be present but
       
  4190     which we have no interest in. This problem can be solved by using
       
  4191     non-capturing parentheses:
       
  4192 
       
  4193     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 15
       
  4194 
       
  4195     Note that if you want to iterate over the list, you should iterate
       
  4196     over a copy, e.g.
       
  4197     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 16
       
  4198 
       
  4199     Some regexps can match an indeterminate number of times. For
       
  4200     example if the input string is "Offsets: 12 14 99 231 7" and the
       
  4201     regexp, \c{rx}, is \bold{(\\d+)+}, we would hope to get a list of
       
  4202     all the numbers matched. However, after calling
       
  4203     \c{rx.indexIn(str)}, capturedTexts() will return the list ("12",
       
  4204     "12"), i.e. the entire match was "12" and the first subexpression
       
  4205     matched was "12". The correct approach is to use cap() in a
       
  4206     \l{QRegExp#cap_in_a_loop}{loop}.
       
  4207 
       
  4208     The order of elements in the string list is as follows. The first
       
  4209     element is the entire matching string. Each subsequent element
       
  4210     corresponds to the next capturing open left parentheses. Thus
       
  4211     capturedTexts()[1] is the text of the first capturing parentheses,
       
  4212     capturedTexts()[2] is the text of the second and so on
       
  4213     (corresponding to $1, $2, etc., in some other regexp languages).
       
  4214 
       
  4215     \sa cap(), pos()
       
  4216 */
       
  4217 QStringList QRegExp::capturedTexts() const
       
  4218 {
       
  4219     if (priv->capturedCache.isEmpty()) {
       
  4220         prepareEngine(priv);
       
  4221         const int *captured = priv->matchState.captured;
       
  4222         int n = priv->matchState.capturedSize;
       
  4223 
       
  4224         for (int i = 0; i < n; i += 2) {
       
  4225             QString m;
       
  4226             if (captured[i + 1] == 0)
       
  4227                 m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty
       
  4228             else if (captured[i] >= 0)
       
  4229                 m = priv->t.mid(captured[i], captured[i + 1]);
       
  4230             priv->capturedCache.append(m);
       
  4231         }
       
  4232         priv->t.clear();
       
  4233     }
       
  4234     return priv->capturedCache;
       
  4235 }
       
  4236 
       
  4237 /*!
       
  4238     \internal
       
  4239 */
       
  4240 QStringList QRegExp::capturedTexts()
       
  4241 {
       
  4242     return const_cast<const QRegExp *>(this)->capturedTexts();
       
  4243 }
       
  4244 
       
  4245 /*!
       
  4246     Returns the text captured by the \a nth subexpression. The entire
       
  4247     match has index 0 and the parenthesized subexpressions have
       
  4248     indexes starting from 1 (excluding non-capturing parentheses).
       
  4249 
       
  4250     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 17
       
  4251 
       
  4252     The order of elements matched by cap() is as follows. The first
       
  4253     element, cap(0), is the entire matching string. Each subsequent
       
  4254     element corresponds to the next capturing open left parentheses.
       
  4255     Thus cap(1) is the text of the first capturing parentheses, cap(2)
       
  4256     is the text of the second, and so on.
       
  4257 
       
  4258     \sa capturedTexts(), pos()
       
  4259 */
       
  4260 QString QRegExp::cap(int nth) const
       
  4261 {
       
  4262     return capturedTexts().value(nth);
       
  4263 }
       
  4264 
       
  4265 /*!
       
  4266     \internal
       
  4267 */
       
  4268 QString QRegExp::cap(int nth)
       
  4269 {
       
  4270     return const_cast<const QRegExp *>(this)->cap(nth);
       
  4271 }
       
  4272 
       
  4273 /*!
       
  4274     Returns the position of the \a nth captured text in the searched
       
  4275     string. If \a nth is 0 (the default), pos() returns the position
       
  4276     of the whole match.
       
  4277 
       
  4278     Example:
       
  4279     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 18
       
  4280 
       
  4281     For zero-length matches, pos() always returns -1. (For example, if
       
  4282     cap(4) would return an empty string, pos(4) returns -1.) This is
       
  4283     a feature of the implementation.
       
  4284 
       
  4285     \sa cap(), capturedTexts()
       
  4286 */
       
  4287 int QRegExp::pos(int nth) const
       
  4288 {
       
  4289     if (nth < 0 || nth >= priv->matchState.capturedSize / 2)
       
  4290         return -1;
       
  4291     else
       
  4292         return priv->matchState.captured[2 * nth];
       
  4293 }
       
  4294 
       
  4295 /*!
       
  4296     \internal
       
  4297 */
       
  4298 int QRegExp::pos(int nth)
       
  4299 {
       
  4300     return const_cast<const QRegExp *>(this)->pos(nth);
       
  4301 }
       
  4302 
       
  4303 /*!
       
  4304   Returns a text string that explains why a regexp pattern is
       
  4305   invalid the case being; otherwise returns "no error occurred".
       
  4306 
       
  4307   \sa isValid()
       
  4308 */
       
  4309 QString QRegExp::errorString() const
       
  4310 {
       
  4311     if (isValid()) {
       
  4312         return QString::fromLatin1(RXERR_OK);
       
  4313     } else {
       
  4314         return priv->eng->errorString();
       
  4315     }
       
  4316 }
       
  4317 
       
  4318 /*!
       
  4319     \internal
       
  4320 */
       
  4321 QString QRegExp::errorString()
       
  4322 {
       
  4323     return const_cast<const QRegExp *>(this)->errorString();
       
  4324 }
       
  4325 #endif
       
  4326 
       
  4327 /*!
       
  4328     Returns the string \a str with every regexp special character
       
  4329     escaped with a backslash. The special characters are $, (,), *, +,
       
  4330     ., ?, [, \,], ^, {, | and }.
       
  4331 
       
  4332     Example:
       
  4333 
       
  4334     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 19
       
  4335 
       
  4336     This function is useful to construct regexp patterns dynamically:
       
  4337 
       
  4338     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 20
       
  4339 
       
  4340     \sa setPatternSyntax()
       
  4341 */
       
  4342 QString QRegExp::escape(const QString &str)
       
  4343 {
       
  4344     QString quoted;
       
  4345     const int count = str.count();
       
  4346     quoted.reserve(count * 2);
       
  4347     const QLatin1Char backslash('\\');
       
  4348     for (int i = 0; i < count; i++) {
       
  4349         switch (str.at(i).toLatin1()) {
       
  4350         case '$':
       
  4351         case '(':
       
  4352         case ')':
       
  4353         case '*':
       
  4354         case '+':
       
  4355         case '.':
       
  4356         case '?':
       
  4357         case '[':
       
  4358         case '\\':
       
  4359         case ']':
       
  4360         case '^':
       
  4361         case '{':
       
  4362         case '|':
       
  4363         case '}':
       
  4364             quoted.append(backslash);
       
  4365         }
       
  4366         quoted.append(str.at(i));
       
  4367     }
       
  4368     return quoted;
       
  4369 }
       
  4370 
       
  4371 /*!
       
  4372     \fn bool QRegExp::caseSensitive() const
       
  4373 
       
  4374     Use \l caseSensitivity() instead.
       
  4375 */
       
  4376 
       
  4377 /*!
       
  4378     \fn void QRegExp::setCaseSensitive(bool sensitive)
       
  4379 
       
  4380     Use \l setCaseSensitivity() instead.
       
  4381 */
       
  4382 
       
  4383 /*!
       
  4384     \fn bool QRegExp::wildcard() const
       
  4385 
       
  4386     Use \l patternSyntax() instead.
       
  4387 
       
  4388     \oldcode
       
  4389         bool wc = rx.wildcard();
       
  4390     \newcode
       
  4391         bool wc = (rx.patternSyntax() == QRegExp::Wildcard);
       
  4392     \endcode
       
  4393 */
       
  4394 
       
  4395 /*!
       
  4396     \fn void QRegExp::setWildcard(bool wildcard)
       
  4397 
       
  4398     Use \l setPatternSyntax() instead.
       
  4399 
       
  4400     \oldcode
       
  4401         rx.setWildcard(wc);
       
  4402     \newcode
       
  4403         rx.setPatternSyntax(wc ? QRegExp::Wildcard : QRegExp::RegExp);
       
  4404     \endcode
       
  4405 */
       
  4406 
       
  4407 /*!
       
  4408     \fn bool QRegExp::minimal() const
       
  4409 
       
  4410     Use \l isMinimal() instead.
       
  4411 */
       
  4412 
       
  4413 /*!
       
  4414     \fn int QRegExp::search(const QString &str, int from = 0,
       
  4415                             CaretMode caretMode = CaretAtZero) const
       
  4416 
       
  4417     Use \l indexIn() instead.
       
  4418 */
       
  4419 
       
  4420 /*!
       
  4421     \fn int QRegExp::searchRev(const QString &str, int from = -1, \
       
  4422                                CaretMode caretMode = CaretAtZero) const
       
  4423 
       
  4424     Use \l lastIndexIn() instead.
       
  4425 */
       
  4426 
       
  4427 /*!
       
  4428     \fn QRegExp::QRegExp(const QString &pattern, bool cs, bool wildcard = false)
       
  4429 
       
  4430     Use another constructor instead.
       
  4431 
       
  4432     \oldcode
       
  4433         QRegExp rx("*.txt", false, true);
       
  4434     \newcode
       
  4435         QRegExp rx("*.txt", Qt::CaseInsensitive, QRegExp::Wildcard);
       
  4436     \endcode
       
  4437 */
       
  4438 
       
  4439 #ifndef QT_NO_DATASTREAM
       
  4440 /*!
       
  4441     \relates QRegExp
       
  4442 
       
  4443     Writes the regular expression \a regExp to stream \a out.
       
  4444 
       
  4445     \sa {Format of the QDataStream Operators}
       
  4446 */
       
  4447 QDataStream &operator<<(QDataStream &out, const QRegExp &regExp)
       
  4448 {
       
  4449     return out << regExp.pattern() << (quint8)regExp.caseSensitivity()
       
  4450                << (quint8)regExp.patternSyntax()
       
  4451                << (quint8)!!regExp.isMinimal();
       
  4452 }
       
  4453 
       
  4454 /*!
       
  4455     \relates QRegExp
       
  4456 
       
  4457     Reads a regular expression from stream \a in into \a regExp.
       
  4458 
       
  4459     \sa {Format of the QDataStream Operators}
       
  4460 */
       
  4461 QDataStream &operator>>(QDataStream &in, QRegExp &regExp)
       
  4462 {
       
  4463     QString pattern;
       
  4464     quint8 cs;
       
  4465     quint8 patternSyntax;
       
  4466     quint8 isMinimal;
       
  4467 
       
  4468     in >> pattern >> cs >> patternSyntax >> isMinimal;
       
  4469 
       
  4470     QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),
       
  4471                       QRegExp::PatternSyntax(patternSyntax));
       
  4472 
       
  4473     newRegExp.setMinimal(isMinimal);
       
  4474     regExp = newRegExp;
       
  4475     return in;
       
  4476 }
       
  4477 #endif // QT_NO_DATASTREAM
       
  4478 
       
  4479 QT_END_NAMESPACE