Orb/Doxygen/qtools/qregexp.cpp
changeset 0 42188c7ea2d9
equal deleted inserted replaced
-1:000000000000 0:42188c7ea2d9
       
     1 /****************************************************************************
       
     2 ** 
       
     3 **
       
     4 ** Implementation of QRegExp class
       
     5 **
       
     6 ** Created : 950126
       
     7 **
       
     8 ** Copyright (C) 1992-2000 Trolltech AS.  All rights reserved.
       
     9 **
       
    10 ** This file is part of the tools module of the Qt GUI Toolkit.
       
    11 **
       
    12 ** This file may be distributed under the terms of the Q Public License
       
    13 ** as defined by Trolltech AS of Norway and appearing in the file
       
    14 ** LICENSE.QPL included in the packaging of this file.
       
    15 **
       
    16 ** This file may be distributed and/or modified under the terms of the
       
    17 ** GNU General Public License version 2 as published by the Free Software
       
    18 ** Foundation and appearing in the file LICENSE.GPL included in the
       
    19 ** packaging of this file.
       
    20 **
       
    21 ** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition
       
    22 ** licenses may use this file in accordance with the Qt Commercial License
       
    23 ** Agreement provided with the Software.
       
    24 **
       
    25 ** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
       
    26 ** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
       
    27 **
       
    28 ** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for
       
    29 **   information about Qt Commercial License Agreements.
       
    30 ** See http://www.trolltech.com/qpl/ for QPL licensing information.
       
    31 ** See http://www.trolltech.com/gpl/ for GPL licensing information.
       
    32 **
       
    33 ** Contact info@trolltech.com if any conditions of this licensing are
       
    34 ** not clear to you.
       
    35 **
       
    36 **********************************************************************/
       
    37 
       
    38 #include "qregexp.h"
       
    39 #include <ctype.h>
       
    40 #include <stdlib.h>
       
    41 
       
    42 // NOT REVISED
       
    43 /*!
       
    44   \class QRegExp qregexp.h
       
    45   \ingroup tools
       
    46   \ingroup misc
       
    47   \brief The QRegExp class provides pattern matching using regular
       
    48   expressions or wildcards.
       
    49 
       
    50   QRegExp knows these regexp primitives:
       
    51   <ul plain>
       
    52   <li><dfn>c</dfn> matches the character 'c'
       
    53   <li><dfn>.</dfn> matches any character
       
    54   <li><dfn>^</dfn> matches start of input
       
    55   <li><dfn>$</dfn>  matches end of input
       
    56   <li><dfn>[]</dfn> matches a defined set of characters - see below.
       
    57   <li><dfn>a*</dfn> matches a sequence of zero or more a's
       
    58   <li><dfn>a+</dfn> matches a sequence of one or more a's
       
    59   <li><dfn>a?</dfn> matches an optional a
       
    60   <li><dfn>\c</dfn> escape code for matching special characters such
       
    61   as \, [, *, +, . etc.
       
    62   <li><dfn>\t</dfn> matches the TAB character (9)
       
    63   <li><dfn>\n</dfn> matches newline (10)
       
    64   <li><dfn>\r</dfn> matches return (13)
       
    65   <li><dfn>\s</dfn> matches a white space (defined as any character
       
    66   for which QChar::isSpace() returns TRUE. This includes at least
       
    67   ASCII characters 9 (TAB), 10 (LF), 11 (VT), 12(FF), 13 (CR) and 32
       
    68   (Space)).
       
    69   <li><dfn>\d</dfn> matches a digit (defined as any character for
       
    70   which QChar::isDigit() returns TRUE. This includes at least ASCII
       
    71   characters '0'-'9').
       
    72   <li><dfn>\x1f6b</dfn> matches the character with unicode point U1f6b
       
    73   (hexadecimal 1f6b). \x0012 will match the ASCII/Latin1 character
       
    74   0x12 (18 decimal, 12 hexadecimal).
       
    75   <li><dfn>\022</dfn> matches the ASCII/Latin1 character 022 (18
       
    76   decimal, 22 octal).
       
    77   </ul>
       
    78 
       
    79   In wildcard mode, it only knows four primitives:
       
    80   <ul plain>
       
    81   <li><dfn>c</dfn> matches the character 'c'
       
    82   <li><dfn>?</dfn> matches any character
       
    83   <li><dfn>*</dfn> matches any sequence of characters
       
    84   <li><dfn>[]</dfn> matches a defined set of characters - see below.
       
    85   </ul>
       
    86 
       
    87   QRegExp supports Unicode both in the pattern strings and in the
       
    88   strings to be matched.
       
    89 
       
    90   When writing regular expressions in C++ code, remember that C++
       
    91   processes \ characters.  So in order to match e.g. a "." character,
       
    92   you must write "\\." in C++ source, not "\.".
       
    93 
       
    94   A character set matches a defined set of characters. For example,
       
    95   [BSD] matches any of 'B', 'D' and 'S'. Within a character set, the
       
    96   special characters '.', '*', '?', '^', '$', '+' and '[' lose their
       
    97   special meanings. The following special characters apply:
       
    98   <ul plain>
       
    99   <li><dfn>^</dfn> When placed first in the list, changes the
       
   100   character set to match any character \e not in the list. To include
       
   101   the character '^' itself in the set, escape it or place it anywhere
       
   102   but first.
       
   103   <li><dfn>-</dfn> Defines a range of characters. To include the
       
   104   character '-' itself in the set, escape it or place it last.
       
   105   <li><dfn>]</dfn> Ends the character set definition. To include the
       
   106   character ']' itself in the set, escape it or place it first (but
       
   107   after the negation operator '^', if present)
       
   108   </ul>
       
   109   Thus, [a-zA-Z0-9.] matches upper and lower case ASCII letters,
       
   110   digits and dot; and [^\s] matches everything except white space.
       
   111 
       
   112   \bug Case insensitive matching is not supported for non-ASCII/Latin1
       
   113   (non-8bit) characters. Any character with a non-zero QChar.row() is
       
   114   matched case sensitively even if the QRegExp is in case insensitive
       
   115   mode.
       
   116 
       
   117   \note In Qt 3.0, the language of regular expressions will contain
       
   118   five more special characters, namely '(', ')', '{', '|' and '}'. To
       
   119   ease porting, it's a good idea to escape these characters with a
       
   120   backslash in all the regular expressions you'll write from now on.
       
   121 */
       
   122 
       
   123 
       
   124 //
       
   125 // The regexp pattern is internally represented as an array of uints,
       
   126 // each element containing an 16-bit character or a 32-bit code
       
   127 // (listed below).  User-defined character classes (e.g. [a-zA-Z])
       
   128 // are encoded as this:
       
   129 // uint no:	1		2		3		...
       
   130 // value:	CCL | n		from | to	from | to
       
   131 //
       
   132 // where n is the (16-bit) number of following range definitions and
       
   133 // from and to define the ranges inclusive. from <= to is always true,
       
   134 // otherwise it is a built-in charclass (Pxx, eg \s - PWS). Single
       
   135 // characters in the class are coded as from==to.  Negated classes
       
   136 // (e.g. [^a-z]) use CCN instead of CCL.
       
   137 
       
   138 const uint END	= 0x00000000;
       
   139 const uint PWS	= 0x10010000;		// predef charclass: whitespace (\s)
       
   140 const uint PDG	= 0x10020000;		// predef charclass: digit (\d)
       
   141 const uint CCL	= 0x20010000;		// character class	[]
       
   142 const uint CCN	= 0x20020000;		// neg character class	[^]
       
   143 const uint CHR	= 0x40000000;		// character
       
   144 const uint BOL	= 0x80010000;		// beginning of line	^
       
   145 const uint EOL	= 0x80020000;		// end of line		$
       
   146 const uint BOW	= 0x80030000;		// beginning of word	\<
       
   147 const uint EOW	= 0x80040000;		// end of word		\>
       
   148 const uint ANY	= 0x80050000;		// any character	.
       
   149 const uint CLO	= 0x80070000;		// Kleene closure	*
       
   150 const uint OPT	= 0x80080000;		// Optional closure	?
       
   151 
       
   152 const uint MCC  = 0x20000000;		// character class bitmask
       
   153 const uint MCD  = 0xffff0000;		// code mask
       
   154 const uint MVL  = 0x0000ffff;		// value mask
       
   155 
       
   156 //
       
   157 // QRegExp::error codes (internal)
       
   158 //
       
   159 
       
   160 const int PatOk		= 0;			// pattern ok
       
   161 const int PatNull	= 1;			// no pattern defined
       
   162 const int PatSyntax	= 2;			// pattern syntax error
       
   163 const int PatOverflow	= 4;			// pattern too long
       
   164 
       
   165 
       
   166 /*****************************************************************************
       
   167   QRegExp member functions
       
   168  *****************************************************************************/
       
   169 
       
   170 /*!
       
   171   Constructs an empty regular expression.
       
   172 */
       
   173 
       
   174 QRegExp::QRegExp()
       
   175 {
       
   176     rxdata = 0;
       
   177     cs = TRUE;
       
   178     wc = FALSE;
       
   179     error = PatOk;
       
   180 }
       
   181 
       
   182 /*!
       
   183   Constructs a regular expression.
       
   184 
       
   185   \arg \e pattern is the regular expression pattern string.
       
   186   \arg \e caseSensitive specifies whether or not to use case sensitive
       
   187   matching.
       
   188   \arg \e wildcard specifies whether the pattern string should be used for
       
   189   wildcard matching (also called globbing expression), normally used for
       
   190   matching file names.
       
   191 
       
   192   \sa setWildcard()
       
   193 */
       
   194 
       
   195 QRegExp::QRegExp( const QString &pattern, bool caseSensitive, bool wildcard )
       
   196 {
       
   197     rxstring = pattern;
       
   198     rxdata = 0;
       
   199     cs = caseSensitive;
       
   200     wc = wildcard;
       
   201     compile();
       
   202 }
       
   203 
       
   204 /*!
       
   205   Constructs a regular expression which is a copy of \e r.
       
   206   \sa operator=(const QRegExp&)
       
   207 */
       
   208 
       
   209 QRegExp::QRegExp( const QRegExp &r )
       
   210 {
       
   211     rxstring = r.pattern();
       
   212     rxdata = 0;
       
   213     cs = r.caseSensitive();
       
   214     wc = r.wildcard();
       
   215     compile();
       
   216 }
       
   217 
       
   218 /*!
       
   219   Destructs the regular expression and cleans up its internal data.
       
   220 */
       
   221 
       
   222 QRegExp::~QRegExp()
       
   223 {
       
   224     if ( rxdata )                      // Avoid purify complaints
       
   225 	delete [] rxdata;
       
   226 }
       
   227 
       
   228 /*!
       
   229   Copies the regexp \e r and returns a reference to this regexp.
       
   230   The case sensitivity and wildcard options are copied, as well.
       
   231 */
       
   232 
       
   233 QRegExp &QRegExp::operator=( const QRegExp &r )
       
   234 {
       
   235     rxstring = r.rxstring;
       
   236     cs = r.cs;
       
   237     wc = r.wc;
       
   238     compile();
       
   239     return *this;
       
   240 }
       
   241 
       
   242 /*!
       
   243   \obsolete
       
   244   Consider using setPattern() instead of this method.
       
   245 
       
   246   Sets the pattern string to \e pattern and returns a reference to this regexp.
       
   247   The case sensitivity or wildcard options do not change.
       
   248 */
       
   249 
       
   250 QRegExp &QRegExp::operator=( const QString &pattern )
       
   251 {
       
   252     rxstring = pattern;
       
   253     compile();
       
   254     return *this;
       
   255 }
       
   256 
       
   257 
       
   258 /*!
       
   259   Returns TRUE if this regexp is equal to \e r.
       
   260 
       
   261   Two regexp objects are equal if they have equal pattern strings,
       
   262   case sensitivity options and wildcard options.
       
   263 */
       
   264 
       
   265 bool QRegExp::operator==( const QRegExp &r ) const
       
   266 {
       
   267     return rxstring == r.rxstring && cs == r.cs && wc == r.wc;
       
   268 }
       
   269 
       
   270 /*!
       
   271   \fn bool QRegExp::operator!=( const QRegExp &r ) const
       
   272 
       
   273   Returns TRUE if this regexp is \e not equal to \e r.
       
   274 
       
   275   \sa operator==()
       
   276 */
       
   277 
       
   278 /*!
       
   279   \fn bool QRegExp::isEmpty() const
       
   280   Returns TRUE if the regexp is empty.
       
   281 */
       
   282 
       
   283 /*!
       
   284   \fn bool QRegExp::isValid() const
       
   285   Returns TRUE if the regexp is valid, or FALSE if it is invalid.
       
   286 
       
   287   The pattern "[a-z" is an example of an invalid pattern, since it lacks a
       
   288   closing bracket.
       
   289 */
       
   290 
       
   291 
       
   292 /*!
       
   293   \fn bool QRegExp::wildcard() const
       
   294   Returns TRUE if wildcard mode is on, otherwise FALSE. \sa setWildcard().
       
   295 */
       
   296 
       
   297 /*!
       
   298   Sets the wildcard option for the regular expression.	The default
       
   299   is FALSE.
       
   300 
       
   301   Setting \e wildcard to TRUE makes it convenient to match filenames
       
   302   instead of plain text.
       
   303 
       
   304   For example, "qr*.cpp" matches the string "qregexp.cpp" in wildcard mode,
       
   305   but not "qicpp" (which would be matched in normal mode).
       
   306 
       
   307   \sa wildcard()
       
   308 */
       
   309 
       
   310 void QRegExp::setWildcard( bool wildcard )
       
   311 {
       
   312     if ( wildcard != wc ) {
       
   313 	wc = wildcard;
       
   314 	compile();
       
   315     }
       
   316 }
       
   317 
       
   318 /*!
       
   319   \fn bool QRegExp::caseSensitive() const
       
   320 
       
   321   Returns TRUE if case sensitivity is enabled, otherwise FALSE.	 The
       
   322   default is TRUE.
       
   323 
       
   324   \sa setCaseSensitive()
       
   325 */
       
   326 
       
   327 /*!
       
   328   Enables or disables case sensitive matching.
       
   329 
       
   330   In case sensitive mode, "a.e" matches "axe" but not "Axe".
       
   331 
       
   332   See also: caseSensitive()
       
   333 */
       
   334 
       
   335 void QRegExp::setCaseSensitive( bool enable )
       
   336 {
       
   337     if ( cs != enable ) {
       
   338 	cs = enable;
       
   339 	compile();
       
   340     }
       
   341 }
       
   342 
       
   343 
       
   344 /*!
       
   345   \fn QString QRegExp::pattern() const
       
   346   Returns the pattern string of the regexp.
       
   347 */
       
   348 
       
   349 
       
   350 /*!
       
   351   \fn void QRegExp::setPattern(const QString & pattern)
       
   352   Sets the pattern string to \a pattern.
       
   353   The case sensitivity or wildcard options do not change.
       
   354 */
       
   355 
       
   356 static inline bool iswordchar( int x )
       
   357 {
       
   358     return isalnum(x) || x == '_';	//# Only 8-bit support
       
   359 }
       
   360 
       
   361 
       
   362 /*!
       
   363   \internal
       
   364   Match character class
       
   365 */
       
   366 
       
   367 static bool matchcharclass( uint *rxd, QChar c )
       
   368 {
       
   369     uint *d = rxd;
       
   370     uint clcode = *d & MCD;
       
   371     bool neg = clcode == CCN;
       
   372     if ( clcode != CCL && clcode != CCN)
       
   373 	qWarning("QRegExp: Internal error, please report to qt-bugs@trolltech.com");
       
   374     uint numFields = *d & MVL;
       
   375     uint cval = (((uint)(c.row())) << 8) | ((uint)c.cell());
       
   376     bool found = FALSE;
       
   377     for ( int i = 0; i < (int)numFields; i++ ) {
       
   378 	d++;
       
   379 	if ( *d == PWS && c.isSpace() ) {
       
   380 	    found = TRUE;
       
   381 	    break;
       
   382 	}
       
   383 	if ( *d == PDG && c.isDigit() ) {
       
   384 	    found = TRUE;
       
   385 	    break;
       
   386 	}
       
   387 	else {
       
   388 	    uint from = ( *d & MCD ) >> 16;
       
   389 	    uint to = *d & MVL;
       
   390 	    if ( (cval >= from) && (cval <= to) ) {
       
   391 		found = TRUE;
       
   392 		break;
       
   393 	    }
       
   394 	}
       
   395     }
       
   396     return neg ? !found : found;
       
   397 }
       
   398 
       
   399 
       
   400 
       
   401 /*
       
   402   Internal: Recursively match string.
       
   403 */
       
   404 
       
   405 static int matchstring( uint *rxd, const QChar *str, uint strlength,
       
   406 			const QChar *bol, bool cs )
       
   407 {
       
   408     const QChar *p = str;
       
   409     const QChar *start = p;
       
   410     uint pl = strlength;
       
   411     uint *d = rxd;
       
   412 
       
   413     //### in all cases here: handle pl == 0! (don't read past strlen)
       
   414     while ( *d ) {
       
   415 	if ( *d & CHR ) {			// match char
       
   416 	    if ( !pl )
       
   417 		return -1;
       
   418 	    QChar c( *d );
       
   419 	    if ( !cs && !c.row() ) {		// case insensitive, #Only 8bit
       
   420 		if ( p->row() || tolower(p->cell()) != c.cell() )
       
   421 		    return -1;
       
   422 		p++;
       
   423 		pl--;
       
   424 	    } else {				// case insensitive
       
   425 		if ( *p != c )
       
   426 		    return -1;
       
   427 		p++;
       
   428 		pl--;
       
   429 	    }
       
   430 	    d++;
       
   431 	}
       
   432 	else if ( *d & MCC ) {			// match char class
       
   433 	    if ( !pl )
       
   434 		return -1;
       
   435 	    if ( !matchcharclass( d, *p ) )
       
   436 		return -1;
       
   437 	    p++;
       
   438 	    pl--;
       
   439 	    d += (*d & MVL) + 1;
       
   440 	}
       
   441 	else switch ( *d++ ) {
       
   442 	    case PWS:				// match whitespace
       
   443 		if ( !pl || !p->isSpace() )
       
   444 		    return -1;
       
   445 		p++;
       
   446 		pl--;
       
   447 		break;
       
   448 	    case PDG:				// match digits
       
   449 		if ( !pl || !p->isDigit() )
       
   450 		    return -1;
       
   451 		p++;
       
   452 		pl--;
       
   453 		break;
       
   454 	    case ANY:				// match anything
       
   455 		if ( !pl )
       
   456 		    return -1;
       
   457 		p++;
       
   458 		pl--;
       
   459 		break;
       
   460 	    case BOL:				// match beginning of line
       
   461 		if ( p != bol )
       
   462 		    return -1;
       
   463 		break;
       
   464 	    case EOL:				// match end of line
       
   465 		if ( pl )
       
   466 		    return -1;
       
   467 		break;
       
   468 	    case BOW:				// match beginning of word
       
   469 		if ( !iswordchar(*p) || (p > bol && iswordchar(*(p-1)) ) )
       
   470 		    return -1;
       
   471 		break;
       
   472 	    case EOW:				// match end of word
       
   473 		if ( iswordchar(*p) || p == bol || !iswordchar(*(p-1)) )
       
   474 		    return -1;
       
   475 		break;
       
   476 	    case CLO:				// Kleene closure
       
   477 		{
       
   478 		const QChar *first_p = p;
       
   479 		if ( *d & CHR ) {		// match char
       
   480 		    QChar c( *d );
       
   481 		    if ( !cs && !c.row() ) {	// case insensitive, #only 8bit
       
   482 			while ( pl && !p->row() && tolower(p->cell())==c.cell() ) {
       
   483 			    p++;
       
   484 			    pl--;
       
   485 			}
       
   486 		    }
       
   487 		    else {			// case sensitive
       
   488 			while ( pl && *p == c ) {
       
   489 			    p++;
       
   490 			    pl--;
       
   491 			}
       
   492 		    }
       
   493 		    d++;
       
   494 		}
       
   495 		else if ( *d & MCC ) {			// match char class
       
   496 		    while( pl && matchcharclass( d, *p ) ) {
       
   497 			p++;
       
   498 			pl--;
       
   499 		    }
       
   500 		    d += (*d & MVL) + 1;
       
   501 		}
       
   502 		else if ( *d == PWS ) {
       
   503 		    while ( pl && p->isSpace() ) {
       
   504 			p++;
       
   505 			pl--;
       
   506 		    }
       
   507 		    d++;
       
   508 		}
       
   509 		else if ( *d == PDG ) {
       
   510 		    while ( pl && p->isDigit() ) {
       
   511 			p++;
       
   512 			pl--;
       
   513 		    }
       
   514 		    d++;
       
   515 		}
       
   516 		else if ( *d == ANY ) {
       
   517 		    p += pl;
       
   518 		    pl = 0;
       
   519 		    d++;
       
   520 		}
       
   521 		else {
       
   522 		    return -1;			// error
       
   523 		}
       
   524 		d++;				// skip CLO's END
       
   525 		while ( p >= first_p ) {	// go backwards
       
   526 		    int end = matchstring( d, p, pl, bol, cs );
       
   527 		    if ( end >= 0 )
       
   528 			return ( p - start ) + end;
       
   529 		    if ( !p )
       
   530 			return -1;
       
   531 		    --p;
       
   532 		    ++pl;
       
   533 		}
       
   534 		}
       
   535 		return -1;
       
   536 	    case OPT:				// optional closure
       
   537 		{
       
   538 		const QChar *first_p = p;
       
   539 		if ( *d & CHR ) {		// match char
       
   540 		    QChar c( *d );
       
   541 		    if ( !cs && !c.row() ) {	// case insensitive, #only 8bit
       
   542 			if ( pl && !p->row() && tolower(p->cell()) == c.cell() ) {
       
   543 			    p++;
       
   544 			    pl--;
       
   545 			}
       
   546 		    }
       
   547 		    else {			// case sensitive
       
   548 			if ( pl && *p == c ) {
       
   549 			    p++;
       
   550 			    pl--;
       
   551 			}
       
   552 		    }
       
   553 		    d++;
       
   554 		}
       
   555 		else if ( *d & MCC ) {			// match char class
       
   556 		    if ( pl && matchcharclass( d, *p ) ) {
       
   557 			p++;
       
   558 			pl--;
       
   559 		    }
       
   560 		    d += (*d & MVL) + 1;
       
   561 		}
       
   562 		else if ( *d == PWS ) {
       
   563 		    if ( pl && p->isSpace() ) {
       
   564 			p++;
       
   565 			pl--;
       
   566 		    }
       
   567 		    d++;
       
   568 		}
       
   569 		else if ( *d == PDG ) {
       
   570 		    if ( pl && p->isDigit() ) {
       
   571 			p++;
       
   572 			pl--;
       
   573 		    }
       
   574 		    d++;
       
   575 		}
       
   576 		else if ( *d == ANY ) {
       
   577 		    if ( pl ) {
       
   578 			p++;
       
   579 			pl--;
       
   580 		    }
       
   581 		    d++;
       
   582 		}
       
   583 		else {
       
   584 		    return -1;			// error
       
   585 		}
       
   586 		d++;				// skip OPT's END
       
   587 		while ( p >= first_p ) {	// go backwards
       
   588 		    int end = matchstring( d, p, pl, bol, cs );
       
   589 		    if ( end >= 0 )
       
   590 			return ( p - start ) + end;
       
   591 		    if ( !p )
       
   592 			return -1;
       
   593 		    --p;
       
   594 		    ++pl;
       
   595 		}
       
   596 		}
       
   597 		return -1;
       
   598 
       
   599 	    default:				// error
       
   600 		return -1;
       
   601 	}
       
   602     }
       
   603     return p - start;
       
   604 }
       
   605 
       
   606 
       
   607 /*!
       
   608   \internal
       
   609   Recursively match string.
       
   610 */
       
   611 
       
   612 // This is obsolete now, but since it is protected (not private), it
       
   613 // is still implemented on the off-chance that somebody has made a
       
   614 // class derived from QRegExp and calls this directly.
       
   615 // Qt 3.0: Remove this?
       
   616 
       
   617 
       
   618 const QChar *QRegExp::matchstr( uint *rxd, const QChar *str, uint strlength,
       
   619 				const QChar *bol ) const
       
   620 {
       
   621     int len = matchstring( rxd, str, strlength, bol, cs );
       
   622     if ( len < 0 )
       
   623 	return 0;
       
   624     return str + len;
       
   625 }
       
   626 
       
   627 /*!
       
   628   Attempts to match in \e str, starting from position \e index.
       
   629   Returns the position of the match, or -1 if there was no match.
       
   630 
       
   631   If \e len is not a null pointer, the length of the match is stored in
       
   632   \e *len.
       
   633 
       
   634   If \e indexIsStart is TRUE (the default), the position \e index in
       
   635   the string will match the start-of-input primitive (^) in the
       
   636   regexp, if present. Otherwise, position 0 in \e str will match.
       
   637 
       
   638   Example:
       
   639   \code
       
   640     QRegExp r("[0-9]*\\.[0-9]+");		// matches floating point
       
   641     int len;
       
   642     r.match("pi = 3.1416", 0, &len);		// returns 5, len == 6
       
   643   \endcode
       
   644 
       
   645   \note In Qt 3.0, this function will be replaced by find().
       
   646 */
       
   647 
       
   648 int QRegExp::match( const QString &str, int index, int *len,
       
   649 		    bool indexIsStart ) const
       
   650 {
       
   651     if ( !isValid() || isEmpty() )
       
   652 	return -1;
       
   653     if ( str.length() < (uint)index )
       
   654 	return -1;
       
   655     const QChar *start = str.unicode();
       
   656     const QChar *p = start + index;
       
   657     uint pl = str.length() - index;
       
   658     uint *d  = rxdata;
       
   659     int ep = -1;
       
   660 
       
   661     if ( *d == BOL ) {				// match from beginning of line
       
   662 	ep = matchstring( d, p, pl, indexIsStart ? p : start, cs );
       
   663     } else {
       
   664 	if ( *d & CHR ) {
       
   665 	    QChar c( *d );
       
   666 	    if ( !cs && !c.row() ) {		// case sensitive, # only 8bit
       
   667 		while ( pl && ( p->row() || tolower(p->cell()) != c.cell() ) ) {
       
   668 		    p++;
       
   669 		    pl--;
       
   670 		}
       
   671 	    } else {				// case insensitive
       
   672 		while ( pl && *p != c ) {
       
   673 		    p++;
       
   674 		    pl--;
       
   675 		}
       
   676 	    }
       
   677 	}
       
   678 	while( 1 ) {				// regular match
       
   679 	    ep = matchstring( d, p, pl, indexIsStart ? start+index : start, cs );
       
   680 	    if ( ep >= 0 )
       
   681 		break;
       
   682 	    if ( !pl )
       
   683 		break;
       
   684 	    p++;
       
   685 	    pl--;
       
   686 	}
       
   687     }
       
   688     if ( len )
       
   689 	*len = ep >= 0 ? ep : 0;      // No match -> 0, for historical reasons
       
   690     return ep >= 0 ? (int)(p - start) : -1;		// return index;
       
   691 }
       
   692 
       
   693 /*! \fn int QRegExp::find( const QString& str, int index )
       
   694 
       
   695   Attempts to match in \e str, starting from position \e index.
       
   696   Returns the position of the match, or -1 if there was no match.
       
   697 
       
   698   \sa match()
       
   699 */
       
   700 
       
   701 //
       
   702 // Translate wildcard pattern to standard regexp pattern.
       
   703 // Ex:	 *.cpp	==> ^.*\.cpp$
       
   704 //
       
   705 
       
   706 static QString wc2rx( const QString &pattern )
       
   707 {
       
   708     int patlen = (int)pattern.length();
       
   709     QString wcpattern = QString::fromLatin1("^");
       
   710 
       
   711     QChar c;
       
   712     for( int i = 0; i < patlen; i++ ) {
       
   713 	c = pattern[i];
       
   714 	switch ( (char)c ) {
       
   715 	case '*':				// '*' ==> '.*'
       
   716 	    wcpattern += '.';
       
   717 	    break;
       
   718 	case '?':				// '?' ==> '.'
       
   719 	    c = '.';
       
   720 	    break;
       
   721 	case '.':				// quote special regexp chars
       
   722 	case '+':
       
   723 	case '\\':
       
   724 	case '$':
       
   725 	case '^':
       
   726 	    wcpattern += '\\';
       
   727 	    break;
       
   728 	case '[':
       
   729 	    if ( (char)pattern[i+1] == '^' ) { // don't quote '^' after '['
       
   730 		wcpattern += '[';
       
   731 		c = pattern[i+1];
       
   732 		i++;
       
   733 	    }
       
   734 	    break;
       
   735 	}
       
   736 	wcpattern += c;
       
   737 
       
   738     }
       
   739     wcpattern += '$';
       
   740     return wcpattern;				// return new regexp pattern
       
   741 }
       
   742 
       
   743 
       
   744 //
       
   745 // Internal: Get char value and increment pointer.
       
   746 //
       
   747 
       
   748 static uint char_val( const QChar **str, uint *strlength )   // get char value
       
   749 {
       
   750     const QChar *p = *str;
       
   751     uint pl = *strlength;
       
   752     uint len = 1;
       
   753     uint v = 0;
       
   754     if ( (char)*p == '\\' ) {			// escaped code
       
   755 	p++;
       
   756 	pl--;
       
   757 	if ( !pl ) {				// it is just a '\'
       
   758 	    (*str)++;
       
   759 	    (*strlength)--;
       
   760 	    return '\\';
       
   761 	}
       
   762 	len++;					// length at least 2
       
   763 	int i;
       
   764 	char c;
       
   765 	char ch = tolower((char)*p);
       
   766 	switch ( ch ) {
       
   767 	    case 'b':  v = '\b';  break;	// bell
       
   768 	    case 'f':  v = '\f';  break;	// form feed
       
   769 	    case 'n':  v = '\n';  break;	// newline
       
   770 	    case 'r':  v = '\r';  break;	// return
       
   771 	    case 't':  v = '\t';  break;	// tab
       
   772 	    case 's':  v = PWS; break;		// whitespace charclass
       
   773 	    case 'd':  v = PDG; break;		// digit charclass
       
   774 	    case '<':  v = BOW; break;		// word beginning matcher
       
   775 	    case '>':  v = EOW; break;		// word ending matcher
       
   776 
       
   777 	    case 'x': {				// hex code
       
   778 		p++;
       
   779 		pl--;
       
   780 		for ( i = 0; (i < 4) && pl; i++ ) {	//up to 4 hex digits
       
   781 		    c = tolower((char)*p);
       
   782 		    bool a = ( c >= 'a' && c <= 'f' );
       
   783 		    if ( (c >= '0' && c <= '9') || a ) {
       
   784 			v <<= 4;
       
   785 			v += a ? 10 + c - 'a' : c - '0';
       
   786 			len++;
       
   787 		    }
       
   788 		    else {
       
   789 			break;
       
   790 		    }
       
   791 		    p++;
       
   792 		    pl--;
       
   793 		}
       
   794 	    }
       
   795 	    break;
       
   796 
       
   797 	    default: {
       
   798 		if ( ch >= '0' && ch <= '7' ) {	//octal code
       
   799 		    len--;
       
   800 		    for ( i = 0; (i < 3) && pl; i++ ) {	// up to 3 oct digits
       
   801 			c = (char)*p;
       
   802 			if ( c >= '0' && c <= '7' ) {
       
   803 			    v <<= 3;
       
   804 			    v += c - '0';
       
   805 			    len++;
       
   806 			}
       
   807 			else {
       
   808 			    break;
       
   809 			}
       
   810 			p++;
       
   811 			pl--;
       
   812 		    }
       
   813 		}
       
   814 		else {				// not an octal number
       
   815 		    v = (((uint)(p->row())) << 8) | ((uint)p->cell());
       
   816 		}
       
   817 	    }
       
   818 	}
       
   819     } else {
       
   820 	v = (((uint)(p->row())) << 8) | ((uint)p->cell());
       
   821     }
       
   822     *str += len;
       
   823     *strlength -= len;
       
   824     return v;
       
   825 }
       
   826 
       
   827 
       
   828 #if defined(DEBUG)
       
   829 static uint *dump( uint *p )
       
   830 {
       
   831     while ( *p != END ) {
       
   832 	if ( *p & CHR ) {
       
   833 	    QChar uc = (QChar)*p;
       
   834 	    char c = (char)uc;
       
   835 	    uint u = (((uint)(uc.row())) << 8) | ((uint)uc.cell());
       
   836 	    qDebug( "\tCHR\tU%04x (%c)", u, (c ? c : ' '));
       
   837 	    p++;
       
   838 	}
       
   839 	else if ( *p & MCC ) {
       
   840 	    uint clcode = *p & MCD;
       
   841 	    uint numFields = *p & MVL;
       
   842 	    if ( clcode == CCL )
       
   843 		qDebug( "\tCCL\t%i", numFields );
       
   844 	    else if ( clcode == CCN )
       
   845 		qDebug( "\tCCN\t%i", numFields );
       
   846 	    else
       
   847 		qDebug("coding error!");
       
   848 	    for ( int i = 0; i < (int)numFields; i++ ) {
       
   849 		p++;
       
   850 		if ( *p == PWS )
       
   851 		    qDebug( "\t\tPWS" );
       
   852 		else if ( *p == PDG )
       
   853 		    qDebug( "\t\tPDG" );
       
   854 		else {
       
   855 		    uint from = ( *p & MCD ) >> 16;
       
   856 		    uint to = *p & MVL;
       
   857 		    char fc = (char)QChar(from);
       
   858 		    char tc = (char)QChar(to);
       
   859 		    qDebug( "\t\tU%04x (%c) - U%04x (%c)", from,
       
   860 			   (fc ? fc : ' '), to, (tc ? tc : ' ') );
       
   861 		}
       
   862 	    }
       
   863 	    p++;
       
   864 	}
       
   865 	else switch ( *p++ ) {
       
   866 	    case PWS:
       
   867 		qDebug( "\tPWS" );
       
   868 		break;
       
   869 	    case PDG:
       
   870 		qDebug( "\tPDG" );
       
   871 		break;
       
   872 	    case BOL:
       
   873 		qDebug( "\tBOL" );
       
   874 		break;
       
   875 	    case EOL:
       
   876 		qDebug( "\tEOL" );
       
   877 		break;
       
   878 	    case BOW:
       
   879 		qDebug( "\tBOW" );
       
   880 		break;
       
   881 	    case EOW:
       
   882 		qDebug( "\tEOW" );
       
   883 		break;
       
   884 	    case ANY:
       
   885 		qDebug( "\tANY" );
       
   886 		break;
       
   887 	    case CLO:
       
   888 		qDebug( "\tCLO" );
       
   889 		p = dump( p );
       
   890 		break;
       
   891 	    case OPT:
       
   892 		qDebug( "\tOPT" );
       
   893 		p = dump( p );
       
   894 		break;
       
   895 	}
       
   896     }
       
   897     qDebug( "\tEND" );
       
   898     return p+1;
       
   899 }
       
   900 #endif // DEBUG
       
   901 
       
   902 
       
   903 static const int maxlen = 1024;			// max length of regexp array
       
   904 static uint rxarray[ maxlen ];			// tmp regexp array
       
   905 
       
   906 /*!
       
   907   \internal
       
   908   Compiles the regular expression and stores the result in rxdata.
       
   909   The 'error' flag is set to non-zero if an error is detected.
       
   910   NOTE! This function is not reentrant!
       
   911 */
       
   912 
       
   913 void QRegExp::compile()
       
   914 {
       
   915     if ( rxdata ) {				// delete old data
       
   916 	delete [] rxdata;
       
   917 	rxdata = 0;
       
   918     }
       
   919     if ( rxstring.isEmpty() ) {			// no regexp pattern set
       
   920 	error = PatNull;
       
   921 	return;
       
   922     }
       
   923 
       
   924     error = PatOk;				// assume pattern is ok
       
   925 
       
   926     QString pattern;
       
   927     if ( wc )
       
   928 	pattern = wc2rx(rxstring);
       
   929     else
       
   930 	pattern = rxstring;
       
   931     const QChar *start = pattern.unicode();	// pattern pointer
       
   932     const QChar *p = start;			// pattern pointer
       
   933     uint pl = pattern.length();
       
   934     uint *d = rxarray;				// data pointer
       
   935     uint *prev_d = 0;
       
   936 
       
   937 #define GEN(x)	*d++ = (x)
       
   938 
       
   939     while ( pl ) {
       
   940 	char ch = (char)*p;
       
   941 	switch ( ch ) {
       
   942 
       
   943 	    case '^':				// beginning of line
       
   944 		prev_d = d;
       
   945 		GEN( p == start ? BOL : (CHR | ch) );
       
   946 		p++;
       
   947 		pl--;
       
   948 		break;
       
   949 
       
   950 	    case '$':				// end of line
       
   951 		prev_d = d;
       
   952 		GEN( pl == 1 ? EOL : (CHR | ch) );
       
   953 		p++;
       
   954 		pl--;
       
   955 		break;
       
   956 
       
   957 	    case '.':				// any char
       
   958 		prev_d = d;
       
   959 		GEN( ANY );
       
   960 		p++;
       
   961 		pl--;
       
   962 		break;
       
   963 
       
   964 	    case '[':				// character class
       
   965 		{
       
   966 		prev_d = d;
       
   967 		p++;
       
   968 		pl--;
       
   969 		if ( !pl ) {
       
   970 		    error = PatSyntax;
       
   971 		    return;
       
   972 		}
       
   973 		bool firstIsEscaped = ( (char)*p == '\\' );
       
   974 		uint cch = char_val( &p, &pl );
       
   975 		if ( cch == '^' && !firstIsEscaped ) {	// negate!
       
   976 		    GEN( CCN );
       
   977 		    if ( !pl ) {
       
   978 			error = PatSyntax;
       
   979 			return;
       
   980 		    }
       
   981 		    cch = char_val( &p, &pl );
       
   982 		} else {
       
   983 		    GEN( CCL );
       
   984 		}
       
   985 		uint numFields = 0;
       
   986 		while ( pl ) {
       
   987 		    if ((pl>2) && ((char)*p == '-') && ((char)*(p+1) != ']')) {
       
   988 			// Found a range
       
   989 		       	char_val( &p, &pl ); // Read the '-'
       
   990 			uint cch2 = char_val( &p, &pl ); // Read the range end
       
   991 			if ( cch > cch2 ) { 		// swap start and stop
       
   992 			    int tmp = cch;
       
   993 			    cch = cch2;
       
   994 			    cch2 = tmp;
       
   995 			}
       
   996 			GEN( (cch << 16) | cch2 );	// from < to
       
   997 			numFields++;
       
   998 		    }
       
   999 		    else {
       
  1000 			// Found a single character
       
  1001 			if ( cch & MCD ) // It's a code; will not be mistaken
       
  1002 			    GEN( cch );	 // for a range, since from > to
       
  1003 			else
       
  1004 			    GEN( (cch << 16) | cch ); // from == to range
       
  1005 			numFields++;
       
  1006 		    }
       
  1007 		    if ( d >= rxarray + maxlen ) {	// pattern too long
       
  1008 			error = PatOverflow;		
       
  1009 			return;
       
  1010 		    }
       
  1011 		    if ( !pl ) {		// At least ']' should be left
       
  1012 			error = PatSyntax;
       
  1013 			return;
       
  1014 		    }
       
  1015 		    bool nextIsEscaped = ( (char)*p == '\\' );
       
  1016 		    cch = char_val( &p, &pl );
       
  1017 		    if ( cch == (uint)']' && !nextIsEscaped )
       
  1018 			break;
       
  1019 		    if ( !pl ) {		// End, should have seen ']'
       
  1020 			error = PatSyntax;
       
  1021 			return;
       
  1022 		    }
       
  1023 		}
       
  1024 		*prev_d |= numFields;		// Store number of fields
       
  1025 		}
       
  1026 		break;
       
  1027 
       
  1028 	    case '*':				// Kleene closure, or
       
  1029 	    case '+':				// positive closure, or
       
  1030 	    case '?':				// optional closure
       
  1031 		{
       
  1032 		if ( prev_d == 0 ) {		// no previous expression
       
  1033 		    error = PatSyntax;		// empty closure
       
  1034 		    return;
       
  1035 		}
       
  1036 		switch ( *prev_d ) {		// test if invalid closure
       
  1037 		    case BOL:
       
  1038 		    case BOW:
       
  1039 		    case EOW:
       
  1040 		    case CLO:
       
  1041 		    case OPT:
       
  1042 			error = PatSyntax;
       
  1043 			return;
       
  1044 		}
       
  1045 		int ddiff = d - prev_d;
       
  1046 		if ( *p == '+' ) {		// convert to Kleene closure
       
  1047 		    if ( d + ddiff >= rxarray + maxlen ) {
       
  1048 			error = PatOverflow;	// pattern too long
       
  1049 			return;
       
  1050 		    }
       
  1051 		    memcpy( d, prev_d, ddiff*sizeof(uint) );
       
  1052 		    d += ddiff;
       
  1053 		    prev_d += ddiff;
       
  1054 		}
       
  1055 		memmove( prev_d+1, prev_d, ddiff*sizeof(uint) );
       
  1056 		*prev_d = ch == '?' ? OPT : CLO;
       
  1057 		d++;
       
  1058 		GEN( END );
       
  1059 		p++;
       
  1060 		pl--;
       
  1061 		}
       
  1062 		break;
       
  1063 
       
  1064 	    default:
       
  1065 		{
       
  1066 		prev_d = d;
       
  1067 		uint cv = char_val( &p, &pl );
       
  1068 		if ( cv & MCD ) {			// It's a code
       
  1069 		    GEN( cv );
       
  1070 		}
       
  1071 		else {
       
  1072 		    if ( !cs && cv <= 0xff )		// #only 8bit support
       
  1073 			cv = tolower( cv );
       
  1074 		    GEN( CHR | cv );
       
  1075 		}
       
  1076 		}
       
  1077 	}
       
  1078 	if ( d >= rxarray + maxlen ) {		// oops!
       
  1079 	    error = PatOverflow;		// pattern too long
       
  1080 	    return;
       
  1081 	}
       
  1082     }
       
  1083     GEN( END );
       
  1084     int len = d - rxarray;
       
  1085     rxdata = new uint[ len ];			// copy from rxarray to rxdata
       
  1086     CHECK_PTR( rxdata );
       
  1087     memcpy( rxdata, rxarray, len*sizeof(uint) );
       
  1088 #if defined(DEBUG)
       
  1089     //dump( rxdata );	// uncomment this line for debugging
       
  1090 #endif
       
  1091 }