|
1 /**************************************************************************** |
|
2 ** |
|
3 ** |
|
4 ** Implementation of QRegExp class |
|
5 ** |
|
6 ** Created : 950126 |
|
7 ** |
|
8 ** Copyright (C) 1992-2000 Trolltech AS. All rights reserved. |
|
9 ** |
|
10 ** This file is part of the tools module of the Qt GUI Toolkit. |
|
11 ** |
|
12 ** This file may be distributed under the terms of the Q Public License |
|
13 ** as defined by Trolltech AS of Norway and appearing in the file |
|
14 ** LICENSE.QPL included in the packaging of this file. |
|
15 ** |
|
16 ** This file may be distributed and/or modified under the terms of the |
|
17 ** GNU General Public License version 2 as published by the Free Software |
|
18 ** Foundation and appearing in the file LICENSE.GPL included in the |
|
19 ** packaging of this file. |
|
20 ** |
|
21 ** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition |
|
22 ** licenses may use this file in accordance with the Qt Commercial License |
|
23 ** Agreement provided with the Software. |
|
24 ** |
|
25 ** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE |
|
26 ** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. |
|
27 ** |
|
28 ** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for |
|
29 ** information about Qt Commercial License Agreements. |
|
30 ** See http://www.trolltech.com/qpl/ for QPL licensing information. |
|
31 ** See http://www.trolltech.com/gpl/ for GPL licensing information. |
|
32 ** |
|
33 ** Contact info@trolltech.com if any conditions of this licensing are |
|
34 ** not clear to you. |
|
35 ** |
|
36 **********************************************************************/ |
|
37 |
|
38 #include "qregexp.h" |
|
39 #include <ctype.h> |
|
40 #include <stdlib.h> |
|
41 |
|
42 // NOT REVISED |
|
43 /*! |
|
44 \class QRegExp qregexp.h |
|
45 \ingroup tools |
|
46 \ingroup misc |
|
47 \brief The QRegExp class provides pattern matching using regular |
|
48 expressions or wildcards. |
|
49 |
|
50 QRegExp knows these regexp primitives: |
|
51 <ul plain> |
|
52 <li><dfn>c</dfn> matches the character 'c' |
|
53 <li><dfn>.</dfn> matches any character |
|
54 <li><dfn>^</dfn> matches start of input |
|
55 <li><dfn>$</dfn> matches end of input |
|
56 <li><dfn>[]</dfn> matches a defined set of characters - see below. |
|
57 <li><dfn>a*</dfn> matches a sequence of zero or more a's |
|
58 <li><dfn>a+</dfn> matches a sequence of one or more a's |
|
59 <li><dfn>a?</dfn> matches an optional a |
|
60 <li><dfn>\c</dfn> escape code for matching special characters such |
|
61 as \, [, *, +, . etc. |
|
62 <li><dfn>\t</dfn> matches the TAB character (9) |
|
63 <li><dfn>\n</dfn> matches newline (10) |
|
64 <li><dfn>\r</dfn> matches return (13) |
|
65 <li><dfn>\s</dfn> matches a white space (defined as any character |
|
66 for which QChar::isSpace() returns TRUE. This includes at least |
|
67 ASCII characters 9 (TAB), 10 (LF), 11 (VT), 12(FF), 13 (CR) and 32 |
|
68 (Space)). |
|
69 <li><dfn>\d</dfn> matches a digit (defined as any character for |
|
70 which QChar::isDigit() returns TRUE. This includes at least ASCII |
|
71 characters '0'-'9'). |
|
72 <li><dfn>\x1f6b</dfn> matches the character with unicode point U1f6b |
|
73 (hexadecimal 1f6b). \x0012 will match the ASCII/Latin1 character |
|
74 0x12 (18 decimal, 12 hexadecimal). |
|
75 <li><dfn>\022</dfn> matches the ASCII/Latin1 character 022 (18 |
|
76 decimal, 22 octal). |
|
77 </ul> |
|
78 |
|
79 In wildcard mode, it only knows four primitives: |
|
80 <ul plain> |
|
81 <li><dfn>c</dfn> matches the character 'c' |
|
82 <li><dfn>?</dfn> matches any character |
|
83 <li><dfn>*</dfn> matches any sequence of characters |
|
84 <li><dfn>[]</dfn> matches a defined set of characters - see below. |
|
85 </ul> |
|
86 |
|
87 QRegExp supports Unicode both in the pattern strings and in the |
|
88 strings to be matched. |
|
89 |
|
90 When writing regular expressions in C++ code, remember that C++ |
|
91 processes \ characters. So in order to match e.g. a "." character, |
|
92 you must write "\\." in C++ source, not "\.". |
|
93 |
|
94 A character set matches a defined set of characters. For example, |
|
95 [BSD] matches any of 'B', 'D' and 'S'. Within a character set, the |
|
96 special characters '.', '*', '?', '^', '$', '+' and '[' lose their |
|
97 special meanings. The following special characters apply: |
|
98 <ul plain> |
|
99 <li><dfn>^</dfn> When placed first in the list, changes the |
|
100 character set to match any character \e not in the list. To include |
|
101 the character '^' itself in the set, escape it or place it anywhere |
|
102 but first. |
|
103 <li><dfn>-</dfn> Defines a range of characters. To include the |
|
104 character '-' itself in the set, escape it or place it last. |
|
105 <li><dfn>]</dfn> Ends the character set definition. To include the |
|
106 character ']' itself in the set, escape it or place it first (but |
|
107 after the negation operator '^', if present) |
|
108 </ul> |
|
109 Thus, [a-zA-Z0-9.] matches upper and lower case ASCII letters, |
|
110 digits and dot; and [^\s] matches everything except white space. |
|
111 |
|
112 \bug Case insensitive matching is not supported for non-ASCII/Latin1 |
|
113 (non-8bit) characters. Any character with a non-zero QChar.row() is |
|
114 matched case sensitively even if the QRegExp is in case insensitive |
|
115 mode. |
|
116 |
|
117 \note In Qt 3.0, the language of regular expressions will contain |
|
118 five more special characters, namely '(', ')', '{', '|' and '}'. To |
|
119 ease porting, it's a good idea to escape these characters with a |
|
120 backslash in all the regular expressions you'll write from now on. |
|
121 */ |
|
122 |
|
123 |
|
124 // |
|
125 // The regexp pattern is internally represented as an array of uints, |
|
126 // each element containing a 16-bit character or a 32-bit code |
|
127 // (listed below). User-defined character classes (e.g. [a-zA-Z]) |
|
128 // are encoded as this: |
|
129 // uint no: 1 2 3 ... |
|
130 // value: CCL | n from | to from | to |
|
131 // |
|
132 // where n is the (16-bit) number of following range definitions and |
|
133 // from and to define the ranges inclusive. from <= to is always true, |
|
134 // otherwise it is a built-in charclass (Pxx, eg \s - PWS). Single |
|
135 // characters in the class are coded as from==to. Negated classes |
|
136 // (e.g. [^a-z]) use CCN instead of CCL. |
|
137 |
|
138 const uint END = 0x00000000; |
|
139 const uint PWS = 0x10010000; // predef charclass: whitespace (\s) |
|
140 const uint PDG = 0x10020000; // predef charclass: digit (\d) |
|
141 const uint CCL = 0x20010000; // character class [] |
|
142 const uint CCN = 0x20020000; // neg character class [^] |
|
143 const uint CHR = 0x40000000; // character |
|
144 const uint BOL = 0x80010000; // beginning of line ^ |
|
145 const uint EOL = 0x80020000; // end of line $ |
|
146 const uint BOW = 0x80030000; // beginning of word \< |
|
147 const uint EOW = 0x80040000; // end of word \> |
|
148 const uint ANY = 0x80050000; // any character . |
|
149 const uint CLO = 0x80070000; // Kleene closure * |
|
150 const uint OPT = 0x80080000; // Optional closure ? |
|
151 |
|
152 const uint MCC = 0x20000000; // character class bitmask |
|
153 const uint MCD = 0xffff0000; // code mask |
|
154 const uint MVL = 0x0000ffff; // value mask |
|
155 |
|
156 // |
|
157 // QRegExp::error codes (internal) |
|
158 // |
|
159 |
|
// Values stored in QRegExp::error by compile().
const int PatOk       = 0;              // pattern compiled without problems
const int PatNull     = 1;              // pattern string is empty
const int PatSyntax   = 2;              // pattern is malformed
const int PatOverflow = 4;              // compiled form exceeds the buffer
|
164 |
|
165 |
|
166 /***************************************************************************** |
|
167 QRegExp member functions |
|
168 *****************************************************************************/ |
|
169 |
|
170 /*! |
|
171 Constructs an empty regular expression. |
|
172 */ |
|
173 |
|
174 QRegExp::QRegExp() |
|
175 { |
|
176 rxdata = 0; |
|
177 cs = TRUE; |
|
178 wc = FALSE; |
|
179 error = PatOk; |
|
180 } |
|
181 |
|
182 /*! |
|
183 Constructs a regular expression. |
|
184 |
|
185 \arg \e pattern is the regular expression pattern string. |
|
186 \arg \e caseSensitive specifies whether or not to use case sensitive |
|
187 matching. |
|
188 \arg \e wildcard specifies whether the pattern string should be used for |
|
189 wildcard matching (also called globbing expression), normally used for |
|
190 matching file names. |
|
191 |
|
192 \sa setWildcard() |
|
193 */ |
|
194 |
|
195 QRegExp::QRegExp( const QString &pattern, bool caseSensitive, bool wildcard ) |
|
196 { |
|
197 rxstring = pattern; |
|
198 rxdata = 0; |
|
199 cs = caseSensitive; |
|
200 wc = wildcard; |
|
201 compile(); |
|
202 } |
|
203 |
|
204 /*! |
|
205 Constructs a regular expression which is a copy of \e r. |
|
206 \sa operator=(const QRegExp&) |
|
207 */ |
|
208 |
|
209 QRegExp::QRegExp( const QRegExp &r ) |
|
210 { |
|
211 rxstring = r.pattern(); |
|
212 rxdata = 0; |
|
213 cs = r.caseSensitive(); |
|
214 wc = r.wildcard(); |
|
215 compile(); |
|
216 } |
|
217 |
|
218 /*! |
|
219 Destructs the regular expression and cleans up its internal data. |
|
220 */ |
|
221 |
|
222 QRegExp::~QRegExp() |
|
223 { |
|
224 if ( rxdata ) // Avoid purify complaints |
|
225 delete [] rxdata; |
|
226 } |
|
227 |
|
228 /*! |
|
229 Copies the regexp \e r and returns a reference to this regexp. |
|
230 The case sensitivity and wildcard options are copied, as well. |
|
231 */ |
|
232 |
|
233 QRegExp &QRegExp::operator=( const QRegExp &r ) |
|
234 { |
|
235 rxstring = r.rxstring; |
|
236 cs = r.cs; |
|
237 wc = r.wc; |
|
238 compile(); |
|
239 return *this; |
|
240 } |
|
241 |
|
242 /*! |
|
243 \obsolete |
|
244 Consider using setPattern() instead of this method. |
|
245 |
|
246 Sets the pattern string to \e pattern and returns a reference to this regexp. |
|
247 The case sensitivity or wildcard options do not change. |
|
248 */ |
|
249 |
|
250 QRegExp &QRegExp::operator=( const QString &pattern ) |
|
251 { |
|
252 rxstring = pattern; |
|
253 compile(); |
|
254 return *this; |
|
255 } |
|
256 |
|
257 |
|
258 /*! |
|
259 Returns TRUE if this regexp is equal to \e r. |
|
260 |
|
261 Two regexp objects are equal if they have equal pattern strings, |
|
262 case sensitivity options and wildcard options. |
|
263 */ |
|
264 |
|
265 bool QRegExp::operator==( const QRegExp &r ) const |
|
266 { |
|
267 return rxstring == r.rxstring && cs == r.cs && wc == r.wc; |
|
268 } |
|
269 |
|
270 /*! |
|
271 \fn bool QRegExp::operator!=( const QRegExp &r ) const |
|
272 |
|
273 Returns TRUE if this regexp is \e not equal to \e r. |
|
274 |
|
275 \sa operator==() |
|
276 */ |
|
277 |
|
278 /*! |
|
279 \fn bool QRegExp::isEmpty() const |
|
280 Returns TRUE if the regexp is empty. |
|
281 */ |
|
282 |
|
283 /*! |
|
284 \fn bool QRegExp::isValid() const |
|
285 Returns TRUE if the regexp is valid, or FALSE if it is invalid. |
|
286 |
|
287 The pattern "[a-z" is an example of an invalid pattern, since it lacks a |
|
288 closing bracket. |
|
289 */ |
|
290 |
|
291 |
|
292 /*! |
|
293 \fn bool QRegExp::wildcard() const |
|
294 Returns TRUE if wildcard mode is on, otherwise FALSE. \sa setWildcard(). |
|
295 */ |
|
296 |
|
297 /*! |
|
298 Sets the wildcard option for the regular expression. The default |
|
299 is FALSE. |
|
300 |
|
301 Setting \e wildcard to TRUE makes it convenient to match filenames |
|
302 instead of plain text. |
|
303 |
|
304 For example, "qr*.cpp" matches the string "qregexp.cpp" in wildcard mode, |
|
305 but not "qicpp" (which would be matched in normal mode). |
|
306 |
|
307 \sa wildcard() |
|
308 */ |
|
309 |
|
310 void QRegExp::setWildcard( bool wildcard ) |
|
311 { |
|
312 if ( wildcard != wc ) { |
|
313 wc = wildcard; |
|
314 compile(); |
|
315 } |
|
316 } |
|
317 |
|
318 /*! |
|
319 \fn bool QRegExp::caseSensitive() const |
|
320 |
|
321 Returns TRUE if case sensitivity is enabled, otherwise FALSE. The |
|
322 default is TRUE. |
|
323 |
|
324 \sa setCaseSensitive() |
|
325 */ |
|
326 |
|
327 /*! |
|
328 Enables or disables case sensitive matching. |
|
329 |
|
330 In case sensitive mode, "a.e" matches "axe" but not "Axe". |
|
331 |
|
332 See also: caseSensitive() |
|
333 */ |
|
334 |
|
335 void QRegExp::setCaseSensitive( bool enable ) |
|
336 { |
|
337 if ( cs != enable ) { |
|
338 cs = enable; |
|
339 compile(); |
|
340 } |
|
341 } |
|
342 |
|
343 |
|
344 /*! |
|
345 \fn QString QRegExp::pattern() const |
|
346 Returns the pattern string of the regexp. |
|
347 */ |
|
348 |
|
349 |
|
350 /*! |
|
351 \fn void QRegExp::setPattern(const QString & pattern) |
|
352 Sets the pattern string to \a pattern. |
|
353 The case sensitivity or wildcard options do not change. |
|
354 */ |
|
355 |
|
// Returns true if x is a word character: an 8-bit alphanumeric
// character (as classified by isalnum()) or an underscore.
//
// isalnum() is only defined for arguments in the range of unsigned char
// (or EOF).  A Latin1 character that arrives here through a plain,
// possibly signed, char is negative, so it is mapped back to 0..255
// first; anything still outside that range is not a word character.
static inline bool iswordchar( int x )
{
    if ( x < 0 && x >= -128 )
        x += 256;                       // undo sign extension of a char
    if ( x < 0 || x > 0xff )
        return false;                   //# Only 8-bit support
    return isalnum( x ) != 0 || x == '_';
}
|
360 |
|
361 |
|
362 /*! |
|
363 \internal |
|
364 Match character class |
|
365 */ |
|
366 |
|
367 static bool matchcharclass( uint *rxd, QChar c ) |
|
368 { |
|
369 uint *d = rxd; |
|
370 uint clcode = *d & MCD; |
|
371 bool neg = clcode == CCN; |
|
372 if ( clcode != CCL && clcode != CCN) |
|
373 qWarning("QRegExp: Internal error, please report to qt-bugs@trolltech.com"); |
|
374 uint numFields = *d & MVL; |
|
375 uint cval = (((uint)(c.row())) << 8) | ((uint)c.cell()); |
|
376 bool found = FALSE; |
|
377 for ( int i = 0; i < (int)numFields; i++ ) { |
|
378 d++; |
|
379 if ( *d == PWS && c.isSpace() ) { |
|
380 found = TRUE; |
|
381 break; |
|
382 } |
|
383 if ( *d == PDG && c.isDigit() ) { |
|
384 found = TRUE; |
|
385 break; |
|
386 } |
|
387 else { |
|
388 uint from = ( *d & MCD ) >> 16; |
|
389 uint to = *d & MVL; |
|
390 if ( (cval >= from) && (cval <= to) ) { |
|
391 found = TRUE; |
|
392 break; |
|
393 } |
|
394 } |
|
395 } |
|
396 return neg ? !found : found; |
|
397 } |
|
398 |
|
399 |
|
400 |
|
401 /* |
|
402 Internal: Recursively match string. |
|
403 */ |
|
404 |
|
405 static int matchstring( uint *rxd, const QChar *str, uint strlength, |
|
406 const QChar *bol, bool cs ) |
|
407 { |
|
408 const QChar *p = str; |
|
409 const QChar *start = p; |
|
410 uint pl = strlength; |
|
411 uint *d = rxd; |
|
412 |
|
413 //### in all cases here: handle pl == 0! (don't read past strlen) |
|
414 while ( *d ) { |
|
415 if ( *d & CHR ) { // match char |
|
416 if ( !pl ) |
|
417 return -1; |
|
418 QChar c( *d ); |
|
419 if ( !cs && !c.row() ) { // case insensitive, #Only 8bit |
|
420 if ( p->row() || tolower(p->cell()) != c.cell() ) |
|
421 return -1; |
|
422 p++; |
|
423 pl--; |
|
424 } else { // case insensitive |
|
425 if ( *p != c ) |
|
426 return -1; |
|
427 p++; |
|
428 pl--; |
|
429 } |
|
430 d++; |
|
431 } |
|
432 else if ( *d & MCC ) { // match char class |
|
433 if ( !pl ) |
|
434 return -1; |
|
435 if ( !matchcharclass( d, *p ) ) |
|
436 return -1; |
|
437 p++; |
|
438 pl--; |
|
439 d += (*d & MVL) + 1; |
|
440 } |
|
441 else switch ( *d++ ) { |
|
442 case PWS: // match whitespace |
|
443 if ( !pl || !p->isSpace() ) |
|
444 return -1; |
|
445 p++; |
|
446 pl--; |
|
447 break; |
|
448 case PDG: // match digits |
|
449 if ( !pl || !p->isDigit() ) |
|
450 return -1; |
|
451 p++; |
|
452 pl--; |
|
453 break; |
|
454 case ANY: // match anything |
|
455 if ( !pl ) |
|
456 return -1; |
|
457 p++; |
|
458 pl--; |
|
459 break; |
|
460 case BOL: // match beginning of line |
|
461 if ( p != bol ) |
|
462 return -1; |
|
463 break; |
|
464 case EOL: // match end of line |
|
465 if ( pl ) |
|
466 return -1; |
|
467 break; |
|
468 case BOW: // match beginning of word |
|
469 if ( !iswordchar(*p) || (p > bol && iswordchar(*(p-1)) ) ) |
|
470 return -1; |
|
471 break; |
|
472 case EOW: // match end of word |
|
473 if ( iswordchar(*p) || p == bol || !iswordchar(*(p-1)) ) |
|
474 return -1; |
|
475 break; |
|
476 case CLO: // Kleene closure |
|
477 { |
|
478 const QChar *first_p = p; |
|
479 if ( *d & CHR ) { // match char |
|
480 QChar c( *d ); |
|
481 if ( !cs && !c.row() ) { // case insensitive, #only 8bit |
|
482 while ( pl && !p->row() && tolower(p->cell())==c.cell() ) { |
|
483 p++; |
|
484 pl--; |
|
485 } |
|
486 } |
|
487 else { // case sensitive |
|
488 while ( pl && *p == c ) { |
|
489 p++; |
|
490 pl--; |
|
491 } |
|
492 } |
|
493 d++; |
|
494 } |
|
495 else if ( *d & MCC ) { // match char class |
|
496 while( pl && matchcharclass( d, *p ) ) { |
|
497 p++; |
|
498 pl--; |
|
499 } |
|
500 d += (*d & MVL) + 1; |
|
501 } |
|
502 else if ( *d == PWS ) { |
|
503 while ( pl && p->isSpace() ) { |
|
504 p++; |
|
505 pl--; |
|
506 } |
|
507 d++; |
|
508 } |
|
509 else if ( *d == PDG ) { |
|
510 while ( pl && p->isDigit() ) { |
|
511 p++; |
|
512 pl--; |
|
513 } |
|
514 d++; |
|
515 } |
|
516 else if ( *d == ANY ) { |
|
517 p += pl; |
|
518 pl = 0; |
|
519 d++; |
|
520 } |
|
521 else { |
|
522 return -1; // error |
|
523 } |
|
524 d++; // skip CLO's END |
|
525 while ( p >= first_p ) { // go backwards |
|
526 int end = matchstring( d, p, pl, bol, cs ); |
|
527 if ( end >= 0 ) |
|
528 return ( p - start ) + end; |
|
529 if ( !p ) |
|
530 return -1; |
|
531 --p; |
|
532 ++pl; |
|
533 } |
|
534 } |
|
535 return -1; |
|
536 case OPT: // optional closure |
|
537 { |
|
538 const QChar *first_p = p; |
|
539 if ( *d & CHR ) { // match char |
|
540 QChar c( *d ); |
|
541 if ( !cs && !c.row() ) { // case insensitive, #only 8bit |
|
542 if ( pl && !p->row() && tolower(p->cell()) == c.cell() ) { |
|
543 p++; |
|
544 pl--; |
|
545 } |
|
546 } |
|
547 else { // case sensitive |
|
548 if ( pl && *p == c ) { |
|
549 p++; |
|
550 pl--; |
|
551 } |
|
552 } |
|
553 d++; |
|
554 } |
|
555 else if ( *d & MCC ) { // match char class |
|
556 if ( pl && matchcharclass( d, *p ) ) { |
|
557 p++; |
|
558 pl--; |
|
559 } |
|
560 d += (*d & MVL) + 1; |
|
561 } |
|
562 else if ( *d == PWS ) { |
|
563 if ( pl && p->isSpace() ) { |
|
564 p++; |
|
565 pl--; |
|
566 } |
|
567 d++; |
|
568 } |
|
569 else if ( *d == PDG ) { |
|
570 if ( pl && p->isDigit() ) { |
|
571 p++; |
|
572 pl--; |
|
573 } |
|
574 d++; |
|
575 } |
|
576 else if ( *d == ANY ) { |
|
577 if ( pl ) { |
|
578 p++; |
|
579 pl--; |
|
580 } |
|
581 d++; |
|
582 } |
|
583 else { |
|
584 return -1; // error |
|
585 } |
|
586 d++; // skip OPT's END |
|
587 while ( p >= first_p ) { // go backwards |
|
588 int end = matchstring( d, p, pl, bol, cs ); |
|
589 if ( end >= 0 ) |
|
590 return ( p - start ) + end; |
|
591 if ( !p ) |
|
592 return -1; |
|
593 --p; |
|
594 ++pl; |
|
595 } |
|
596 } |
|
597 return -1; |
|
598 |
|
599 default: // error |
|
600 return -1; |
|
601 } |
|
602 } |
|
603 return p - start; |
|
604 } |
|
605 |
|
606 |
|
607 /*! |
|
608 \internal |
|
609 Recursively match string. |
|
610 */ |
|
611 |
|
612 // This is obsolete now, but since it is protected (not private), it |
|
613 // is still implemented on the off-chance that somebody has made a |
|
614 // class derived from QRegExp and calls this directly. |
|
615 // Qt 3.0: Remove this? |
|
616 |
|
617 |
|
618 const QChar *QRegExp::matchstr( uint *rxd, const QChar *str, uint strlength, |
|
619 const QChar *bol ) const |
|
620 { |
|
621 int len = matchstring( rxd, str, strlength, bol, cs ); |
|
622 if ( len < 0 ) |
|
623 return 0; |
|
624 return str + len; |
|
625 } |
|
626 |
|
627 /*! |
|
628 Attempts to match in \e str, starting from position \e index. |
|
629 Returns the position of the match, or -1 if there was no match. |
|
630 |
|
631 If \e len is not a null pointer, the length of the match is stored in |
|
632 \e *len. |
|
633 |
|
634 If \e indexIsStart is TRUE (the default), the position \e index in |
|
635 the string will match the start-of-input primitive (^) in the |
|
636 regexp, if present. Otherwise, position 0 in \e str will match. |
|
637 |
|
638 Example: |
|
639 \code |
|
640 QRegExp r("[0-9]*\\.[0-9]+"); // matches floating point |
|
641 int len; |
|
642 r.match("pi = 3.1416", 0, &len); // returns 5, len == 6 |
|
643 \endcode |
|
644 |
|
645 \note In Qt 3.0, this function will be replaced by find(). |
|
646 */ |
|
647 |
|
648 int QRegExp::match( const QString &str, int index, int *len, |
|
649 bool indexIsStart ) const |
|
650 { |
|
651 if ( !isValid() || isEmpty() ) |
|
652 return -1; |
|
653 if ( str.length() < (uint)index ) |
|
654 return -1; |
|
655 const QChar *start = str.unicode(); |
|
656 const QChar *p = start + index; |
|
657 uint pl = str.length() - index; |
|
658 uint *d = rxdata; |
|
659 int ep = -1; |
|
660 |
|
661 if ( *d == BOL ) { // match from beginning of line |
|
662 ep = matchstring( d, p, pl, indexIsStart ? p : start, cs ); |
|
663 } else { |
|
664 if ( *d & CHR ) { |
|
665 QChar c( *d ); |
|
666 if ( !cs && !c.row() ) { // case sensitive, # only 8bit |
|
667 while ( pl && ( p->row() || tolower(p->cell()) != c.cell() ) ) { |
|
668 p++; |
|
669 pl--; |
|
670 } |
|
671 } else { // case insensitive |
|
672 while ( pl && *p != c ) { |
|
673 p++; |
|
674 pl--; |
|
675 } |
|
676 } |
|
677 } |
|
678 while( 1 ) { // regular match |
|
679 ep = matchstring( d, p, pl, indexIsStart ? start+index : start, cs ); |
|
680 if ( ep >= 0 ) |
|
681 break; |
|
682 if ( !pl ) |
|
683 break; |
|
684 p++; |
|
685 pl--; |
|
686 } |
|
687 } |
|
688 if ( len ) |
|
689 *len = ep >= 0 ? ep : 0; // No match -> 0, for historical reasons |
|
690 return ep >= 0 ? (int)(p - start) : -1; // return index; |
|
691 } |
|
692 |
|
693 /*! \fn int QRegExp::find( const QString& str, int index ) |
|
694 |
|
695 Attempts to match in \e str, starting from position \e index. |
|
696 Returns the position of the match, or -1 if there was no match. |
|
697 |
|
698 \sa match() |
|
699 */ |
|
700 |
|
701 // |
|
702 // Translate wildcard pattern to standard regexp pattern. |
|
703 // Ex: *.cpp ==> ^.*\.cpp$ |
|
704 // |
|
705 |
|
706 static QString wc2rx( const QString &pattern ) |
|
707 { |
|
708 int patlen = (int)pattern.length(); |
|
709 QString wcpattern = QString::fromLatin1("^"); |
|
710 |
|
711 QChar c; |
|
712 for( int i = 0; i < patlen; i++ ) { |
|
713 c = pattern[i]; |
|
714 switch ( (char)c ) { |
|
715 case '*': // '*' ==> '.*' |
|
716 wcpattern += '.'; |
|
717 break; |
|
718 case '?': // '?' ==> '.' |
|
719 c = '.'; |
|
720 break; |
|
721 case '.': // quote special regexp chars |
|
722 case '+': |
|
723 case '\\': |
|
724 case '$': |
|
725 case '^': |
|
726 wcpattern += '\\'; |
|
727 break; |
|
728 case '[': |
|
729 if ( (char)pattern[i+1] == '^' ) { // don't quote '^' after '[' |
|
730 wcpattern += '['; |
|
731 c = pattern[i+1]; |
|
732 i++; |
|
733 } |
|
734 break; |
|
735 } |
|
736 wcpattern += c; |
|
737 |
|
738 } |
|
739 wcpattern += '$'; |
|
740 return wcpattern; // return new regexp pattern |
|
741 } |
|
742 |
|
743 |
|
744 // |
|
745 // Internal: Get char value and increment pointer. |
|
746 // |
|
747 |
|
748 static uint char_val( const QChar **str, uint *strlength ) // get char value |
|
749 { |
|
750 const QChar *p = *str; |
|
751 uint pl = *strlength; |
|
752 uint len = 1; |
|
753 uint v = 0; |
|
754 if ( (char)*p == '\\' ) { // escaped code |
|
755 p++; |
|
756 pl--; |
|
757 if ( !pl ) { // it is just a '\' |
|
758 (*str)++; |
|
759 (*strlength)--; |
|
760 return '\\'; |
|
761 } |
|
762 len++; // length at least 2 |
|
763 int i; |
|
764 char c; |
|
765 char ch = tolower((char)*p); |
|
766 switch ( ch ) { |
|
767 case 'b': v = '\b'; break; // bell |
|
768 case 'f': v = '\f'; break; // form feed |
|
769 case 'n': v = '\n'; break; // newline |
|
770 case 'r': v = '\r'; break; // return |
|
771 case 't': v = '\t'; break; // tab |
|
772 case 's': v = PWS; break; // whitespace charclass |
|
773 case 'd': v = PDG; break; // digit charclass |
|
774 case '<': v = BOW; break; // word beginning matcher |
|
775 case '>': v = EOW; break; // word ending matcher |
|
776 |
|
777 case 'x': { // hex code |
|
778 p++; |
|
779 pl--; |
|
780 for ( i = 0; (i < 4) && pl; i++ ) { //up to 4 hex digits |
|
781 c = tolower((char)*p); |
|
782 bool a = ( c >= 'a' && c <= 'f' ); |
|
783 if ( (c >= '0' && c <= '9') || a ) { |
|
784 v <<= 4; |
|
785 v += a ? 10 + c - 'a' : c - '0'; |
|
786 len++; |
|
787 } |
|
788 else { |
|
789 break; |
|
790 } |
|
791 p++; |
|
792 pl--; |
|
793 } |
|
794 } |
|
795 break; |
|
796 |
|
797 default: { |
|
798 if ( ch >= '0' && ch <= '7' ) { //octal code |
|
799 len--; |
|
800 for ( i = 0; (i < 3) && pl; i++ ) { // up to 3 oct digits |
|
801 c = (char)*p; |
|
802 if ( c >= '0' && c <= '7' ) { |
|
803 v <<= 3; |
|
804 v += c - '0'; |
|
805 len++; |
|
806 } |
|
807 else { |
|
808 break; |
|
809 } |
|
810 p++; |
|
811 pl--; |
|
812 } |
|
813 } |
|
814 else { // not an octal number |
|
815 v = (((uint)(p->row())) << 8) | ((uint)p->cell()); |
|
816 } |
|
817 } |
|
818 } |
|
819 } else { |
|
820 v = (((uint)(p->row())) << 8) | ((uint)p->cell()); |
|
821 } |
|
822 *str += len; |
|
823 *strlength -= len; |
|
824 return v; |
|
825 } |
|
826 |
|
827 |
|
#if defined(DEBUG)
// Debugging aid: prints the compiled pattern starting at p with
// qDebug(), one element per line, up to and including the first END.
// Closure operands are printed by a recursive call.  Returns a pointer
// to the element following that END.
static uint *dump( uint *p )
{
    for ( ;; ) {
        const uint code = *p;
        if ( code == END )
            break;

        if ( code & CHR ) {             // literal character
            QChar uc = (QChar)code;
            char c = (char)uc;
            uint u = (((uint)(uc.row())) << 8) | ((uint)uc.cell());
            qDebug( "\tCHR\tU%04x (%c)", u, (c ? c : ' '));
            p++;
            continue;
        }

        if ( code & MCC ) {             // character class and its fields
            const uint clcode = code & MCD;
            const uint numFields = code & MVL;
            if ( clcode == CCL )
                qDebug( "\tCCL\t%i", numFields );
            else if ( clcode == CCN )
                qDebug( "\tCCN\t%i", numFields );
            else
                qDebug("coding error!");
            for ( uint i = 1; i <= numFields; i++ ) {
                const uint field = p[i];
                if ( field == PWS ) {
                    qDebug( "\t\tPWS" );
                } else if ( field == PDG ) {
                    qDebug( "\t\tPDG" );
                } else {
                    uint from = ( field & MCD ) >> 16;
                    uint to = field & MVL;
                    char fc = (char)QChar(from);
                    char tc = (char)QChar(to);
                    qDebug( "\t\tU%04x (%c) - U%04x (%c)", from,
                            (fc ? fc : ' '), to, (tc ? tc : ' ') );
                }
            }
            p += numFields + 1;
            continue;
        }

        p++;                            // step over the opcode itself
        if ( code == PWS ) {
            qDebug( "\tPWS" );
        } else if ( code == PDG ) {
            qDebug( "\tPDG" );
        } else if ( code == BOL ) {
            qDebug( "\tBOL" );
        } else if ( code == EOL ) {
            qDebug( "\tEOL" );
        } else if ( code == BOW ) {
            qDebug( "\tBOW" );
        } else if ( code == EOW ) {
            qDebug( "\tEOW" );
        } else if ( code == ANY ) {
            qDebug( "\tANY" );
        } else if ( code == CLO ) {
            qDebug( "\tCLO" );
            p = dump( p );              // operand up to the closure's END
        } else if ( code == OPT ) {
            qDebug( "\tOPT" );
            p = dump( p );
        }
    }
    qDebug( "\tEND" );
    return p+1;
}
#endif // DEBUG
|
901 |
|
902 |
|
903 static const int maxlen = 1024; // max length of regexp array |
|
904 static uint rxarray[ maxlen ]; // tmp regexp array |
|
905 |
|
906 /*! |
|
907 \internal |
|
908 Compiles the regular expression and stores the result in rxdata. |
|
909 The 'error' flag is set to non-zero if an error is detected. |
|
910 NOTE! This function is not reentrant! |
|
911 */ |
|
912 |
|
913 void QRegExp::compile() |
|
914 { |
|
915 if ( rxdata ) { // delete old data |
|
916 delete [] rxdata; |
|
917 rxdata = 0; |
|
918 } |
|
919 if ( rxstring.isEmpty() ) { // no regexp pattern set |
|
920 error = PatNull; |
|
921 return; |
|
922 } |
|
923 |
|
924 error = PatOk; // assume pattern is ok |
|
925 |
|
926 QString pattern; |
|
927 if ( wc ) |
|
928 pattern = wc2rx(rxstring); |
|
929 else |
|
930 pattern = rxstring; |
|
931 const QChar *start = pattern.unicode(); // pattern pointer |
|
932 const QChar *p = start; // pattern pointer |
|
933 uint pl = pattern.length(); |
|
934 uint *d = rxarray; // data pointer |
|
935 uint *prev_d = 0; |
|
936 |
|
937 #define GEN(x) *d++ = (x) |
|
938 |
|
939 while ( pl ) { |
|
940 char ch = (char)*p; |
|
941 switch ( ch ) { |
|
942 |
|
943 case '^': // beginning of line |
|
944 prev_d = d; |
|
945 GEN( p == start ? BOL : (CHR | ch) ); |
|
946 p++; |
|
947 pl--; |
|
948 break; |
|
949 |
|
950 case '$': // end of line |
|
951 prev_d = d; |
|
952 GEN( pl == 1 ? EOL : (CHR | ch) ); |
|
953 p++; |
|
954 pl--; |
|
955 break; |
|
956 |
|
957 case '.': // any char |
|
958 prev_d = d; |
|
959 GEN( ANY ); |
|
960 p++; |
|
961 pl--; |
|
962 break; |
|
963 |
|
964 case '[': // character class |
|
965 { |
|
966 prev_d = d; |
|
967 p++; |
|
968 pl--; |
|
969 if ( !pl ) { |
|
970 error = PatSyntax; |
|
971 return; |
|
972 } |
|
973 bool firstIsEscaped = ( (char)*p == '\\' ); |
|
974 uint cch = char_val( &p, &pl ); |
|
975 if ( cch == '^' && !firstIsEscaped ) { // negate! |
|
976 GEN( CCN ); |
|
977 if ( !pl ) { |
|
978 error = PatSyntax; |
|
979 return; |
|
980 } |
|
981 cch = char_val( &p, &pl ); |
|
982 } else { |
|
983 GEN( CCL ); |
|
984 } |
|
985 uint numFields = 0; |
|
986 while ( pl ) { |
|
987 if ((pl>2) && ((char)*p == '-') && ((char)*(p+1) != ']')) { |
|
988 // Found a range |
|
989 char_val( &p, &pl ); // Read the '-' |
|
990 uint cch2 = char_val( &p, &pl ); // Read the range end |
|
991 if ( cch > cch2 ) { // swap start and stop |
|
992 int tmp = cch; |
|
993 cch = cch2; |
|
994 cch2 = tmp; |
|
995 } |
|
996 GEN( (cch << 16) | cch2 ); // from < to |
|
997 numFields++; |
|
998 } |
|
999 else { |
|
1000 // Found a single character |
|
1001 if ( cch & MCD ) // It's a code; will not be mistaken |
|
1002 GEN( cch ); // for a range, since from > to |
|
1003 else |
|
1004 GEN( (cch << 16) | cch ); // from == to range |
|
1005 numFields++; |
|
1006 } |
|
1007 if ( d >= rxarray + maxlen ) { // pattern too long |
|
1008 error = PatOverflow; |
|
1009 return; |
|
1010 } |
|
1011 if ( !pl ) { // At least ']' should be left |
|
1012 error = PatSyntax; |
|
1013 return; |
|
1014 } |
|
1015 bool nextIsEscaped = ( (char)*p == '\\' ); |
|
1016 cch = char_val( &p, &pl ); |
|
1017 if ( cch == (uint)']' && !nextIsEscaped ) |
|
1018 break; |
|
1019 if ( !pl ) { // End, should have seen ']' |
|
1020 error = PatSyntax; |
|
1021 return; |
|
1022 } |
|
1023 } |
|
1024 *prev_d |= numFields; // Store number of fields |
|
1025 } |
|
1026 break; |
|
1027 |
|
1028 case '*': // Kleene closure, or |
|
1029 case '+': // positive closure, or |
|
1030 case '?': // optional closure |
|
1031 { |
|
1032 if ( prev_d == 0 ) { // no previous expression |
|
1033 error = PatSyntax; // empty closure |
|
1034 return; |
|
1035 } |
|
1036 switch ( *prev_d ) { // test if invalid closure |
|
1037 case BOL: |
|
1038 case BOW: |
|
1039 case EOW: |
|
1040 case CLO: |
|
1041 case OPT: |
|
1042 error = PatSyntax; |
|
1043 return; |
|
1044 } |
|
1045 int ddiff = d - prev_d; |
|
1046 if ( *p == '+' ) { // convert to Kleene closure |
|
1047 if ( d + ddiff >= rxarray + maxlen ) { |
|
1048 error = PatOverflow; // pattern too long |
|
1049 return; |
|
1050 } |
|
1051 memcpy( d, prev_d, ddiff*sizeof(uint) ); |
|
1052 d += ddiff; |
|
1053 prev_d += ddiff; |
|
1054 } |
|
1055 memmove( prev_d+1, prev_d, ddiff*sizeof(uint) ); |
|
1056 *prev_d = ch == '?' ? OPT : CLO; |
|
1057 d++; |
|
1058 GEN( END ); |
|
1059 p++; |
|
1060 pl--; |
|
1061 } |
|
1062 break; |
|
1063 |
|
1064 default: |
|
1065 { |
|
1066 prev_d = d; |
|
1067 uint cv = char_val( &p, &pl ); |
|
1068 if ( cv & MCD ) { // It's a code |
|
1069 GEN( cv ); |
|
1070 } |
|
1071 else { |
|
1072 if ( !cs && cv <= 0xff ) // #only 8bit support |
|
1073 cv = tolower( cv ); |
|
1074 GEN( CHR | cv ); |
|
1075 } |
|
1076 } |
|
1077 } |
|
1078 if ( d >= rxarray + maxlen ) { // oops! |
|
1079 error = PatOverflow; // pattern too long |
|
1080 return; |
|
1081 } |
|
1082 } |
|
1083 GEN( END ); |
|
1084 int len = d - rxarray; |
|
1085 rxdata = new uint[ len ]; // copy from rxarray to rxdata |
|
1086 CHECK_PTR( rxdata ); |
|
1087 memcpy( rxdata, rxarray, len*sizeof(uint) ); |
|
1088 #if defined(DEBUG) |
|
1089 //dump( rxdata ); // uncomment this line for debugging |
|
1090 #endif |
|
1091 } |