author | Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com> |
Fri, 12 Mar 2010 15:46:37 +0200 | |
branch | RCL_3 |
changeset 5 | d3bac044e0f0 |
parent 4 | 3b1da2848fc7 |
permissions | -rw-r--r-- |
0 | 1 |
/**************************************************************************** |
2 |
** |
|
4
3b1da2848fc7
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
0
diff
changeset
|
3 |
** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). |
0 | 4 |
** All rights reserved. |
5 |
** Contact: Nokia Corporation (qt-info@nokia.com) |
|
6 |
** |
|
7 |
** This file is part of the utils of the Qt Toolkit. |
|
8 |
** |
|
9 |
** $QT_BEGIN_LICENSE:LGPL$ |
|
10 |
** No Commercial Usage |
|
11 |
** This file contains pre-release code and may not be distributed. |
|
12 |
** You may use this file in accordance with the terms and conditions |
|
13 |
** contained in the Technology Preview License Agreement accompanying |
|
14 |
** this package. |
|
15 |
** |
|
16 |
** GNU Lesser General Public License Usage |
|
17 |
** Alternatively, this file may be used under the terms of the GNU Lesser |
|
18 |
** General Public License version 2.1 as published by the Free Software |
|
19 |
** Foundation and appearing in the file LICENSE.LGPL included in the |
|
20 |
** packaging of this file. Please review the following information to |
|
21 |
** ensure the GNU Lesser General Public License version 2.1 requirements |
|
22 |
** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. |
|
23 |
** |
|
24 |
** In addition, as a special exception, Nokia gives you certain additional |
|
25 |
** rights. These rights are described in the Nokia Qt LGPL Exception |
|
26 |
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. |
|
27 |
** |
|
28 |
** If you have questions regarding the use of this file, please contact |
|
29 |
** Nokia at qt-info@nokia.com. |
|
30 |
** |
|
31 |
** |
|
32 |
** |
|
33 |
** |
|
34 |
** |
|
35 |
** |
|
36 |
** |
|
37 |
** |
|
38 |
** $QT_END_LICENSE$ |
|
39 |
** |
|
40 |
****************************************************************************/ |
|
41 |
||
42 |
#include "re2nfa.h" |
|
43 |
#include "tokenizer.cpp" |
|
44 |
||
45 |
RE2NFA::RE2NFA(const QMap<QString, NFA> ¯os, const QSet<InputType> &maxInputSet, Qt::CaseSensitivity cs) |
|
46 |
: macros(macros), index(0), errorColumn(-1), maxInputSet(maxInputSet), caseSensitivity(cs) |
|
47 |
{ |
|
48 |
} |
|
49 |
||
50 |
NFA RE2NFA::parse(const QString &expression, int *errCol) |
|
51 |
{ |
|
52 |
tokenize(expression); |
|
53 |
||
54 |
if (symbols.isEmpty()) |
|
55 |
return NFA(); |
|
56 |
||
57 |
index = 0; |
|
58 |
||
59 |
NFA result = parseExpr(); |
|
60 |
if (result.isEmpty()) { |
|
61 |
if (errCol) |
|
62 |
*errCol = errorColumn; |
|
63 |
} |
|
64 |
return result; |
|
65 |
} |
|
66 |
||
67 |
NFA RE2NFA::parseExpr() |
|
68 |
{ |
|
69 |
NFA value = parseBranch(); |
|
70 |
while (test(TOK_OR)) { |
|
71 |
NFA rhs = parseBranch(); |
|
72 |
value = NFA::createAlternatingNFA(value, rhs); |
|
73 |
} |
|
74 |
return value; |
|
75 |
} |
|
76 |
||
77 |
NFA RE2NFA::parseBranch() |
|
78 |
{ |
|
79 |
NFA value = parsePiece(); |
|
80 |
if (!hasNext()) |
|
81 |
return value; |
|
82 |
NFA next; |
|
83 |
do { |
|
84 |
next = parsePiece(); |
|
85 |
if (!next.isEmpty()) |
|
86 |
value = NFA::createConcatenatingNFA(value, next); |
|
87 |
} while (!next.isEmpty() && hasNext()); |
|
88 |
return value; |
|
89 |
} |
|
90 |
||
91 |
NFA RE2NFA::parsePiece() |
|
92 |
{ |
|
93 |
NFA atom = parseAtom(); |
|
94 |
if (atom.isEmpty() || !hasNext()) |
|
95 |
return atom; |
|
96 |
return parseMaybeQuantifier(atom); |
|
97 |
} |
|
98 |
||
99 |
NFA RE2NFA::parseAtom() |
|
100 |
{ |
|
101 |
// #### |
|
102 |
switch (next()) { |
|
103 |
case TOK_STRING: |
|
104 |
return createCharNFA(); |
|
105 |
case TOK_LPAREN: { |
|
106 |
NFA subExpr = parseExpr(); |
|
107 |
next(TOK_RPAREN); |
|
108 |
return subExpr; |
|
109 |
} |
|
110 |
case TOK_LBRACE: { |
|
111 |
QString macroName = lexemUntil(TOK_RBRACE); |
|
112 |
QMap<QString, NFA>::ConstIterator macro = macros.find(macroName); |
|
113 |
if (macro == macros.end()) { |
|
114 |
qWarning("Unknown macro '%s' - probably used before defined", qPrintable(macroName)); |
|
115 |
return NFA(); |
|
116 |
} |
|
117 |
return *macro; |
|
118 |
} |
|
119 |
case TOK_LBRACKET: { |
|
120 |
NFA set = parseSet(); |
|
121 |
next(TOK_RBRACKET); |
|
122 |
return set; |
|
123 |
} |
|
124 |
case TOK_SEQUENCE: |
|
125 |
return parseSet2(); |
|
126 |
case TOK_DOT: |
|
127 |
return NFA::createSetNFA(maxInputSet); |
|
128 |
default: |
|
129 |
prev(); |
|
130 |
return NFA(); |
|
131 |
} |
|
132 |
} |
|
133 |
||
134 |
NFA RE2NFA::parseMaybeQuantifier(const NFA &nfa) |
|
135 |
{ |
|
136 |
// #### |
|
137 |
switch (next()) { |
|
138 |
case TOK_STAR: |
|
139 |
return NFA::createOptionalNFA(nfa); |
|
140 |
case TOK_QUESTION: |
|
141 |
return NFA::createZeroOrOneNFA(nfa); |
|
142 |
case TOK_PLUS: |
|
143 |
return NFA::createConcatenatingNFA(nfa, NFA::createOptionalNFA(nfa)); |
|
144 |
case TOK_LBRACE: { |
|
145 |
const int rewind = index - 1; |
|
146 |
||
147 |
QString lexemBeforeComma; |
|
148 |
QString lexemAfterComma; |
|
149 |
bool seenComma = false; |
|
150 |
forever { |
|
151 |
if (test(TOK_COMMA)) { |
|
152 |
if (seenComma) { |
|
153 |
errorColumn = symbol().column; |
|
154 |
return NFA(); |
|
155 |
} |
|
156 |
seenComma = true; |
|
157 |
} else if (test(TOK_RBRACE)) { |
|
158 |
break; |
|
159 |
} else { |
|
160 |
next(TOK_STRING); |
|
161 |
if (seenComma) |
|
162 |
lexemAfterComma += symbol().lexem; |
|
163 |
else |
|
164 |
lexemBeforeComma += symbol().lexem; |
|
165 |
} |
|
166 |
} |
|
167 |
bool isNumber = false; |
|
168 |
int min = lexemBeforeComma.toInt(&isNumber); |
|
169 |
if (!isNumber) { |
|
170 |
index = rewind; |
|
171 |
return nfa; |
|
172 |
} |
|
173 |
int max = min; |
|
174 |
if (seenComma) { |
|
175 |
max = lexemAfterComma.toInt(&isNumber); |
|
176 |
if (!isNumber) { |
|
177 |
errorColumn = symbol().column; |
|
178 |
return NFA(); |
|
179 |
} |
|
180 |
} |
|
181 |
return NFA::applyQuantity(nfa, min, max); |
|
182 |
} |
|
183 |
default: |
|
184 |
prev(); |
|
185 |
return nfa; |
|
186 |
} |
|
187 |
} |
|
188 |
||
189 |
NFA RE2NFA::parseSet() |
|
190 |
{ |
|
191 |
QSet<InputType> set; |
|
192 |
bool negate = false; |
|
193 |
||
194 |
next(TOK_STRING); |
|
195 |
||
196 |
do { |
|
197 |
Q_ASSERT(symbol().lexem.length() == 1); |
|
198 |
// ### |
|
199 |
QChar ch = symbol().lexem.at(0); |
|
200 |
if (set.isEmpty() && ch == QLatin1Char('^')) { |
|
201 |
negate = true; |
|
202 |
continue; |
|
203 |
} |
|
204 |
||
205 |
// look ahead for ranges like a-z |
|
206 |
bool rangeFound = false; |
|
207 |
if (test(TOK_STRING)) { |
|
208 |
if (symbol().lexem.length() == 1 |
|
209 |
&& symbol().lexem.at(0) == QLatin1Char('-')) { |
|
210 |
next(TOK_STRING); |
|
211 |
Q_ASSERT(symbol().lexem.length() == 1); |
|
212 |
QChar last = symbol().lexem.at(0); |
|
213 |
||
214 |
if (ch.unicode() > last.unicode()) |
|
215 |
qSwap(ch, last); |
|
216 |
||
217 |
for (ushort i = ch.unicode(); i <= last.unicode(); ++i) { |
|
218 |
if (caseSensitivity == Qt::CaseInsensitive) { |
|
219 |
set.insert(QChar(i).toLower().unicode()); |
|
220 |
} else { |
|
221 |
set.insert(i); |
|
222 |
} |
|
223 |
} |
|
224 |
||
225 |
rangeFound = true; |
|
226 |
} else { |
|
227 |
prev(); |
|
228 |
} |
|
229 |
} |
|
230 |
||
231 |
if (!rangeFound) { |
|
232 |
if (caseSensitivity == Qt::CaseInsensitive) { |
|
233 |
set.insert(ch.toLower().unicode()); |
|
234 |
} else { |
|
235 |
set.insert(ch.unicode()); |
|
236 |
} |
|
237 |
} |
|
238 |
} while (test(TOK_STRING)); |
|
239 |
||
240 |
if (negate) { |
|
241 |
QSet<InputType> negatedSet = maxInputSet; |
|
242 |
negatedSet.subtract(set); |
|
243 |
set = negatedSet; |
|
244 |
} |
|
245 |
||
246 |
return NFA::createSetNFA(set); |
|
247 |
} |
|
248 |
||
249 |
NFA RE2NFA::parseSet2() |
|
250 |
{ |
|
251 |
QSet<InputType> set; |
|
252 |
bool negate = false; |
|
253 |
||
254 |
QString str = symbol().lexem; |
|
255 |
// strip off brackets |
|
256 |
str.chop(1); |
|
257 |
str.remove(0, 1); |
|
258 |
||
259 |
int i = 0; |
|
260 |
while (i < str.length()) { |
|
261 |
// ### |
|
262 |
QChar ch = str.at(i++); |
|
263 |
if (set.isEmpty() && ch == QLatin1Char('^')) { |
|
264 |
negate = true; |
|
265 |
continue; |
|
266 |
} |
|
267 |
||
268 |
// look ahead for ranges like a-z |
|
269 |
bool rangeFound = false; |
|
270 |
if (i < str.length() - 1 && str.at(i) == QLatin1Char('-')) { |
|
271 |
++i; |
|
272 |
QChar last = str.at(i++); |
|
273 |
||
274 |
if (ch.unicode() > last.unicode()) |
|
275 |
qSwap(ch, last); |
|
276 |
||
277 |
for (ushort i = ch.unicode(); i <= last.unicode(); ++i) { |
|
278 |
if (caseSensitivity == Qt::CaseInsensitive) { |
|
279 |
set.insert(QChar(i).toLower().unicode()); |
|
280 |
} else { |
|
281 |
set.insert(i); |
|
282 |
} |
|
283 |
} |
|
284 |
||
285 |
rangeFound = true; |
|
286 |
} |
|
287 |
||
288 |
if (!rangeFound) { |
|
289 |
if (caseSensitivity == Qt::CaseInsensitive) { |
|
290 |
set.insert(ch.toLower().unicode()); |
|
291 |
} else { |
|
292 |
set.insert(ch.unicode()); |
|
293 |
} |
|
294 |
} |
|
295 |
} |
|
296 |
||
297 |
if (negate) { |
|
298 |
QSet<InputType> negatedSet = maxInputSet; |
|
299 |
negatedSet.subtract(set); |
|
300 |
set = negatedSet; |
|
301 |
} |
|
302 |
||
303 |
return NFA::createSetNFA(set); |
|
304 |
} |
|
305 |
NFA RE2NFA::createCharNFA() |
|
306 |
{ |
|
307 |
NFA nfa; |
|
308 |
// #### |
|
309 |
if (caseSensitivity == Qt::CaseInsensitive) { |
|
310 |
nfa = NFA::createStringNFA(symbol().lexem.toLower().toLatin1()); |
|
311 |
} else { |
|
312 |
nfa = NFA::createStringNFA(symbol().lexem.toLatin1()); |
|
313 |
} |
|
314 |
return nfa; |
|
315 |
} |
|
316 |
||
317 |
static inline int skipQuote(const QString &str, int pos) |
|
318 |
{ |
|
319 |
while (pos < str.length() |
|
320 |
&& str.at(pos) != QLatin1Char('"')) { |
|
321 |
if (str.at(pos) == QLatin1Char('\\')) { |
|
322 |
++pos; |
|
323 |
if (pos >= str.length()) |
|
324 |
break; |
|
325 |
} |
|
326 |
++pos; |
|
327 |
} |
|
328 |
if (pos < str.length()) |
|
329 |
++pos; |
|
330 |
return pos; |
|
331 |
} |
|
332 |
||
333 |
#if 0 |
|
334 |
static const char*tokStr(Token t) |
|
335 |
{ |
|
336 |
switch (t) { |
|
337 |
case TOK_INVALID: return "TOK_INVALID"; |
|
338 |
case TOK_STRING: return "TOK_STRING"; |
|
339 |
case TOK_LBRACE: return "TOK_LBRACE"; |
|
340 |
case TOK_RBRACE: return "TOK_RBRACE"; |
|
341 |
case TOK_LBRACKET: return "TOK_LBRACKET"; |
|
342 |
case TOK_RBRACKET: return "TOK_RBRACKET"; |
|
343 |
case TOK_LPAREN: return "TOK_LPAREN"; |
|
344 |
case TOK_RPAREN: return "TOK_RPAREN"; |
|
345 |
case TOK_COMMA: return "TOK_COMMA"; |
|
346 |
case TOK_STAR: return "TOK_STAR"; |
|
347 |
case TOK_OR: return "TOK_OR"; |
|
348 |
case TOK_QUESTION: return "TOK_QUESTION"; |
|
349 |
case TOK_DOT: return "TOK_DOT"; |
|
350 |
case TOK_PLUS: return "TOK_PLUS"; |
|
351 |
case TOK_SEQUENCE: return "TOK_SEQUENCE"; |
|
352 |
case TOK_QUOTED_STRING: return "TOK_QUOTED_STRING"; |
|
353 |
} |
|
354 |
return ""; |
|
355 |
} |
|
356 |
#endif |
|
357 |
||
358 |
void RE2NFA::tokenize(const QString &input) |
|
359 |
{ |
|
360 |
symbols.clear(); |
|
361 |
#if 1 |
|
362 |
RegExpTokenizer tokenizer(input); |
|
363 |
Symbol sym; |
|
364 |
int tok = tokenizer.lex(); |
|
365 |
while (tok != -1) { |
|
366 |
Symbol sym; |
|
367 |
sym.token = static_cast<Token>(tok); |
|
368 |
sym.lexem = input.mid(tokenizer.lexemStart, tokenizer.lexemLength); |
|
369 |
||
370 |
if (sym.token == TOK_QUOTED_STRING) { |
|
371 |
sym.lexem.chop(1); |
|
372 |
sym.lexem.remove(0, 1); |
|
373 |
sym.token = TOK_STRING; |
|
374 |
} |
|
375 |
||
376 |
if (sym.token == TOK_STRING || sym.token == TOK_SEQUENCE) { |
|
377 |
for (int i = 0; i < sym.lexem.length(); ++i) { |
|
378 |
if (sym.lexem.at(i) == '\\') { |
|
379 |
if (i >= sym.lexem.length() - 1) |
|
380 |
break; |
|
381 |
QChar ch = sym.lexem.at(i + 1); |
|
382 |
if (ch == QLatin1Char('n')) { |
|
383 |
ch = '\n'; |
|
384 |
} else if (ch == QLatin1Char('r')) { |
|
385 |
ch = '\r'; |
|
386 |
} else if (ch == QLatin1Char('t')) { |
|
387 |
ch = '\t'; |
|
388 |
} else if (ch == QLatin1Char('f')) { |
|
389 |
ch = '\f'; |
|
390 |
} |
|
391 |
sym.lexem.replace(i, 2, ch); |
|
392 |
} |
|
393 |
} |
|
394 |
} |
|
395 |
||
396 |
/* |
|
397 |
if (sym.token == TOK_SEQUENCE) { |
|
398 |
Symbol s; |
|
399 |
s.token = TOK_LBRACKET; |
|
400 |
s.lexem = "["; |
|
401 |
symbols.append(s); |
|
402 |
||
403 |
for (int i = 1; i < sym.lexem.length() - 1; ++i) { |
|
404 |
s.token = TOK_STRING; |
|
405 |
s.lexem = sym.lexem.at(i); |
|
406 |
symbols.append(s); |
|
407 |
} |
|
408 |
||
409 |
s.token = TOK_RBRACKET; |
|
410 |
s.lexem = "]"; |
|
411 |
symbols.append(s); |
|
412 |
||
413 |
tok = tokenizer.lex(); |
|
414 |
continue; |
|
415 |
} |
|
416 |
*/ |
|
417 |
||
418 |
symbols.append(sym); |
|
419 |
tok = tokenizer.lex(); |
|
420 |
} |
|
421 |
#else |
|
422 |
int pos = 0; |
|
423 |
bool insideSet = false; |
|
424 |
while (pos < input.length()) { |
|
425 |
QChar ch = input.at(pos); |
|
426 |
||
427 |
Symbol sym; |
|
428 |
sym.column = pos; |
|
429 |
sym.token = TOK_INVALID; |
|
430 |
sym.lexem = QString(ch); |
|
431 |
switch (ch.toLatin1()) { |
|
432 |
case '"': { |
|
433 |
if (insideSet) { |
|
434 |
sym.token = TOK_STRING; |
|
435 |
sym.lexem = QString(ch); |
|
436 |
symbols += sym; |
|
437 |
++pos; |
|
438 |
continue; |
|
439 |
} |
|
440 |
if (pos + 1 >= input.length()) |
|
441 |
return; |
|
442 |
int quoteEnd = skipQuote(input, pos + 1); |
|
443 |
sym.token = TOK_STRING; |
|
444 |
sym.lexem = input.mid(pos + 1, quoteEnd - pos - 2); |
|
445 |
symbols += sym; |
|
446 |
pos = quoteEnd; |
|
447 |
continue; |
|
448 |
} |
|
449 |
case '{': |
|
450 |
sym.token = (insideSet ? TOK_STRING : TOK_LBRACE); |
|
451 |
break; |
|
452 |
case '}': |
|
453 |
sym.token = (insideSet ? TOK_STRING : TOK_RBRACE); |
|
454 |
break; |
|
455 |
case '[': |
|
456 |
insideSet = true; |
|
457 |
sym.token = TOK_LBRACKET; |
|
458 |
break; |
|
459 |
case ']': |
|
460 |
insideSet = false; |
|
461 |
sym.token = TOK_RBRACKET; |
|
462 |
break; |
|
463 |
case '(': |
|
464 |
sym.token = (insideSet ? TOK_STRING : TOK_LPAREN); |
|
465 |
break; |
|
466 |
case ')': |
|
467 |
sym.token = (insideSet ? TOK_STRING : TOK_RPAREN); |
|
468 |
break; |
|
469 |
case ',': |
|
470 |
sym.token = (insideSet ? TOK_STRING : TOK_COMMA); |
|
471 |
break; |
|
472 |
case '*': |
|
473 |
sym.token = (insideSet ? TOK_STRING : TOK_STAR); |
|
474 |
break; |
|
475 |
case '|': |
|
476 |
sym.token = (insideSet ? TOK_STRING : TOK_OR); |
|
477 |
break; |
|
478 |
case '?': |
|
479 |
sym.token = (insideSet ? TOK_STRING : TOK_QUESTION); |
|
480 |
break; |
|
481 |
case '.': |
|
482 |
sym.token = (insideSet ? TOK_STRING : TOK_DOT); |
|
483 |
break; |
|
484 |
case '+': |
|
485 |
sym.token = (insideSet ? TOK_STRING : TOK_PLUS); |
|
486 |
break; |
|
487 |
case '\\': |
|
488 |
++pos; |
|
489 |
if (pos >= input.length()) |
|
490 |
return; |
|
491 |
ch = input.at(pos); |
|
492 |
if (ch == QLatin1Char('n')) { |
|
493 |
ch = '\n'; |
|
494 |
} else if (ch == QLatin1Char('r')) { |
|
495 |
ch = '\r'; |
|
496 |
} else if (ch == QLatin1Char('t')) { |
|
497 |
ch = '\t'; |
|
498 |
} else if (ch == QLatin1Char('f')) { |
|
499 |
ch = '\f'; |
|
500 |
} |
|
501 |
// fall through |
|
502 |
default: |
|
503 |
sym.token = TOK_STRING; |
|
504 |
sym.lexem = QString(ch); |
|
505 |
symbols += sym; |
|
506 |
++pos; |
|
507 |
continue; |
|
508 |
} |
|
509 |
symbols += sym; |
|
510 |
++pos; |
|
511 |
} |
|
512 |
#endif |
|
513 |
#if 0 |
|
514 |
foreach (Symbol s, symbols) { |
|
515 |
qDebug() << "Tok" << tokStr(s.token) << "lexem" << s.lexem; |
|
516 |
} |
|
517 |
#endif |
|
518 |
} |
|
519 |
||
520 |
bool RE2NFA::next(Token t) |
|
521 |
{ |
|
522 |
if (hasNext() && next() == t) |
|
523 |
return true; |
|
524 |
errorColumn = symbol().column; |
|
525 |
Q_ASSERT(false); |
|
526 |
return false; |
|
527 |
} |
|
528 |
||
529 |
bool RE2NFA::test(Token t) |
|
530 |
{ |
|
531 |
if (index >= symbols.count()) |
|
532 |
return false; |
|
533 |
if (symbols.at(index).token == t) { |
|
534 |
++index; |
|
535 |
return true; |
|
536 |
} |
|
537 |
return false; |
|
538 |
} |
|
539 |
||
540 |
QString RE2NFA::lexemUntil(Token t) |
|
541 |
{ |
|
542 |
QString lexem; |
|
543 |
while (hasNext() && next() != t) |
|
544 |
lexem += symbol().lexem; |
|
545 |
return lexem; |
|
546 |
} |
|
547 |