|
1 /**************************************************************************** |
|
2 ** |
|
3 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). |
|
4 ** All rights reserved. |
|
5 ** Contact: Nokia Corporation (qt-info@nokia.com) |
|
6 ** |
|
7 ** This file is part of the utils of the Qt Toolkit. |
|
8 ** |
|
9 ** $QT_BEGIN_LICENSE:LGPL$ |
|
10 ** No Commercial Usage |
|
11 ** This file contains pre-release code and may not be distributed. |
|
12 ** You may use this file in accordance with the terms and conditions |
|
13 ** contained in the Technology Preview License Agreement accompanying |
|
14 ** this package. |
|
15 ** |
|
16 ** GNU Lesser General Public License Usage |
|
17 ** Alternatively, this file may be used under the terms of the GNU Lesser |
|
18 ** General Public License version 2.1 as published by the Free Software |
|
19 ** Foundation and appearing in the file LICENSE.LGPL included in the |
|
20 ** packaging of this file. Please review the following information to |
|
21 ** ensure the GNU Lesser General Public License version 2.1 requirements |
|
22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. |
|
23 ** |
|
24 ** In addition, as a special exception, Nokia gives you certain additional |
|
25 ** rights. These rights are described in the Nokia Qt LGPL Exception |
|
26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. |
|
27 ** |
|
28 ** If you have questions regarding the use of this file, please contact |
|
29 ** Nokia at qt-info@nokia.com. |
|
30 ** |
|
31 ** |
|
32 ** |
|
33 ** |
|
34 ** |
|
35 ** |
|
36 ** |
|
37 ** |
|
38 ** $QT_END_LICENSE$ |
|
39 ** |
|
40 ****************************************************************************/ |
|
41 |
|
42 #include "re2nfa.h" |
|
43 #include "tokenizer.cpp" |
|
44 |
|
45 RE2NFA::RE2NFA(const QMap<QString, NFA> ¯os, const QSet<InputType> &maxInputSet, Qt::CaseSensitivity cs) |
|
46 : macros(macros), index(0), errorColumn(-1), maxInputSet(maxInputSet), caseSensitivity(cs) |
|
47 { |
|
48 } |
|
49 |
|
50 NFA RE2NFA::parse(const QString &expression, int *errCol) |
|
51 { |
|
52 tokenize(expression); |
|
53 |
|
54 if (symbols.isEmpty()) |
|
55 return NFA(); |
|
56 |
|
57 index = 0; |
|
58 |
|
59 NFA result = parseExpr(); |
|
60 if (result.isEmpty()) { |
|
61 if (errCol) |
|
62 *errCol = errorColumn; |
|
63 } |
|
64 return result; |
|
65 } |
|
66 |
|
67 NFA RE2NFA::parseExpr() |
|
68 { |
|
69 NFA value = parseBranch(); |
|
70 while (test(TOK_OR)) { |
|
71 NFA rhs = parseBranch(); |
|
72 value = NFA::createAlternatingNFA(value, rhs); |
|
73 } |
|
74 return value; |
|
75 } |
|
76 |
|
77 NFA RE2NFA::parseBranch() |
|
78 { |
|
79 NFA value = parsePiece(); |
|
80 if (!hasNext()) |
|
81 return value; |
|
82 NFA next; |
|
83 do { |
|
84 next = parsePiece(); |
|
85 if (!next.isEmpty()) |
|
86 value = NFA::createConcatenatingNFA(value, next); |
|
87 } while (!next.isEmpty() && hasNext()); |
|
88 return value; |
|
89 } |
|
90 |
|
91 NFA RE2NFA::parsePiece() |
|
92 { |
|
93 NFA atom = parseAtom(); |
|
94 if (atom.isEmpty() || !hasNext()) |
|
95 return atom; |
|
96 return parseMaybeQuantifier(atom); |
|
97 } |
|
98 |
|
99 NFA RE2NFA::parseAtom() |
|
100 { |
|
101 // #### |
|
102 switch (next()) { |
|
103 case TOK_STRING: |
|
104 return createCharNFA(); |
|
105 case TOK_LPAREN: { |
|
106 NFA subExpr = parseExpr(); |
|
107 next(TOK_RPAREN); |
|
108 return subExpr; |
|
109 } |
|
110 case TOK_LBRACE: { |
|
111 QString macroName = lexemUntil(TOK_RBRACE); |
|
112 QMap<QString, NFA>::ConstIterator macro = macros.find(macroName); |
|
113 if (macro == macros.end()) { |
|
114 qWarning("Unknown macro '%s' - probably used before defined", qPrintable(macroName)); |
|
115 return NFA(); |
|
116 } |
|
117 return *macro; |
|
118 } |
|
119 case TOK_LBRACKET: { |
|
120 NFA set = parseSet(); |
|
121 next(TOK_RBRACKET); |
|
122 return set; |
|
123 } |
|
124 case TOK_SEQUENCE: |
|
125 return parseSet2(); |
|
126 case TOK_DOT: |
|
127 return NFA::createSetNFA(maxInputSet); |
|
128 default: |
|
129 prev(); |
|
130 return NFA(); |
|
131 } |
|
132 } |
|
133 |
|
134 NFA RE2NFA::parseMaybeQuantifier(const NFA &nfa) |
|
135 { |
|
136 // #### |
|
137 switch (next()) { |
|
138 case TOK_STAR: |
|
139 return NFA::createOptionalNFA(nfa); |
|
140 case TOK_QUESTION: |
|
141 return NFA::createZeroOrOneNFA(nfa); |
|
142 case TOK_PLUS: |
|
143 return NFA::createConcatenatingNFA(nfa, NFA::createOptionalNFA(nfa)); |
|
144 case TOK_LBRACE: { |
|
145 const int rewind = index - 1; |
|
146 |
|
147 QString lexemBeforeComma; |
|
148 QString lexemAfterComma; |
|
149 bool seenComma = false; |
|
150 forever { |
|
151 if (test(TOK_COMMA)) { |
|
152 if (seenComma) { |
|
153 errorColumn = symbol().column; |
|
154 return NFA(); |
|
155 } |
|
156 seenComma = true; |
|
157 } else if (test(TOK_RBRACE)) { |
|
158 break; |
|
159 } else { |
|
160 next(TOK_STRING); |
|
161 if (seenComma) |
|
162 lexemAfterComma += symbol().lexem; |
|
163 else |
|
164 lexemBeforeComma += symbol().lexem; |
|
165 } |
|
166 } |
|
167 bool isNumber = false; |
|
168 int min = lexemBeforeComma.toInt(&isNumber); |
|
169 if (!isNumber) { |
|
170 index = rewind; |
|
171 return nfa; |
|
172 } |
|
173 int max = min; |
|
174 if (seenComma) { |
|
175 max = lexemAfterComma.toInt(&isNumber); |
|
176 if (!isNumber) { |
|
177 errorColumn = symbol().column; |
|
178 return NFA(); |
|
179 } |
|
180 } |
|
181 return NFA::applyQuantity(nfa, min, max); |
|
182 } |
|
183 default: |
|
184 prev(); |
|
185 return nfa; |
|
186 } |
|
187 } |
|
188 |
|
189 NFA RE2NFA::parseSet() |
|
190 { |
|
191 QSet<InputType> set; |
|
192 bool negate = false; |
|
193 |
|
194 next(TOK_STRING); |
|
195 |
|
196 do { |
|
197 Q_ASSERT(symbol().lexem.length() == 1); |
|
198 // ### |
|
199 QChar ch = symbol().lexem.at(0); |
|
200 if (set.isEmpty() && ch == QLatin1Char('^')) { |
|
201 negate = true; |
|
202 continue; |
|
203 } |
|
204 |
|
205 // look ahead for ranges like a-z |
|
206 bool rangeFound = false; |
|
207 if (test(TOK_STRING)) { |
|
208 if (symbol().lexem.length() == 1 |
|
209 && symbol().lexem.at(0) == QLatin1Char('-')) { |
|
210 next(TOK_STRING); |
|
211 Q_ASSERT(symbol().lexem.length() == 1); |
|
212 QChar last = symbol().lexem.at(0); |
|
213 |
|
214 if (ch.unicode() > last.unicode()) |
|
215 qSwap(ch, last); |
|
216 |
|
217 for (ushort i = ch.unicode(); i <= last.unicode(); ++i) { |
|
218 if (caseSensitivity == Qt::CaseInsensitive) { |
|
219 set.insert(QChar(i).toLower().unicode()); |
|
220 } else { |
|
221 set.insert(i); |
|
222 } |
|
223 } |
|
224 |
|
225 rangeFound = true; |
|
226 } else { |
|
227 prev(); |
|
228 } |
|
229 } |
|
230 |
|
231 if (!rangeFound) { |
|
232 if (caseSensitivity == Qt::CaseInsensitive) { |
|
233 set.insert(ch.toLower().unicode()); |
|
234 } else { |
|
235 set.insert(ch.unicode()); |
|
236 } |
|
237 } |
|
238 } while (test(TOK_STRING)); |
|
239 |
|
240 if (negate) { |
|
241 QSet<InputType> negatedSet = maxInputSet; |
|
242 negatedSet.subtract(set); |
|
243 set = negatedSet; |
|
244 } |
|
245 |
|
246 return NFA::createSetNFA(set); |
|
247 } |
|
248 |
|
249 NFA RE2NFA::parseSet2() |
|
250 { |
|
251 QSet<InputType> set; |
|
252 bool negate = false; |
|
253 |
|
254 QString str = symbol().lexem; |
|
255 // strip off brackets |
|
256 str.chop(1); |
|
257 str.remove(0, 1); |
|
258 |
|
259 int i = 0; |
|
260 while (i < str.length()) { |
|
261 // ### |
|
262 QChar ch = str.at(i++); |
|
263 if (set.isEmpty() && ch == QLatin1Char('^')) { |
|
264 negate = true; |
|
265 continue; |
|
266 } |
|
267 |
|
268 // look ahead for ranges like a-z |
|
269 bool rangeFound = false; |
|
270 if (i < str.length() - 1 && str.at(i) == QLatin1Char('-')) { |
|
271 ++i; |
|
272 QChar last = str.at(i++); |
|
273 |
|
274 if (ch.unicode() > last.unicode()) |
|
275 qSwap(ch, last); |
|
276 |
|
277 for (ushort i = ch.unicode(); i <= last.unicode(); ++i) { |
|
278 if (caseSensitivity == Qt::CaseInsensitive) { |
|
279 set.insert(QChar(i).toLower().unicode()); |
|
280 } else { |
|
281 set.insert(i); |
|
282 } |
|
283 } |
|
284 |
|
285 rangeFound = true; |
|
286 } |
|
287 |
|
288 if (!rangeFound) { |
|
289 if (caseSensitivity == Qt::CaseInsensitive) { |
|
290 set.insert(ch.toLower().unicode()); |
|
291 } else { |
|
292 set.insert(ch.unicode()); |
|
293 } |
|
294 } |
|
295 } |
|
296 |
|
297 if (negate) { |
|
298 QSet<InputType> negatedSet = maxInputSet; |
|
299 negatedSet.subtract(set); |
|
300 set = negatedSet; |
|
301 } |
|
302 |
|
303 return NFA::createSetNFA(set); |
|
304 } |
|
305 NFA RE2NFA::createCharNFA() |
|
306 { |
|
307 NFA nfa; |
|
308 // #### |
|
309 if (caseSensitivity == Qt::CaseInsensitive) { |
|
310 nfa = NFA::createStringNFA(symbol().lexem.toLower().toLatin1()); |
|
311 } else { |
|
312 nfa = NFA::createStringNFA(symbol().lexem.toLatin1()); |
|
313 } |
|
314 return nfa; |
|
315 } |
|
316 |
|
317 static inline int skipQuote(const QString &str, int pos) |
|
318 { |
|
319 while (pos < str.length() |
|
320 && str.at(pos) != QLatin1Char('"')) { |
|
321 if (str.at(pos) == QLatin1Char('\\')) { |
|
322 ++pos; |
|
323 if (pos >= str.length()) |
|
324 break; |
|
325 } |
|
326 ++pos; |
|
327 } |
|
328 if (pos < str.length()) |
|
329 ++pos; |
|
330 return pos; |
|
331 } |
|
332 |
|
333 #if 0 |
|
334 static const char*tokStr(Token t) |
|
335 { |
|
336 switch (t) { |
|
337 case TOK_INVALID: return "TOK_INVALID"; |
|
338 case TOK_STRING: return "TOK_STRING"; |
|
339 case TOK_LBRACE: return "TOK_LBRACE"; |
|
340 case TOK_RBRACE: return "TOK_RBRACE"; |
|
341 case TOK_LBRACKET: return "TOK_LBRACKET"; |
|
342 case TOK_RBRACKET: return "TOK_RBRACKET"; |
|
343 case TOK_LPAREN: return "TOK_LPAREN"; |
|
344 case TOK_RPAREN: return "TOK_RPAREN"; |
|
345 case TOK_COMMA: return "TOK_COMMA"; |
|
346 case TOK_STAR: return "TOK_STAR"; |
|
347 case TOK_OR: return "TOK_OR"; |
|
348 case TOK_QUESTION: return "TOK_QUESTION"; |
|
349 case TOK_DOT: return "TOK_DOT"; |
|
350 case TOK_PLUS: return "TOK_PLUS"; |
|
351 case TOK_SEQUENCE: return "TOK_SEQUENCE"; |
|
352 case TOK_QUOTED_STRING: return "TOK_QUOTED_STRING"; |
|
353 } |
|
354 return ""; |
|
355 } |
|
356 #endif |
|
357 |
|
358 void RE2NFA::tokenize(const QString &input) |
|
359 { |
|
360 symbols.clear(); |
|
361 #if 1 |
|
362 RegExpTokenizer tokenizer(input); |
|
363 Symbol sym; |
|
364 int tok = tokenizer.lex(); |
|
365 while (tok != -1) { |
|
366 Symbol sym; |
|
367 sym.token = static_cast<Token>(tok); |
|
368 sym.lexem = input.mid(tokenizer.lexemStart, tokenizer.lexemLength); |
|
369 |
|
370 if (sym.token == TOK_QUOTED_STRING) { |
|
371 sym.lexem.chop(1); |
|
372 sym.lexem.remove(0, 1); |
|
373 sym.token = TOK_STRING; |
|
374 } |
|
375 |
|
376 if (sym.token == TOK_STRING || sym.token == TOK_SEQUENCE) { |
|
377 for (int i = 0; i < sym.lexem.length(); ++i) { |
|
378 if (sym.lexem.at(i) == '\\') { |
|
379 if (i >= sym.lexem.length() - 1) |
|
380 break; |
|
381 QChar ch = sym.lexem.at(i + 1); |
|
382 if (ch == QLatin1Char('n')) { |
|
383 ch = '\n'; |
|
384 } else if (ch == QLatin1Char('r')) { |
|
385 ch = '\r'; |
|
386 } else if (ch == QLatin1Char('t')) { |
|
387 ch = '\t'; |
|
388 } else if (ch == QLatin1Char('f')) { |
|
389 ch = '\f'; |
|
390 } |
|
391 sym.lexem.replace(i, 2, ch); |
|
392 } |
|
393 } |
|
394 } |
|
395 |
|
396 /* |
|
397 if (sym.token == TOK_SEQUENCE) { |
|
398 Symbol s; |
|
399 s.token = TOK_LBRACKET; |
|
400 s.lexem = "["; |
|
401 symbols.append(s); |
|
402 |
|
403 for (int i = 1; i < sym.lexem.length() - 1; ++i) { |
|
404 s.token = TOK_STRING; |
|
405 s.lexem = sym.lexem.at(i); |
|
406 symbols.append(s); |
|
407 } |
|
408 |
|
409 s.token = TOK_RBRACKET; |
|
410 s.lexem = "]"; |
|
411 symbols.append(s); |
|
412 |
|
413 tok = tokenizer.lex(); |
|
414 continue; |
|
415 } |
|
416 */ |
|
417 |
|
418 symbols.append(sym); |
|
419 tok = tokenizer.lex(); |
|
420 } |
|
421 #else |
|
422 int pos = 0; |
|
423 bool insideSet = false; |
|
424 while (pos < input.length()) { |
|
425 QChar ch = input.at(pos); |
|
426 |
|
427 Symbol sym; |
|
428 sym.column = pos; |
|
429 sym.token = TOK_INVALID; |
|
430 sym.lexem = QString(ch); |
|
431 switch (ch.toLatin1()) { |
|
432 case '"': { |
|
433 if (insideSet) { |
|
434 sym.token = TOK_STRING; |
|
435 sym.lexem = QString(ch); |
|
436 symbols += sym; |
|
437 ++pos; |
|
438 continue; |
|
439 } |
|
440 if (pos + 1 >= input.length()) |
|
441 return; |
|
442 int quoteEnd = skipQuote(input, pos + 1); |
|
443 sym.token = TOK_STRING; |
|
444 sym.lexem = input.mid(pos + 1, quoteEnd - pos - 2); |
|
445 symbols += sym; |
|
446 pos = quoteEnd; |
|
447 continue; |
|
448 } |
|
449 case '{': |
|
450 sym.token = (insideSet ? TOK_STRING : TOK_LBRACE); |
|
451 break; |
|
452 case '}': |
|
453 sym.token = (insideSet ? TOK_STRING : TOK_RBRACE); |
|
454 break; |
|
455 case '[': |
|
456 insideSet = true; |
|
457 sym.token = TOK_LBRACKET; |
|
458 break; |
|
459 case ']': |
|
460 insideSet = false; |
|
461 sym.token = TOK_RBRACKET; |
|
462 break; |
|
463 case '(': |
|
464 sym.token = (insideSet ? TOK_STRING : TOK_LPAREN); |
|
465 break; |
|
466 case ')': |
|
467 sym.token = (insideSet ? TOK_STRING : TOK_RPAREN); |
|
468 break; |
|
469 case ',': |
|
470 sym.token = (insideSet ? TOK_STRING : TOK_COMMA); |
|
471 break; |
|
472 case '*': |
|
473 sym.token = (insideSet ? TOK_STRING : TOK_STAR); |
|
474 break; |
|
475 case '|': |
|
476 sym.token = (insideSet ? TOK_STRING : TOK_OR); |
|
477 break; |
|
478 case '?': |
|
479 sym.token = (insideSet ? TOK_STRING : TOK_QUESTION); |
|
480 break; |
|
481 case '.': |
|
482 sym.token = (insideSet ? TOK_STRING : TOK_DOT); |
|
483 break; |
|
484 case '+': |
|
485 sym.token = (insideSet ? TOK_STRING : TOK_PLUS); |
|
486 break; |
|
487 case '\\': |
|
488 ++pos; |
|
489 if (pos >= input.length()) |
|
490 return; |
|
491 ch = input.at(pos); |
|
492 if (ch == QLatin1Char('n')) { |
|
493 ch = '\n'; |
|
494 } else if (ch == QLatin1Char('r')) { |
|
495 ch = '\r'; |
|
496 } else if (ch == QLatin1Char('t')) { |
|
497 ch = '\t'; |
|
498 } else if (ch == QLatin1Char('f')) { |
|
499 ch = '\f'; |
|
500 } |
|
501 // fall through |
|
502 default: |
|
503 sym.token = TOK_STRING; |
|
504 sym.lexem = QString(ch); |
|
505 symbols += sym; |
|
506 ++pos; |
|
507 continue; |
|
508 } |
|
509 symbols += sym; |
|
510 ++pos; |
|
511 } |
|
512 #endif |
|
513 #if 0 |
|
514 foreach (Symbol s, symbols) { |
|
515 qDebug() << "Tok" << tokStr(s.token) << "lexem" << s.lexem; |
|
516 } |
|
517 #endif |
|
518 } |
|
519 |
|
520 bool RE2NFA::next(Token t) |
|
521 { |
|
522 if (hasNext() && next() == t) |
|
523 return true; |
|
524 errorColumn = symbol().column; |
|
525 Q_ASSERT(false); |
|
526 return false; |
|
527 } |
|
528 |
|
529 bool RE2NFA::test(Token t) |
|
530 { |
|
531 if (index >= symbols.count()) |
|
532 return false; |
|
533 if (symbols.at(index).token == t) { |
|
534 ++index; |
|
535 return true; |
|
536 } |
|
537 return false; |
|
538 } |
|
539 |
|
540 QString RE2NFA::lexemUntil(Token t) |
|
541 { |
|
542 QString lexem; |
|
543 while (hasNext() && next() != t) |
|
544 lexem += symbol().lexem; |
|
545 return lexem; |
|
546 } |
|
547 |