FCL/sf/mw/qt: comparison util/lexgen/re2nfa.cpp

equal deleted inserted replaced

--1:000000000000
+:1918ee327afb
+/****************************************************************************
+**
+** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
+** All rights reserved.
+** Contact: Nokia Corporation (qt-info@nokia.com)
+**
+** This file is part of the utils of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** No Commercial Usage
+** This file contains pre-release code and may not be distributed.
+** You may use this file in accordance with the terms and conditions
+** contained in the Technology Preview License Agreement accompanying
+** this package.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file.  Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Nokia gives you certain additional
+** rights.  These rights are described in the Nokia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** If you have questions regarding the use of this file, please contact
+** Nokia at qt-info@nokia.com.
+**
+**
+**
+**
+**
+**
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+#include "re2nfa.h"
+#include "tokenizer.cpp"
+RE2NFA::RE2NFA(const QMap<QString, NFA> &macros, const QSet<InputType> &maxInputSet, Qt::CaseSensitivity cs)
+: macros(macros), index(0), errorColumn(-1), maxInputSet(maxInputSet), caseSensitivity(cs)
+{
+}
+NFA RE2NFA::parse(const QString &expression, int *errCol)
+{
+tokenize(expression);
+if (symbols.isEmpty())
+return NFA();
+index = 0;
+NFA result = parseExpr();
+if (result.isEmpty()) {
+if (errCol)
+*errCol = errorColumn;
+}
+return result;
+}
+NFA RE2NFA::parseExpr()
+{
+NFA value = parseBranch();
+while (test(TOK_OR)) {
+NFA rhs = parseBranch();
+value = NFA::createAlternatingNFA(value, rhs);
+}
+return value;
+}
+NFA RE2NFA::parseBranch()
+{
+NFA value = parsePiece();
+if (!hasNext())
+return value;
+NFA next;
+do {
+next = parsePiece();
+if (!next.isEmpty())
+value = NFA::createConcatenatingNFA(value, next);
+} while (!next.isEmpty() && hasNext());
+return value;
+}
+NFA RE2NFA::parsePiece()
+{
+NFA atom = parseAtom();
+if (atom.isEmpty() || !hasNext())
+return atom;
+return parseMaybeQuantifier(atom);
+}
+NFA RE2NFA::parseAtom()
+{
+// ####
+switch (next()) {
+case TOK_STRING:
+return createCharNFA();
+case TOK_LPAREN: {
+NFA subExpr = parseExpr();
+next(TOK_RPAREN);
+return subExpr;
+}
+case TOK_LBRACE: {
+QString macroName = lexemUntil(TOK_RBRACE);
+QMap<QString, NFA>::ConstIterator macro = macros.find(macroName);
+if (macro == macros.end()) {
+qWarning("Unknown macro '%s' - probably used before defined", qPrintable(macroName));
+return NFA();
+}
+return *macro;
+}
+case TOK_LBRACKET: {
+NFA set = parseSet();
+next(TOK_RBRACKET);
+return set;
+}
+case TOK_SEQUENCE:
+return parseSet2();
+case TOK_DOT:
+return NFA::createSetNFA(maxInputSet);
+default:
+prev();
+return NFA();
+}
+}
+NFA RE2NFA::parseMaybeQuantifier(const NFA &nfa)
+{
+// ####
+switch (next()) {
+case TOK_STAR:
+return NFA::createOptionalNFA(nfa);
+case TOK_QUESTION:
+return NFA::createZeroOrOneNFA(nfa);
+case TOK_PLUS:
+return NFA::createConcatenatingNFA(nfa, NFA::createOptionalNFA(nfa));
+case TOK_LBRACE: {
+const int rewind = index - 1;
+QString lexemBeforeComma;
+QString lexemAfterComma;
+bool seenComma = false;
+forever {
+if (test(TOK_COMMA)) {
+if (seenComma) {
+errorColumn = symbol().column;
+return NFA();
+}
+seenComma = true;
+} else if (test(TOK_RBRACE)) {
+break;
+} else {
+next(TOK_STRING);
+if (seenComma)
+lexemAfterComma += symbol().lexem;
+else
+lexemBeforeComma += symbol().lexem;
+}
+}
+bool isNumber = false;
+int min = lexemBeforeComma.toInt(&isNumber);
+if (!isNumber) {
+index = rewind;
+return nfa;
+}
+int max = min;
+if (seenComma) {
+max = lexemAfterComma.toInt(&isNumber);
+if (!isNumber) {
+errorColumn = symbol().column;
+return NFA();
+}
+}
+return NFA::applyQuantity(nfa, min, max);
+}
+default:
+prev();
+return nfa;
+}
+}
+NFA RE2NFA::parseSet()
+{
+QSet<InputType> set;
+bool negate = false;
+next(TOK_STRING);
+do {
+Q_ASSERT(symbol().lexem.length() == 1);
+// ###
+QChar ch = symbol().lexem.at(0);
+if (set.isEmpty() && ch == QLatin1Char('^')) {
+negate = true;
+continue;
+}
+// look ahead for ranges like a-z
+bool rangeFound = false;
+if (test(TOK_STRING)) {
+if (symbol().lexem.length() == 1
+&& symbol().lexem.at(0) == QLatin1Char('-')) {
+next(TOK_STRING);
+Q_ASSERT(symbol().lexem.length() == 1);
+QChar last = symbol().lexem.at(0);
+if (ch.unicode() > last.unicode())
+qSwap(ch, last);
+for (ushort i = ch.unicode(); i <= last.unicode(); ++i) {
+if (caseSensitivity == Qt::CaseInsensitive) {
+set.insert(QChar(i).toLower().unicode());
+} else {
+set.insert(i);
+}
+}
+rangeFound = true;
+} else {
+prev();
+}
+}
+if (!rangeFound) {
+if (caseSensitivity == Qt::CaseInsensitive) {
+set.insert(ch.toLower().unicode());
+} else {
+set.insert(ch.unicode());
+}
+}
+} while (test(TOK_STRING));
+if (negate) {
+QSet<InputType> negatedSet = maxInputSet;
+negatedSet.subtract(set);
+set = negatedSet;
+}
+return NFA::createSetNFA(set);
+}
+NFA RE2NFA::parseSet2()
+{
+QSet<InputType> set;
+bool negate = false;
+QString str = symbol().lexem;
+// strip off brackets
+str.chop(1);
+str.remove(0, 1);
+int i = 0;
+while (i < str.length()) {
+// ###
+QChar ch = str.at(i++);
+if (set.isEmpty() && ch == QLatin1Char('^')) {
+negate = true;
+continue;
+}
+// look ahead for ranges like a-z
+bool rangeFound = false;
+if (i < str.length() - 1 && str.at(i) == QLatin1Char('-')) {
+++i;
+QChar last = str.at(i++);
+if (ch.unicode() > last.unicode())
+qSwap(ch, last);
+for (ushort i = ch.unicode(); i <= last.unicode(); ++i) {
+if (caseSensitivity == Qt::CaseInsensitive) {
+set.insert(QChar(i).toLower().unicode());
+} else {
+set.insert(i);
+}
+}
+rangeFound = true;
+}
+if (!rangeFound) {
+if (caseSensitivity == Qt::CaseInsensitive) {
+set.insert(ch.toLower().unicode());
+} else {
+set.insert(ch.unicode());
+}
+}
+}
+if (negate) {
+QSet<InputType> negatedSet = maxInputSet;
+negatedSet.subtract(set);
+set = negatedSet;
+}
+return NFA::createSetNFA(set);
+}
+NFA RE2NFA::createCharNFA()
+{
+NFA nfa;
+// ####
+if (caseSensitivity == Qt::CaseInsensitive) {
+nfa = NFA::createStringNFA(symbol().lexem.toLower().toLatin1());
+} else {
+nfa = NFA::createStringNFA(symbol().lexem.toLatin1());
+}
+return nfa;
+}
+static inline int skipQuote(const QString &str, int pos)
+{
+while (pos < str.length()
+&& str.at(pos) != QLatin1Char('"')) {
+if (str.at(pos) == QLatin1Char('\\')) {
+++pos;
+if (pos >= str.length())
+break;
+}
+++pos;
+}
+if (pos < str.length())
+++pos;
+return pos;
+}
+#if 0
+static const char*tokStr(Token t)
+{
+switch (t) {
+case TOK_INVALID: return "TOK_INVALID";
+case TOK_STRING: return "TOK_STRING";
+case TOK_LBRACE: return "TOK_LBRACE";
+case TOK_RBRACE: return "TOK_RBRACE";
+case TOK_LBRACKET: return "TOK_LBRACKET";
+case TOK_RBRACKET: return "TOK_RBRACKET";
+case TOK_LPAREN: return "TOK_LPAREN";
+case TOK_RPAREN: return "TOK_RPAREN";
+case TOK_COMMA: return "TOK_COMMA";
+case TOK_STAR: return "TOK_STAR";
+case TOK_OR: return "TOK_OR";
+case TOK_QUESTION: return "TOK_QUESTION";
+case TOK_DOT: return "TOK_DOT";
+case TOK_PLUS: return "TOK_PLUS";
+case TOK_SEQUENCE: return "TOK_SEQUENCE";
+case TOK_QUOTED_STRING: return "TOK_QUOTED_STRING";
+}
+return "";
+}
+#endif
+void RE2NFA::tokenize(const QString &input)
+{
+symbols.clear();
+#if 1
+RegExpTokenizer tokenizer(input);
+Symbol sym;
+int tok = tokenizer.lex();
+while (tok != -1) {
+Symbol sym;
+sym.token = static_cast<Token>(tok);
+sym.lexem = input.mid(tokenizer.lexemStart, tokenizer.lexemLength);
+if (sym.token == TOK_QUOTED_STRING) {
+sym.lexem.chop(1);
+sym.lexem.remove(0, 1);
+sym.token = TOK_STRING;
+}
+if (sym.token == TOK_STRING || sym.token == TOK_SEQUENCE) {
+for (int i = 0; i < sym.lexem.length(); ++i) {
+if (sym.lexem.at(i) == '\\') {
+if (i >= sym.lexem.length() - 1)
+break;
+QChar ch = sym.lexem.at(i + 1);
+if (ch == QLatin1Char('n')) {
+ch = '\n';
+} else if (ch == QLatin1Char('r')) {
+ch = '\r';
+} else if (ch == QLatin1Char('t')) {
+ch = '\t';
+} else if (ch == QLatin1Char('f')) {
+ch = '\f';
+}
+sym.lexem.replace(i, 2, ch);
+}
+}
+}
+/*
+if (sym.token == TOK_SEQUENCE) {
+Symbol s;
+s.token = TOK_LBRACKET;
+s.lexem = "[";
+symbols.append(s);
+for (int i = 1; i < sym.lexem.length() - 1; ++i) {
+s.token = TOK_STRING;
+s.lexem = sym.lexem.at(i);
+symbols.append(s);
+}
+s.token = TOK_RBRACKET;
+s.lexem = "]";
+symbols.append(s);
+tok = tokenizer.lex();
+continue;
+}
+*/
+symbols.append(sym);
+tok = tokenizer.lex();
+}
+#else
+int pos = 0;
+bool insideSet = false;
+while (pos < input.length()) {
+QChar ch = input.at(pos);
+Symbol sym;
+sym.column = pos;
+sym.token = TOK_INVALID;
+sym.lexem = QString(ch);
+switch (ch.toLatin1()) {
+case '"': {
+if (insideSet) {
+sym.token = TOK_STRING;
+sym.lexem = QString(ch);
+symbols += sym;
+++pos;
+continue;
+}
+if (pos + 1 >= input.length())
+return;
+int quoteEnd = skipQuote(input, pos + 1);
+sym.token = TOK_STRING;
+sym.lexem = input.mid(pos + 1, quoteEnd - pos - 2);
+symbols += sym;
+pos = quoteEnd;
+continue;
+}
+case '{':
+sym.token = (insideSet ? TOK_STRING : TOK_LBRACE);
+break;
+case '}':
+sym.token = (insideSet ? TOK_STRING : TOK_RBRACE);
+break;
+case '[':
+insideSet = true;
+sym.token = TOK_LBRACKET;
+break;
+case ']':
+insideSet = false;
+sym.token = TOK_RBRACKET;
+break;
+case '(':
+sym.token = (insideSet ? TOK_STRING : TOK_LPAREN);
+break;
+case ')':
+sym.token = (insideSet ? TOK_STRING : TOK_RPAREN);
+break;
+case ',':
+sym.token = (insideSet ? TOK_STRING : TOK_COMMA);
+break;
+case '*':
+sym.token = (insideSet ? TOK_STRING : TOK_STAR);
+break;
+case '|':
+sym.token = (insideSet ? TOK_STRING : TOK_OR);
+break;
+case '?':
+sym.token = (insideSet ? TOK_STRING : TOK_QUESTION);
+break;
+case '.':
+sym.token = (insideSet ? TOK_STRING : TOK_DOT);
+break;
+case '+':
+sym.token = (insideSet ? TOK_STRING : TOK_PLUS);
+break;
+case '\\':
+++pos;
+if (pos >= input.length())
+return;
+ch = input.at(pos);
+if (ch == QLatin1Char('n')) {
+ch = '\n';
+} else if (ch == QLatin1Char('r')) {
+ch = '\r';
+} else if (ch == QLatin1Char('t')) {
+ch = '\t';
+} else if (ch == QLatin1Char('f')) {
+ch = '\f';
+}
+// fall through
+default:
+sym.token = TOK_STRING;
+sym.lexem = QString(ch);
+symbols += sym;
+++pos;
+continue;
+}
+symbols += sym;
+++pos;
+}
+#endif
+#if 0
+foreach (Symbol s, symbols) {
+qDebug() << "Tok" << tokStr(s.token) << "lexem" << s.lexem;
+}
+#endif
+}
+bool RE2NFA::next(Token t)
+{
+if (hasNext() && next() == t)
+return true;
+errorColumn = symbol().column;
+Q_ASSERT(false);
+return false;
+}
+bool RE2NFA::test(Token t)
+{
+if (index >= symbols.count())
+return false;
+if (symbols.at(index).token == t) {
+++index;
+return true;
+}
+return false;
+}
+QString RE2NFA::lexemUntil(Token t)
+{
+QString lexem;
+while (hasNext() && next() != t)
+lexem += symbol().lexem;
+return lexem;
+}

changeset 0	1918ee327afb
child 4	3b1da2848fc7