util/lexgen/re2nfa.cpp
changeset 0 1918ee327afb
child 4 3b1da2848fc7
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/util/lexgen/re2nfa.cpp	Mon Jan 11 14:00:40 2010 +0000
@@ -0,0 +1,547 @@
+/****************************************************************************
+**
+** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
+** All rights reserved.
+** Contact: Nokia Corporation (qt-info@nokia.com)
+**
+** This file is part of the utils of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** No Commercial Usage
+** This file contains pre-release code and may not be distributed.
+** You may use this file in accordance with the terms and conditions
+** contained in the Technology Preview License Agreement accompanying
+** this package.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file.  Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Nokia gives you certain additional
+** rights.  These rights are described in the Nokia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** If you have questions regarding the use of this file, please contact
+** Nokia at qt-info@nokia.com.
+**
+**
+**
+**
+**
+**
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#include "re2nfa.h"
+#include "tokenizer.cpp"
+
+RE2NFA::RE2NFA(const QMap<QString, NFA> &macros, const QSet<InputType> &maxInputSet, Qt::CaseSensitivity cs)
+    : macros(macros), index(0), errorColumn(-1), maxInputSet(maxInputSet), caseSensitivity(cs)
+{
+}
+
+NFA RE2NFA::parse(const QString &expression, int *errCol)
+{
+    tokenize(expression);
+
+    if (symbols.isEmpty())
+        return NFA();
+
+    index = 0;
+
+    NFA result = parseExpr();
+    if (result.isEmpty()) {
+        if (errCol)
+            *errCol = errorColumn;
+    }
+    return result;
+}
+
+NFA RE2NFA::parseExpr()
+{
+    NFA value = parseBranch();
+    while (test(TOK_OR)) {
+        NFA rhs = parseBranch();
+        value = NFA::createAlternatingNFA(value, rhs);
+    }
+    return value;
+}
+
+NFA RE2NFA::parseBranch()
+{
+    NFA value = parsePiece();
+    if (!hasNext())
+        return value;
+    NFA next;
+    do {
+        next = parsePiece();
+        if (!next.isEmpty())
+            value = NFA::createConcatenatingNFA(value, next);
+    } while (!next.isEmpty() && hasNext());
+    return value;
+}
+
+NFA RE2NFA::parsePiece()
+{
+    NFA atom = parseAtom();
+    if (atom.isEmpty() || !hasNext())
+        return atom;
+    return parseMaybeQuantifier(atom);
+}
+
+NFA RE2NFA::parseAtom()
+{
+    // ####
+    switch (next()) {
+        case TOK_STRING:
+            return createCharNFA();
+        case TOK_LPAREN: {
+            NFA subExpr = parseExpr();
+            next(TOK_RPAREN);
+            return subExpr;
+        }
+        case TOK_LBRACE: {
+            QString macroName = lexemUntil(TOK_RBRACE);
+            QMap<QString, NFA>::ConstIterator macro = macros.find(macroName);
+            if (macro == macros.end()) {
+                qWarning("Unknown macro '%s' - probably used before defined", qPrintable(macroName));
+                return NFA();
+            }
+            return *macro;
+        }
+        case TOK_LBRACKET: {
+            NFA set = parseSet();
+            next(TOK_RBRACKET);
+            return set;
+        }
+        case TOK_SEQUENCE:
+            return parseSet2();
+        case TOK_DOT:
+            return NFA::createSetNFA(maxInputSet);
+        default:
+            prev();
+            return NFA();
+    }
+}
+
+NFA RE2NFA::parseMaybeQuantifier(const NFA &nfa)
+{
+    // ####
+    switch (next()) {
+        case TOK_STAR:
+            return NFA::createOptionalNFA(nfa);
+        case TOK_QUESTION:
+            return NFA::createZeroOrOneNFA(nfa);
+        case TOK_PLUS:
+            return NFA::createConcatenatingNFA(nfa, NFA::createOptionalNFA(nfa));
+        case TOK_LBRACE: {
+              const int rewind = index - 1;
+
+              QString lexemBeforeComma;
+              QString lexemAfterComma;
+              bool seenComma = false;
+              forever {
+                  if (test(TOK_COMMA)) {
+                      if (seenComma) {
+                          errorColumn = symbol().column;
+                          return NFA();
+                      }
+                      seenComma = true;
+                  } else if (test(TOK_RBRACE)) {
+                      break;
+                  } else {
+                      next(TOK_STRING);
+                      if (seenComma)
+                          lexemAfterComma += symbol().lexem;
+                      else
+                          lexemBeforeComma += symbol().lexem;
+                  }
+              }
+              bool isNumber = false;
+              int min = lexemBeforeComma.toInt(&isNumber);
+              if (!isNumber) {
+                  index = rewind;
+                  return nfa;
+              }
+              int max = min;
+              if (seenComma) {
+                  max = lexemAfterComma.toInt(&isNumber);
+                  if (!isNumber) {
+                      errorColumn = symbol().column;
+                      return NFA();
+                  }
+              }
+              return NFA::applyQuantity(nfa, min, max);
+        }
+        default:
+            prev();
+            return nfa;
+    }
+}
+
+NFA RE2NFA::parseSet()
+{
+    QSet<InputType> set;
+    bool negate = false;
+
+    next(TOK_STRING);
+
+    do {
+        Q_ASSERT(symbol().lexem.length() == 1);
+        // ###
+        QChar ch = symbol().lexem.at(0);
+        if (set.isEmpty() && ch == QLatin1Char('^')) {
+            negate = true;
+            continue;
+        }
+
+        // look ahead for ranges like a-z
+        bool rangeFound = false;
+        if (test(TOK_STRING)) {
+            if (symbol().lexem.length() == 1
+                && symbol().lexem.at(0) == QLatin1Char('-')) {
+                next(TOK_STRING);
+                Q_ASSERT(symbol().lexem.length() == 1);
+                QChar last = symbol().lexem.at(0);
+
+                if (ch.unicode() > last.unicode())
+                    qSwap(ch, last);
+
+                for (ushort i = ch.unicode(); i <= last.unicode(); ++i) {
+                    if (caseSensitivity == Qt::CaseInsensitive) {
+                        set.insert(QChar(i).toLower().unicode());
+                    } else {
+                        set.insert(i);
+                    }
+                }
+
+                rangeFound = true;
+            } else {
+                prev();
+            }
+        }
+
+        if (!rangeFound) {
+            if (caseSensitivity == Qt::CaseInsensitive) {
+                set.insert(ch.toLower().unicode());
+            } else {
+                set.insert(ch.unicode());
+            }
+        }
+    } while (test(TOK_STRING));
+
+    if (negate) {
+        QSet<InputType> negatedSet = maxInputSet;
+        negatedSet.subtract(set);
+        set = negatedSet;
+    }
+
+    return NFA::createSetNFA(set);
+}
+
+NFA RE2NFA::parseSet2()
+{
+    QSet<InputType> set;
+    bool negate = false;
+
+    QString str = symbol().lexem;
+    // strip off brackets
+    str.chop(1);
+    str.remove(0, 1);
+
+    int i = 0;
+    while (i < str.length()) {
+        // ###
+        QChar ch = str.at(i++);
+        if (set.isEmpty() && ch == QLatin1Char('^')) {
+            negate = true;
+            continue;
+        }
+
+        // look ahead for ranges like a-z
+        bool rangeFound = false;
+        if (i < str.length() - 1 && str.at(i) == QLatin1Char('-')) {
+            ++i;
+            QChar last = str.at(i++);
+
+            if (ch.unicode() > last.unicode())
+                qSwap(ch, last);
+
+            for (ushort i = ch.unicode(); i <= last.unicode(); ++i) {
+                if (caseSensitivity == Qt::CaseInsensitive) {
+                    set.insert(QChar(i).toLower().unicode());
+                } else {
+                    set.insert(i);
+                }
+            }
+
+            rangeFound = true;
+        }
+
+        if (!rangeFound) {
+            if (caseSensitivity == Qt::CaseInsensitive) {
+                set.insert(ch.toLower().unicode());
+            } else {
+                set.insert(ch.unicode());
+            }
+        }
+    }
+
+    if (negate) {
+        QSet<InputType> negatedSet = maxInputSet;
+        negatedSet.subtract(set);
+        set = negatedSet;
+    }
+
+    return NFA::createSetNFA(set);
+}
+NFA RE2NFA::createCharNFA()
+{
+    NFA nfa;
+    // ####
+    if (caseSensitivity == Qt::CaseInsensitive) {
+        nfa = NFA::createStringNFA(symbol().lexem.toLower().toLatin1());
+    } else {
+        nfa = NFA::createStringNFA(symbol().lexem.toLatin1());
+    }
+    return nfa;
+}
+
+static inline int skipQuote(const QString &str, int pos)
+{
+    while (pos < str.length()
+           && str.at(pos) != QLatin1Char('"')) {
+        if (str.at(pos) == QLatin1Char('\\')) {
+            ++pos;
+            if (pos >= str.length())
+                break;
+        }
+        ++pos;
+    }
+    if (pos < str.length())
+        ++pos;
+    return pos;
+}
+
+#if 0
+static const char*tokStr(Token t)
+{
+    switch (t) {
+        case TOK_INVALID: return "TOK_INVALID";
+        case TOK_STRING: return "TOK_STRING";
+        case TOK_LBRACE: return "TOK_LBRACE";
+        case TOK_RBRACE: return "TOK_RBRACE";
+        case TOK_LBRACKET: return "TOK_LBRACKET";
+        case TOK_RBRACKET: return "TOK_RBRACKET";
+        case TOK_LPAREN: return "TOK_LPAREN";
+        case TOK_RPAREN: return "TOK_RPAREN";
+        case TOK_COMMA: return "TOK_COMMA";
+        case TOK_STAR: return "TOK_STAR";
+        case TOK_OR: return "TOK_OR";
+        case TOK_QUESTION: return "TOK_QUESTION";
+        case TOK_DOT: return "TOK_DOT";
+        case TOK_PLUS: return "TOK_PLUS";
+        case TOK_SEQUENCE: return "TOK_SEQUENCE";
+        case TOK_QUOTED_STRING: return "TOK_QUOTED_STRING";
+    }
+    return "";
+}
+#endif
+
+void RE2NFA::tokenize(const QString &input)
+{
+    symbols.clear();
+#if 1
+    RegExpTokenizer tokenizer(input);
+    Symbol sym;
+    int tok = tokenizer.lex();
+    while (tok != -1) {
+        Symbol sym;
+        sym.token = static_cast<Token>(tok);
+        sym.lexem = input.mid(tokenizer.lexemStart, tokenizer.lexemLength);
+
+        if (sym.token == TOK_QUOTED_STRING) {
+            sym.lexem.chop(1);
+            sym.lexem.remove(0, 1);
+            sym.token = TOK_STRING;
+        }
+
+        if (sym.token == TOK_STRING || sym.token == TOK_SEQUENCE) {
+            for (int i = 0; i < sym.lexem.length(); ++i) {
+                if (sym.lexem.at(i) == '\\') {
+                    if (i >= sym.lexem.length() - 1)
+                        break;
+                    QChar ch = sym.lexem.at(i + 1);
+                    if (ch == QLatin1Char('n')) {
+                        ch = '\n';
+                    } else if (ch == QLatin1Char('r')) {
+                        ch = '\r';
+                    } else if (ch == QLatin1Char('t')) {
+                        ch = '\t';
+                    } else if (ch == QLatin1Char('f')) {
+                        ch = '\f';
+                    }
+                    sym.lexem.replace(i, 2, ch);
+                }
+            }
+        }
+
+        /*
+        if (sym.token == TOK_SEQUENCE) {
+            Symbol s;
+            s.token = TOK_LBRACKET;
+            s.lexem = "[";
+            symbols.append(s);
+
+            for (int i = 1; i < sym.lexem.length() - 1; ++i) {
+                s.token = TOK_STRING;
+                s.lexem = sym.lexem.at(i);
+                symbols.append(s);
+            }
+
+            s.token = TOK_RBRACKET;
+            s.lexem = "]";
+            symbols.append(s);
+
+            tok = tokenizer.lex();
+            continue;
+        }
+        */
+
+        symbols.append(sym);
+        tok = tokenizer.lex();
+    }
+#else
+    int pos = 0;
+    bool insideSet = false;
+    while (pos < input.length()) {
+        QChar ch = input.at(pos);
+
+        Symbol sym;
+        sym.column = pos;
+        sym.token = TOK_INVALID;
+        sym.lexem = QString(ch);
+        switch (ch.toLatin1()) {
+            case '"': {
+                if (insideSet) {
+                    sym.token = TOK_STRING;
+                    sym.lexem = QString(ch);
+                    symbols += sym;
+                    ++pos;
+                    continue;
+                }
+                if (pos + 1 >= input.length())
+                    return;
+                int quoteEnd = skipQuote(input, pos + 1);
+                sym.token = TOK_STRING;
+                sym.lexem = input.mid(pos + 1, quoteEnd - pos - 2);
+                symbols += sym;
+                pos = quoteEnd;
+                continue;
+            }
+            case '{':
+                sym.token = (insideSet ? TOK_STRING : TOK_LBRACE);
+                break;
+            case '}':
+                sym.token = (insideSet ? TOK_STRING : TOK_RBRACE);
+                break;
+            case '[':
+                insideSet = true;
+                sym.token = TOK_LBRACKET;
+                break;
+            case ']':
+                insideSet = false;
+                sym.token = TOK_RBRACKET;
+                break;
+            case '(':
+                sym.token = (insideSet ? TOK_STRING : TOK_LPAREN);
+                break;
+            case ')':
+                sym.token = (insideSet ? TOK_STRING : TOK_RPAREN);
+                break;
+            case ',':
+                sym.token = (insideSet ? TOK_STRING : TOK_COMMA);
+                break;
+            case '*':
+                sym.token = (insideSet ? TOK_STRING : TOK_STAR);
+                break;
+            case '|':
+                sym.token = (insideSet ? TOK_STRING : TOK_OR);
+                break;
+            case '?':
+                sym.token = (insideSet ? TOK_STRING : TOK_QUESTION);
+                break;
+            case '.':
+                sym.token = (insideSet ? TOK_STRING : TOK_DOT);
+                break;
+            case '+':
+                sym.token = (insideSet ? TOK_STRING : TOK_PLUS);
+                break;
+            case '\\':
+                ++pos;
+                if (pos >= input.length())
+                    return;
+                ch = input.at(pos);
+                if (ch == QLatin1Char('n')) {
+                    ch = '\n';
+                } else if (ch == QLatin1Char('r')) {
+                    ch = '\r';
+                } else if (ch == QLatin1Char('t')) {
+                    ch = '\t';
+                } else if (ch == QLatin1Char('f')) {
+                    ch = '\f';
+                }
+                // fall through
+            default:
+                sym.token = TOK_STRING;
+                sym.lexem = QString(ch);
+                symbols += sym;
+                ++pos;
+                continue;
+        }
+        symbols += sym;
+        ++pos;
+    }
+#endif
+#if 0
+    foreach (Symbol s, symbols) {
+        qDebug() << "Tok" << tokStr(s.token) << "lexem" << s.lexem;
+    }
+#endif
+}
+
+bool RE2NFA::next(Token t)
+{
+    if (hasNext() && next() == t)
+        return true;
+    errorColumn = symbol().column;
+    Q_ASSERT(false);
+    return false;
+}
+
+bool RE2NFA::test(Token t)
+{
+    if (index >= symbols.count())
+        return false;
+    if (symbols.at(index).token == t) {
+        ++index;
+        return true;
+    }
+    return false;
+}
+
+QString RE2NFA::lexemUntil(Token t)
+{
+    QString lexem;
+    while (hasNext() && next() != t)
+        lexem += symbol().lexem;
+    return lexem;
+}
+