util/lexgen/main.cpp
changeset 0 1918ee327afb
child 4 3b1da2848fc7
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/util/lexgen/main.cpp	Mon Jan 11 14:00:40 2010 +0000
@@ -0,0 +1,323 @@
+/****************************************************************************
+**
+** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
+** All rights reserved.
+** Contact: Nokia Corporation (qt-info@nokia.com)
+**
+** This file is part of the utils of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** No Commercial Usage
+** This file contains pre-release code and may not be distributed.
+** You may use this file in accordance with the terms and conditions
+** contained in the Technology Preview License Agreement accompanying
+** this package.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file.  Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Nokia gives you certain additional
+** rights.  These rights are described in the Nokia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** If you have questions regarding the use of this file, please contact
+** Nokia at qt-info@nokia.com.
+**
+**
+**
+**
+**
+**
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#include "nfa.h"
+#include "re2nfa.h"
+#include "configfile.h"
+#include "generator.h"
+
+#include <QFile>
+#include <QCoreApplication>
+#include <QFileInfo>
+#include <QDateTime>
+
+struct Symbol
+{
+    QString token;
+    QString lexem;
+};
+
+static QList<Symbol> tokenize(const DFA &dfa, const QString &input, Config *cfg, bool *ok = 0)
+{
+    QList<Symbol> symbols;
+    Symbol lastSymbol;
+    int state = 0;
+    int lastAcceptingState = -1;
+    QString lastAcceptingLexem;
+    int lastAcceptingPos = -1;
+    for (int i = 0; i < input.length(); ++i) {
+        QChar ch = input.at(i);
+        QChar chForInput = ch;
+        if (cfg->caseSensitivity == Qt::CaseInsensitive)
+            chForInput = chForInput.toLower();
+        int next = dfa.at(state).transitions.value(chForInput.unicode());
+        if (cfg->debug)
+            qDebug() << "input" << input.at(i) << "leads to state" << next;
+        if (next) {
+            lastSymbol.lexem.append(input.at(i));
+            lastSymbol.token = dfa.at(next).symbol;
+            if (!lastSymbol.token.isEmpty()) {
+                lastAcceptingState = next;
+                lastAcceptingLexem = lastSymbol.lexem;
+                lastAcceptingPos = i;
+            }
+            state = next;
+        } else {
+            if (lastAcceptingState != -1) {
+                if (cfg->debug)
+                    qDebug() << "adding" << dfa.at(lastAcceptingState).symbol << "and backtracking to" << lastAcceptingPos;
+                Symbol s;
+                s.token = dfa.at(lastAcceptingState).symbol;
+                s.lexem = lastAcceptingLexem;
+                symbols << s;
+                lastSymbol = Symbol();
+                state = 0;
+                i = lastAcceptingPos;
+                lastAcceptingPos = -1;
+                lastAcceptingState = -1;
+                continue;
+            }
+            if (state == 0 || lastSymbol.token.isEmpty()) {
+                if (cfg->debug)
+                    qDebug() << "invalid input";
+                if (ok)
+                    *ok = false;
+                return symbols;
+            }
+            if (cfg->debug)
+                qDebug() << "appending symbol with token" << lastSymbol.token;
+            symbols << lastSymbol;
+            lastSymbol = Symbol();
+            state = 0;
+            lastAcceptingState = -1;
+            --i;
+        }
+    }
+    if (!lastSymbol.token.isEmpty()) {
+        if (cfg->debug)
+            qDebug() << "appending (last) symbol with token" << lastSymbol.token;
+        symbols << lastSymbol;
+    } else if (lastAcceptingState != -1) {
+        if (cfg->debug)
+            qDebug() << "appending last accepting state with token" << dfa.at(lastAcceptingState).symbol;
+        Symbol s;
+        s.lexem = lastAcceptingLexem;
+        s.token = dfa.at(lastAcceptingState).symbol;
+        symbols << s;
+    }
+    if (ok)
+        *ok = true;
+    return symbols;
+}
+
+static QSet<InputType> determineMaxInputSet(const ConfigFile::Section &section)
+{
+    QSet<InputType> set;
+
+    QString inputTypeName;
+
+    foreach (const ConfigFile::Entry &entry, section)
+        if (entry.key == QLatin1String("InputType")) {
+            if (!inputTypeName.isEmpty()) {
+                qWarning("Error: InputType field specified multiple times in config file");
+                return QSet<InputType>();
+            }
+            inputTypeName = entry.value;
+        }
+
+    if (inputTypeName.isEmpty())
+        inputTypeName = "quint8";
+
+    if (inputTypeName == "quint8") {
+        for (int i = 1; i < 256; ++i)
+            set.insert(i);
+    } /* else if ### */ 
+    else {
+        qWarning("Error: Unknown input type '%s'", qPrintable(inputTypeName));
+        return QSet<InputType>();
+    }
+
+    return set;
+}
+
+static bool loadConfig(const QString &ruleFile, Config *cfg)
+{
+    ConfigFile::SectionMap sections = ConfigFile::parse(ruleFile);
+    if (sections.isEmpty()) {
+        qWarning("Error parsing %s", qPrintable(ruleFile));
+        return false;
+    }
+
+    QSet<InputType> maxInputSet = determineMaxInputSet(sections.value("Options"));
+    if (maxInputSet.isEmpty())
+        return false;
+
+    Qt::CaseSensitivity cs = Qt::CaseInsensitive;
+    if (sections.value("Options").contains("case-sensitive"))
+        cs = Qt::CaseSensitive;
+
+    cfg->configSections = sections;
+    cfg->caseSensitivity = cs;
+    cfg->className = sections.value("Options").value("classname", "Scanner");
+    cfg->maxInputSet = maxInputSet;
+    cfg->ruleFile = ruleFile;
+    return true;
+}
+
+static DFA generateMachine(const Config &cfg)
+{
+    if (cfg.cache) {
+        QFileInfo ruleInfo(cfg.ruleFile);
+        QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa");
+        if (cacheInfo.exists()
+            && cacheInfo.lastModified() > ruleInfo.lastModified()) {
+            QFile f(cacheInfo.absoluteFilePath());
+            f.open(QIODevice::ReadOnly);
+            QDataStream stream(&f);
+            DFA machine;
+            stream >> machine;
+            return machine;
+        }
+    }
+
+    QMap<QString, NFA> macros;
+    foreach (ConfigFile::Entry e, cfg.configSections.value("Macros")) {
+        int errCol = 0;
+        if (cfg.debug)
+            qDebug() << "parsing" << e.value;
+        NFA nfa = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol);
+        if (nfa.isEmpty()) {
+            qWarning("Parse error in line %d column %d", e.lineNumber, errCol);
+            return DFA();
+        }
+        macros.insert(e.key, nfa);
+    }
+
+    if (!cfg.configSections.contains("Tokens")) {
+        qWarning("Rule file does not contain a [Tokens] section!");
+        return DFA();
+    }
+
+    QVector<NFA> tokens;
+
+    foreach (ConfigFile::Entry e, cfg.configSections.value("Tokens")) {
+        int errCol = 0;
+        if (cfg.debug)
+            qDebug() << "parsing" << e.value;
+        NFA tok = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol);
+        if (tok.isEmpty()) {
+            qWarning("Parse error in line %d column %d while parsing token %s", e.lineNumber, errCol, e.key.toLocal8Bit().constData());
+            return DFA();
+        }
+        tok.setTerminationSymbol(e.key);
+        tokens.append(tok);
+    }
+
+    NFA giganticStateMachine;
+    foreach (NFA nfa, tokens)
+        if (giganticStateMachine.isEmpty())
+            giganticStateMachine = nfa;
+        else
+            giganticStateMachine = NFA::createAlternatingNFA(giganticStateMachine, nfa);
+
+    DFA result = giganticStateMachine.toDFA().minimize();
+    if (cfg.cache) {
+        QFileInfo ruleInfo(cfg.ruleFile);
+        QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa");
+        QFile f(cacheInfo.absoluteFilePath());
+        f.open(QIODevice::WriteOnly | QIODevice::Truncate);
+        QDataStream stream(&f);
+        stream << result;
+    }
+    return result;
+}
+
+#if !defined(AUTOTEST)
+int main(int argc, char **argv)
+{
+    QCoreApplication app(argc, argv);
+    QString ruleFile;
+    Config cfg;
+
+    const QStringList arguments = app.arguments().mid(1);
+    cfg.debug = arguments.contains("-debug");
+    const bool testRules = arguments.contains("-test");
+    cfg.cache = arguments.contains("-cache");
+
+    foreach (const QString &arg, arguments)
+        if (!arg.startsWith(QLatin1Char('-'))) {
+            ruleFile = arg;
+            break;
+        }
+
+    if (ruleFile.isEmpty()) {
+        qWarning("usage: lexgen [-test rulefile");
+        qWarning(" ");
+        qWarning("    the -test option will cause lexgen to interpret standard input");
+        qWarning("    according to the specified rules and print out pairs of token and");
+        qWarning("    lexical element");
+        return 1;
+    }
+
+    if (!loadConfig(ruleFile, &cfg))
+        return 1;
+
+    DFA machine = generateMachine(cfg);
+    if (machine.isEmpty())
+        return 1;
+
+    if (testRules) {
+        qWarning("Testing:");
+        QString input = QTextStream(stdin).readAll();
+        /*
+        qDebug() << "NFA has" << machine.stateCount() << "states";
+        qDebug() << "Converting to DFA... (this may take a while)";
+        DFA dfa = machine.toDFA();
+        qDebug() << "DFA has" << dfa.count() << "states";
+        qDebug() << "Minimizing...";
+        dfa = dfa.minimize();
+        qDebug() << "Minimized DFA has" << dfa.count() << "states";
+        */
+        DFA dfa = machine;
+        if (cfg.debug)
+            qDebug() << "tokenizing" << input;
+        bool ok = false;
+        QList<Symbol> symbols = tokenize(dfa, input, &cfg, &ok);
+        if (symbols.isEmpty()) {
+            qWarning("No tokens produced!");
+        } else {
+            foreach (Symbol s, symbols)
+                    qDebug() << s.token << ":" << s.lexem;
+        }
+        if (ok)
+            qDebug() << symbols.count() << "tokens produced.";
+        else
+            qDebug() << "Error while tokenizing!";
+    } else {
+        Generator gen(machine, cfg);
+        QTextStream(stdout) 
+            << gen.generate();
+    }
+
+    return 0;
+}
+#endif
+