util/lexgen/main.cpp
author Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
Tue, 02 Feb 2010 00:43:10 +0200
changeset 3 41300fa6a67c
parent 0 1918ee327afb
child 4 3b1da2848fc7
permissions -rw-r--r--
Revision: 201003 Kit: 201005

/****************************************************************************
**
** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
** All rights reserved.
** Contact: Nokia Corporation (qt-info@nokia.com)
**
** This file is part of the utils of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** No Commercial Usage
** This file contains pre-release code and may not be distributed.
** You may use this file in accordance with the terms and conditions
** contained in the Technology Preview License Agreement accompanying
** this package.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 2.1 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL included in the
** packaging of this file.  Please review the following information to
** ensure the GNU Lesser General Public License version 2.1 requirements
** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
**
** In addition, as a special exception, Nokia gives you certain additional
** rights.  These rights are described in the Nokia Qt LGPL Exception
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
**
** If you have questions regarding the use of this file, please contact
** Nokia at qt-info@nokia.com.
**
**
**
**
**
**
**
**
** $QT_END_LICENSE$
**
****************************************************************************/

#include "nfa.h"
#include "re2nfa.h"
#include "configfile.h"
#include "generator.h"

#include <QFile>
#include <QCoreApplication>
#include <QFileInfo>
#include <QDateTime>

// One entry of the list returned by tokenize(): a recognized token together
// with the piece of input it was recognized from.
struct Symbol
{
    // Symbol of the accepting DFA state that matched (tokenize() copies it
    // from dfa.at(state).symbol).
    QString token;
    // The input characters that were consumed to form this token.
    QString lexem;
};

/*
    Splits \a input into a list of symbols by running it through \a dfa.

    Scanning starts in state 0 and follows transitions for as long as
    possible, remembering the most recent position at which the current
    state carried a non-empty symbol. When no further transition exists
    (or the input ends), the remembered token is emitted and scanning
    resumes in state 0 right after it.

    If cfg->caseSensitivity is Qt::CaseInsensitive the input characters are
    lower-cased for the transition lookup only; the emitted lexems keep the
    original spelling. If cfg->debug is set, progress is traced via qDebug().

    \a cfg must not be null. If \a ok is non-null it is set to true when the
    whole input was consumed and to false when a stretch of input could not
    be matched; in the latter case the symbols found so far are returned.
    Input that ends in the middle of a match is handled like any other
    unmatched input instead of being dropped silently.
*/
static QList<Symbol> tokenize(const DFA &dfa, const QString &input, Config *cfg, bool *ok = 0)
{
    QList<Symbol> symbols;
    const bool foldCase = (cfg->caseSensitivity == Qt::CaseInsensitive);

    QString lexem;                  // text consumed since the last emitted symbol
    int state = 0;
    int lastAcceptingState = -1;    // -1: nothing accepted since the last emitted symbol
    QString lastAcceptingLexem;
    int lastAcceptingPos = -1;

    int i = 0;
    for (;;) {
        // A lookup result of 0 is treated as "no transition". Reaching the
        // end of the input is handled exactly like such a dead end.
        int next = 0;
        if (i < input.length()) {
            const QChar ch = input.at(i);
            const QChar chForInput = foldCase ? ch.toLower() : ch;
            next = dfa.at(state).transitions.value(chForInput.unicode());
            if (cfg->debug)
                qDebug() << "input" << ch << "leads to state" << next;
        }

        if (next) {
            lexem.append(input.at(i));
            if (!dfa.at(next).symbol.isEmpty()) {
                // Remember the longest match seen so far.
                lastAcceptingState = next;
                lastAcceptingLexem = lexem;
                lastAcceptingPos = i;
            }
            state = next;
            ++i;
            continue;
        }

        if (lastAcceptingState != -1) {
            if (cfg->debug)
                qDebug() << "adding" << dfa.at(lastAcceptingState).symbol << "and backtracking to" << lastAcceptingPos;
            Symbol s;
            s.token = dfa.at(lastAcceptingState).symbol;
            s.lexem = lastAcceptingLexem;
            symbols << s;

            // Restart right behind the emitted lexem. Every emitted lexem
            // holds at least one character, so the scan always advances.
            lexem.clear();
            state = 0;
            i = lastAcceptingPos + 1;
            lastAcceptingPos = -1;
            lastAcceptingState = -1;
            continue;
        }

        // Nothing pending and nothing left to read: the input was consumed.
        if (i >= input.length() && lexem.isEmpty())
            break;

        if (cfg->debug)
            qDebug() << "invalid input";
        if (ok)
            *ok = false;
        return symbols;
    }

    if (ok)
        *ok = true;
    return symbols;
}

/*
    Builds the set of input values the scanner has to handle, based on the
    "InputType" entry of \a section.

    A missing (or empty) InputType is treated as "quint8", which yields the
    values 1..255. An InputType given more than once, or naming any other
    type, is reported with qWarning() and produces an empty set.
*/
static QSet<InputType> determineMaxInputSet(const ConfigFile::Section &section)
{
    QString typeName;

    foreach (const ConfigFile::Entry &entry, section) {
        if (entry.key != QLatin1String("InputType"))
            continue;
        if (!typeName.isEmpty()) {
            qWarning("Error: InputType field specified multiple times in config file");
            return QSet<InputType>();
        }
        typeName = entry.value;
    }

    if (typeName.isEmpty())
        typeName = "quint8";

    // "quint8" is the only input type implemented so far.
    if (typeName != "quint8") {
        qWarning("Error: Unknown input type '%s'", qPrintable(typeName));
        return QSet<InputType>();
    }

    QSet<InputType> inputs;
    int value = 1;
    while (value < 256) {
        inputs.insert(value);
        ++value;
    }
    return inputs;
}

static bool loadConfig(const QString &ruleFile, Config *cfg)
{
    ConfigFile::SectionMap sections = ConfigFile::parse(ruleFile);
    if (sections.isEmpty()) {
        qWarning("Error parsing %s", qPrintable(ruleFile));
        return false;
    }

    QSet<InputType> maxInputSet = determineMaxInputSet(sections.value("Options"));
    if (maxInputSet.isEmpty())
        return false;

    Qt::CaseSensitivity cs = Qt::CaseInsensitive;
    if (sections.value("Options").contains("case-sensitive"))
        cs = Qt::CaseSensitive;

    cfg->configSections = sections;
    cfg->caseSensitivity = cs;
    cfg->className = sections.value("Options").value("classname", "Scanner");
    cfg->maxInputSet = maxInputSet;
    cfg->ruleFile = ruleFile;
    return true;
}

/*
    Builds the minimized DFA for the rules stored in \a cfg.

    Every entry of the "Macros" section is parsed into an NFA that later
    expressions may refer to. Every entry of the "Tokens" section is parsed
    into an NFA whose termination symbol is the entry's key. The token NFAs
    are combined into one alternating NFA, which is converted to a DFA and
    minimized.

    If cfg.cache is set, the result is stored in "<rule file base name>.dfa"
    (a relative path) and a later call reuses that file as long as it is
    newer than the rule file. A cache file that cannot be opened or decoded
    is reported with qWarning() and the machine is regenerated. Failing to
    write the cache is reported as well, but does not affect the result.

    Returns an empty DFA if an expression fails to parse or if there is no
    "Tokens" section.
*/
static DFA generateMachine(const Config &cfg)
{
    QString cachePath;
    if (cfg.cache) {
        const QFileInfo ruleInfo(cfg.ruleFile);
        const QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa");
        cachePath = cacheInfo.absoluteFilePath();
        if (cacheInfo.exists()
            && cacheInfo.lastModified() > ruleInfo.lastModified()) {
            QFile f(cachePath);
            if (f.open(QIODevice::ReadOnly)) {
                QDataStream stream(&f);
                DFA machine;
                stream >> machine;
                if (stream.status() == QDataStream::Ok && !machine.isEmpty())
                    return machine;
            }
            // Fall through and rebuild rather than hand back an empty machine.
            qWarning("Could not read DFA cache %s, regenerating", qPrintable(cachePath));
        }
    }

    QMap<QString, NFA> macros;
    foreach (const ConfigFile::Entry &e, cfg.configSections.value("Macros")) {
        int errCol = 0;
        if (cfg.debug)
            qDebug() << "parsing" << e.value;
        NFA nfa = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol);
        if (nfa.isEmpty()) {
            qWarning("Parse error in line %d column %d", e.lineNumber, errCol);
            return DFA();
        }
        macros.insert(e.key, nfa);
    }

    if (!cfg.configSections.contains("Tokens")) {
        qWarning("Rule file does not contain a [Tokens] section!");
        return DFA();
    }

    QVector<NFA> tokens;

    foreach (const ConfigFile::Entry &e, cfg.configSections.value("Tokens")) {
        int errCol = 0;
        if (cfg.debug)
            qDebug() << "parsing" << e.value;
        NFA tok = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol);
        if (tok.isEmpty()) {
            qWarning("Parse error in line %d column %d while parsing token %s", e.lineNumber, errCol, e.key.toLocal8Bit().constData());
            return DFA();
        }
        tok.setTerminationSymbol(e.key);
        tokens.append(tok);
    }

    // Fold all token machines into a single alternation.
    NFA giganticStateMachine;
    foreach (const NFA &nfa, tokens) {
        if (giganticStateMachine.isEmpty())
            giganticStateMachine = nfa;
        else
            giganticStateMachine = NFA::createAlternatingNFA(giganticStateMachine, nfa);
    }

    DFA result = giganticStateMachine.toDFA().minimize();
    if (cfg.cache) {
        QFile f(cachePath);
        if (f.open(QIODevice::WriteOnly | QIODevice::Truncate)) {
            QDataStream stream(&f);
            stream << result;
        } else {
            qWarning("Could not write DFA cache %s", qPrintable(cachePath));
        }
    }
    return result;
}

#if !defined(AUTOTEST)
/*
    Entry point of lexgen.

    The first argument that does not start with '-' is taken as the rule
    file. Recognized options are -test, -debug and -cache. Without -test the
    generated scanner source is written to standard output. With -test,
    standard input is tokenized according to the rules and the resulting
    token/lexem pairs are printed.

    Returns 1 if no rule file was given, the rule file could not be loaded or
    no state machine could be built from it, and 0 otherwise.
*/
int main(int argc, char **argv)
{
    QCoreApplication app(argc, argv);
    QString ruleFile;
    Config cfg;

    const QStringList arguments = app.arguments().mid(1);
    cfg.debug = arguments.contains("-debug");
    const bool testRules = arguments.contains("-test");
    cfg.cache = arguments.contains("-cache");

    foreach (const QString &arg, arguments) {
        if (!arg.startsWith(QLatin1Char('-'))) {
            ruleFile = arg;
            break;
        }
    }

    if (ruleFile.isEmpty()) {
        qWarning("usage: lexgen [-test] [-debug] [-cache] rulefile");
        qWarning(" ");
        qWarning("    the -test option will cause lexgen to interpret standard input");
        qWarning("    according to the specified rules and print out pairs of token and");
        qWarning("    lexical element");
        qWarning("    the -debug option prints diagnostics while the rules are parsed and");
        qWarning("    while input is tokenized");
        qWarning("    the -cache option stores the generated DFA in <rulefile basename>.dfa");
        qWarning("    and reuses that file while it is newer than the rule file");
        return 1;
    }

    if (!loadConfig(ruleFile, &cfg))
        return 1;

    DFA machine = generateMachine(cfg);
    if (machine.isEmpty())
        return 1;

    if (testRules) {
        qWarning("Testing:");
        QString input = QTextStream(stdin).readAll();
        if (cfg.debug)
            qDebug() << "tokenizing" << input;
        bool ok = false;
        // generateMachine() already returns the minimized DFA, so it can be
        // used for tokenizing as is.
        QList<Symbol> symbols = tokenize(machine, input, &cfg, &ok);
        if (symbols.isEmpty()) {
            qWarning("No tokens produced!");
        } else {
            foreach (const Symbol &s, symbols)
                qDebug() << s.token << ":" << s.lexem;
        }
        if (ok)
            qDebug() << symbols.count() << "tokens produced.";
        else
            qDebug() << "Error while tokenizing!";
    } else {
        Generator gen(machine, cfg);
        QTextStream(stdout)
            << gen.generate();
    }

    return 0;
}
#endif