diff -r 000000000000 -r 1918ee327afb util/lexgen/main.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/util/lexgen/main.cpp Mon Jan 11 14:00:40 2010 +0000 @@ -0,0 +1,323 @@ +/**************************************************************************** +** +** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). +** All rights reserved. +** Contact: Nokia Corporation (qt-info@nokia.com) +** +** This file is part of the utils of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the Technology Preview License Agreement accompanying +** this package. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain additional +** rights. These rights are described in the Nokia Qt LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. +** +** If you have questions regarding the use of this file, please contact +** Nokia at qt-info@nokia.com. +** +** +** +** +** +** +** +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include "nfa.h" +#include "re2nfa.h" +#include "configfile.h" +#include "generator.h" + +#include +#include +#include +#include + +struct Symbol +{ + QString token; + QString lexem; +}; + +static QList tokenize(const DFA &dfa, const QString &input, Config *cfg, bool *ok = 0) +{ + QList symbols; + Symbol lastSymbol; + int state = 0; + int lastAcceptingState = -1; + QString lastAcceptingLexem; + int lastAcceptingPos = -1; + for (int i = 0; i < input.length(); ++i) { + QChar ch = input.at(i); + QChar chForInput = ch; + if (cfg->caseSensitivity == Qt::CaseInsensitive) + chForInput = chForInput.toLower(); + int next = dfa.at(state).transitions.value(chForInput.unicode()); + if (cfg->debug) + qDebug() << "input" << input.at(i) << "leads to state" << next; + if (next) { + lastSymbol.lexem.append(input.at(i)); + lastSymbol.token = dfa.at(next).symbol; + if (!lastSymbol.token.isEmpty()) { + lastAcceptingState = next; + lastAcceptingLexem = lastSymbol.lexem; + lastAcceptingPos = i; + } + state = next; + } else { + if (lastAcceptingState != -1) { + if (cfg->debug) + qDebug() << "adding" << dfa.at(lastAcceptingState).symbol << "and backtracking to" << lastAcceptingPos; + Symbol s; + s.token = dfa.at(lastAcceptingState).symbol; + s.lexem = lastAcceptingLexem; + symbols << s; + lastSymbol = Symbol(); + state = 0; + i = lastAcceptingPos; + lastAcceptingPos = -1; + lastAcceptingState = -1; + continue; + } + if (state == 0 || lastSymbol.token.isEmpty()) { + if (cfg->debug) + qDebug() << "invalid input"; + if (ok) + *ok = false; + return symbols; + } + if (cfg->debug) + qDebug() << "appending symbol with token" << lastSymbol.token; + symbols << lastSymbol; + lastSymbol = Symbol(); + state = 0; + lastAcceptingState = -1; + --i; + } + } + if (!lastSymbol.token.isEmpty()) { + if (cfg->debug) + qDebug() << "appending (last) symbol with token" << lastSymbol.token; + symbols << lastSymbol; + } else if (lastAcceptingState != -1) { + if (cfg->debug) + qDebug() << "appending last accepting state with token" << dfa.at(lastAcceptingState).symbol; + Symbol s; + s.lexem = lastAcceptingLexem; + s.token = dfa.at(lastAcceptingState).symbol; + symbols << s; + } + if (ok) + *ok = true; + return symbols; +} + +static QSet determineMaxInputSet(const ConfigFile::Section §ion) +{ + QSet set; + + QString inputTypeName; + + foreach (const ConfigFile::Entry &entry, section) + if (entry.key == QLatin1String("InputType")) { + if (!inputTypeName.isEmpty()) { + qWarning("Error: InputType field specified multiple times in config file"); + return QSet(); + } + inputTypeName = entry.value; + } + + if (inputTypeName.isEmpty()) + inputTypeName = "quint8"; + + if (inputTypeName == "quint8") { + for (int i = 1; i < 256; ++i) + set.insert(i); + } /* else if ### */ + else { + qWarning("Error: Unknown input type '%s'", qPrintable(inputTypeName)); + return QSet(); + } + + return set; +} + +static bool loadConfig(const QString &ruleFile, Config *cfg) +{ + ConfigFile::SectionMap sections = ConfigFile::parse(ruleFile); + if (sections.isEmpty()) { + qWarning("Error parsing %s", qPrintable(ruleFile)); + return false; + } + + QSet maxInputSet = determineMaxInputSet(sections.value("Options")); + if (maxInputSet.isEmpty()) + return false; + + Qt::CaseSensitivity cs = Qt::CaseInsensitive; + if (sections.value("Options").contains("case-sensitive")) + cs = Qt::CaseSensitive; + + cfg->configSections = sections; + cfg->caseSensitivity = cs; + cfg->className = sections.value("Options").value("classname", "Scanner"); + cfg->maxInputSet = maxInputSet; + cfg->ruleFile = ruleFile; + return true; +} + +static DFA generateMachine(const Config &cfg) +{ + if (cfg.cache) { + QFileInfo ruleInfo(cfg.ruleFile); + QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa"); + if (cacheInfo.exists() + && cacheInfo.lastModified() > ruleInfo.lastModified()) { + QFile f(cacheInfo.absoluteFilePath()); + f.open(QIODevice::ReadOnly); + QDataStream stream(&f); + DFA machine; + stream >> machine; + return machine; + } + } + + QMap macros; + foreach (ConfigFile::Entry e, cfg.configSections.value("Macros")) { + int errCol = 0; + if (cfg.debug) + qDebug() << "parsing" << e.value; + NFA nfa = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol); + if (nfa.isEmpty()) { + qWarning("Parse error in line %d column %d", e.lineNumber, errCol); + return DFA(); + } + macros.insert(e.key, nfa); + } + + if (!cfg.configSections.contains("Tokens")) { + qWarning("Rule file does not contain a [Tokens] section!"); + return DFA(); + } + + QVector tokens; + + foreach (ConfigFile::Entry e, cfg.configSections.value("Tokens")) { + int errCol = 0; + if (cfg.debug) + qDebug() << "parsing" << e.value; + NFA tok = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol); + if (tok.isEmpty()) { + qWarning("Parse error in line %d column %d while parsing token %s", e.lineNumber, errCol, e.key.toLocal8Bit().constData()); + return DFA(); + } + tok.setTerminationSymbol(e.key); + tokens.append(tok); + } + + NFA giganticStateMachine; + foreach (NFA nfa, tokens) + if (giganticStateMachine.isEmpty()) + giganticStateMachine = nfa; + else + giganticStateMachine = NFA::createAlternatingNFA(giganticStateMachine, nfa); + + DFA result = giganticStateMachine.toDFA().minimize(); + if (cfg.cache) { + QFileInfo ruleInfo(cfg.ruleFile); + QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa"); + QFile f(cacheInfo.absoluteFilePath()); + f.open(QIODevice::WriteOnly | QIODevice::Truncate); + QDataStream stream(&f); + stream << result; + } + return result; +} + +#if !defined(AUTOTEST) +int main(int argc, char **argv) +{ + QCoreApplication app(argc, argv); + QString ruleFile; + Config cfg; + + const QStringList arguments = app.arguments().mid(1); + cfg.debug = arguments.contains("-debug"); + const bool testRules = arguments.contains("-test"); + cfg.cache = arguments.contains("-cache"); + + foreach (const QString &arg, arguments) + if (!arg.startsWith(QLatin1Char('-'))) { + ruleFile = arg; + break; + } + + if (ruleFile.isEmpty()) { + qWarning("usage: lexgen [-test rulefile"); + qWarning(" "); + qWarning(" the -test option will cause lexgen to interpret standard input"); + qWarning(" according to the specified rules and print out pairs of token and"); + qWarning(" lexical element"); + return 1; + } + + if (!loadConfig(ruleFile, &cfg)) + return 1; + + DFA machine = generateMachine(cfg); + if (machine.isEmpty()) + return 1; + + if (testRules) { + qWarning("Testing:"); + QString input = QTextStream(stdin).readAll(); + /* + qDebug() << "NFA has" << machine.stateCount() << "states"; + qDebug() << "Converting to DFA... (this may take a while)"; + DFA dfa = machine.toDFA(); + qDebug() << "DFA has" << dfa.count() << "states"; + qDebug() << "Minimizing..."; + dfa = dfa.minimize(); + qDebug() << "Minimized DFA has" << dfa.count() << "states"; + */ + DFA dfa = machine; + if (cfg.debug) + qDebug() << "tokenizing" << input; + bool ok = false; + QList symbols = tokenize(dfa, input, &cfg, &ok); + if (symbols.isEmpty()) { + qWarning("No tokens produced!"); + } else { + foreach (Symbol s, symbols) + qDebug() << s.token << ":" << s.lexem; + } + if (ok) + qDebug() << symbols.count() << "tokens produced."; + else + qDebug() << "Error while tokenizing!"; + } else { + Generator gen(machine, cfg); + QTextStream(stdout) + << gen.generate(); + } + + return 0; +} +#endif +