util/lexgen/main.cpp
changeset 0 1918ee327afb
child 4 3b1da2848fc7
equal deleted inserted replaced
-1:000000000000 0:1918ee327afb
       
     1 /****************************************************************************
       
     2 **
       
     3 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
       
     4 ** All rights reserved.
       
     5 ** Contact: Nokia Corporation (qt-info@nokia.com)
       
     6 **
       
     7 ** This file is part of the utils of the Qt Toolkit.
       
     8 **
       
     9 ** $QT_BEGIN_LICENSE:LGPL$
       
    10 ** No Commercial Usage
       
    11 ** This file contains pre-release code and may not be distributed.
       
    12 ** You may use this file in accordance with the terms and conditions
       
    13 ** contained in the Technology Preview License Agreement accompanying
       
    14 ** this package.
       
    15 **
       
    16 ** GNU Lesser General Public License Usage
       
    17 ** Alternatively, this file may be used under the terms of the GNU Lesser
       
    18 ** General Public License version 2.1 as published by the Free Software
       
    19 ** Foundation and appearing in the file LICENSE.LGPL included in the
       
    20 ** packaging of this file.  Please review the following information to
       
    21 ** ensure the GNU Lesser General Public License version 2.1 requirements
       
    22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
       
    23 **
       
    24 ** In addition, as a special exception, Nokia gives you certain additional
       
    25 ** rights.  These rights are described in the Nokia Qt LGPL Exception
       
    26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
       
    27 **
       
    28 ** If you have questions regarding the use of this file, please contact
       
    29 ** Nokia at qt-info@nokia.com.
       
    30 **
       
    31 **
       
    32 **
       
    33 **
       
    34 **
       
    35 **
       
    36 **
       
    37 **
       
    38 ** $QT_END_LICENSE$
       
    39 **
       
    40 ****************************************************************************/
       
    41 
       
    42 #include "nfa.h"
       
    43 #include "re2nfa.h"
       
    44 #include "configfile.h"
       
    45 #include "generator.h"
       
    46 
       
    47 #include <QFile>
       
    48 #include <QCoreApplication>
       
    49 #include <QFileInfo>
       
    50 #include <QDateTime>
       
    51 
       
     // One lexical unit produced by tokenize(): the symbol of the DFA state
     // that accepted the text, together with the text itself.
     struct Symbol
     {
         QString token; // symbol name taken from the accepting DFA state
         QString lexem; // input characters that were consumed for this token
     };
       
    57 
       
    58 static QList<Symbol> tokenize(const DFA &dfa, const QString &input, Config *cfg, bool *ok = 0)
       
    59 {
       
    60     QList<Symbol> symbols;
       
    61     Symbol lastSymbol;
       
    62     int state = 0;
       
    63     int lastAcceptingState = -1;
       
    64     QString lastAcceptingLexem;
       
    65     int lastAcceptingPos = -1;
       
    66     for (int i = 0; i < input.length(); ++i) {
       
    67         QChar ch = input.at(i);
       
    68         QChar chForInput = ch;
       
    69         if (cfg->caseSensitivity == Qt::CaseInsensitive)
       
    70             chForInput = chForInput.toLower();
       
    71         int next = dfa.at(state).transitions.value(chForInput.unicode());
       
    72         if (cfg->debug)
       
    73             qDebug() << "input" << input.at(i) << "leads to state" << next;
       
    74         if (next) {
       
    75             lastSymbol.lexem.append(input.at(i));
       
    76             lastSymbol.token = dfa.at(next).symbol;
       
    77             if (!lastSymbol.token.isEmpty()) {
       
    78                 lastAcceptingState = next;
       
    79                 lastAcceptingLexem = lastSymbol.lexem;
       
    80                 lastAcceptingPos = i;
       
    81             }
       
    82             state = next;
       
    83         } else {
       
    84             if (lastAcceptingState != -1) {
       
    85                 if (cfg->debug)
       
    86                     qDebug() << "adding" << dfa.at(lastAcceptingState).symbol << "and backtracking to" << lastAcceptingPos;
       
    87                 Symbol s;
       
    88                 s.token = dfa.at(lastAcceptingState).symbol;
       
    89                 s.lexem = lastAcceptingLexem;
       
    90                 symbols << s;
       
    91                 lastSymbol = Symbol();
       
    92                 state = 0;
       
    93                 i = lastAcceptingPos;
       
    94                 lastAcceptingPos = -1;
       
    95                 lastAcceptingState = -1;
       
    96                 continue;
       
    97             }
       
    98             if (state == 0 || lastSymbol.token.isEmpty()) {
       
    99                 if (cfg->debug)
       
   100                     qDebug() << "invalid input";
       
   101                 if (ok)
       
   102                     *ok = false;
       
   103                 return symbols;
       
   104             }
       
   105             if (cfg->debug)
       
   106                 qDebug() << "appending symbol with token" << lastSymbol.token;
       
   107             symbols << lastSymbol;
       
   108             lastSymbol = Symbol();
       
   109             state = 0;
       
   110             lastAcceptingState = -1;
       
   111             --i;
       
   112         }
       
   113     }
       
   114     if (!lastSymbol.token.isEmpty()) {
       
   115         if (cfg->debug)
       
   116             qDebug() << "appending (last) symbol with token" << lastSymbol.token;
       
   117         symbols << lastSymbol;
       
   118     } else if (lastAcceptingState != -1) {
       
   119         if (cfg->debug)
       
   120             qDebug() << "appending last accepting state with token" << dfa.at(lastAcceptingState).symbol;
       
   121         Symbol s;
       
   122         s.lexem = lastAcceptingLexem;
       
   123         s.token = dfa.at(lastAcceptingState).symbol;
       
   124         symbols << s;
       
   125     }
       
   126     if (ok)
       
   127         *ok = true;
       
   128     return symbols;
       
   129 }
       
   130 
       
   131 static QSet<InputType> determineMaxInputSet(const ConfigFile::Section &section)
       
   132 {
       
   133     QSet<InputType> set;
       
   134 
       
   135     QString inputTypeName;
       
   136 
       
   137     foreach (const ConfigFile::Entry &entry, section)
       
   138         if (entry.key == QLatin1String("InputType")) {
       
   139             if (!inputTypeName.isEmpty()) {
       
   140                 qWarning("Error: InputType field specified multiple times in config file");
       
   141                 return QSet<InputType>();
       
   142             }
       
   143             inputTypeName = entry.value;
       
   144         }
       
   145 
       
   146     if (inputTypeName.isEmpty())
       
   147         inputTypeName = "quint8";
       
   148 
       
   149     if (inputTypeName == "quint8") {
       
   150         for (int i = 1; i < 256; ++i)
       
   151             set.insert(i);
       
   152     } /* else if ### */ 
       
   153     else {
       
   154         qWarning("Error: Unknown input type '%s'", qPrintable(inputTypeName));
       
   155         return QSet<InputType>();
       
   156     }
       
   157 
       
   158     return set;
       
   159 }
       
   160 
       
   161 static bool loadConfig(const QString &ruleFile, Config *cfg)
       
   162 {
       
   163     ConfigFile::SectionMap sections = ConfigFile::parse(ruleFile);
       
   164     if (sections.isEmpty()) {
       
   165         qWarning("Error parsing %s", qPrintable(ruleFile));
       
   166         return false;
       
   167     }
       
   168 
       
   169     QSet<InputType> maxInputSet = determineMaxInputSet(sections.value("Options"));
       
   170     if (maxInputSet.isEmpty())
       
   171         return false;
       
   172 
       
   173     Qt::CaseSensitivity cs = Qt::CaseInsensitive;
       
   174     if (sections.value("Options").contains("case-sensitive"))
       
   175         cs = Qt::CaseSensitive;
       
   176 
       
   177     cfg->configSections = sections;
       
   178     cfg->caseSensitivity = cs;
       
   179     cfg->className = sections.value("Options").value("classname", "Scanner");
       
   180     cfg->maxInputSet = maxInputSet;
       
   181     cfg->ruleFile = ruleFile;
       
   182     return true;
       
   183 }
       
   184 
       
   185 static DFA generateMachine(const Config &cfg)
       
   186 {
       
   187     if (cfg.cache) {
       
   188         QFileInfo ruleInfo(cfg.ruleFile);
       
   189         QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa");
       
   190         if (cacheInfo.exists()
       
   191             && cacheInfo.lastModified() > ruleInfo.lastModified()) {
       
   192             QFile f(cacheInfo.absoluteFilePath());
       
   193             f.open(QIODevice::ReadOnly);
       
   194             QDataStream stream(&f);
       
   195             DFA machine;
       
   196             stream >> machine;
       
   197             return machine;
       
   198         }
       
   199     }
       
   200 
       
   201     QMap<QString, NFA> macros;
       
   202     foreach (ConfigFile::Entry e, cfg.configSections.value("Macros")) {
       
   203         int errCol = 0;
       
   204         if (cfg.debug)
       
   205             qDebug() << "parsing" << e.value;
       
   206         NFA nfa = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol);
       
   207         if (nfa.isEmpty()) {
       
   208             qWarning("Parse error in line %d column %d", e.lineNumber, errCol);
       
   209             return DFA();
       
   210         }
       
   211         macros.insert(e.key, nfa);
       
   212     }
       
   213 
       
   214     if (!cfg.configSections.contains("Tokens")) {
       
   215         qWarning("Rule file does not contain a [Tokens] section!");
       
   216         return DFA();
       
   217     }
       
   218 
       
   219     QVector<NFA> tokens;
       
   220 
       
   221     foreach (ConfigFile::Entry e, cfg.configSections.value("Tokens")) {
       
   222         int errCol = 0;
       
   223         if (cfg.debug)
       
   224             qDebug() << "parsing" << e.value;
       
   225         NFA tok = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol);
       
   226         if (tok.isEmpty()) {
       
   227             qWarning("Parse error in line %d column %d while parsing token %s", e.lineNumber, errCol, e.key.toLocal8Bit().constData());
       
   228             return DFA();
       
   229         }
       
   230         tok.setTerminationSymbol(e.key);
       
   231         tokens.append(tok);
       
   232     }
       
   233 
       
   234     NFA giganticStateMachine;
       
   235     foreach (NFA nfa, tokens)
       
   236         if (giganticStateMachine.isEmpty())
       
   237             giganticStateMachine = nfa;
       
   238         else
       
   239             giganticStateMachine = NFA::createAlternatingNFA(giganticStateMachine, nfa);
       
   240 
       
   241     DFA result = giganticStateMachine.toDFA().minimize();
       
   242     if (cfg.cache) {
       
   243         QFileInfo ruleInfo(cfg.ruleFile);
       
   244         QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa");
       
   245         QFile f(cacheInfo.absoluteFilePath());
       
   246         f.open(QIODevice::WriteOnly | QIODevice::Truncate);
       
   247         QDataStream stream(&f);
       
   248         stream << result;
       
   249     }
       
   250     return result;
       
   251 }
       
   252 
       
   253 #if !defined(AUTOTEST)
       
   254 int main(int argc, char **argv)
       
   255 {
       
   256     QCoreApplication app(argc, argv);
       
   257     QString ruleFile;
       
   258     Config cfg;
       
   259 
       
   260     const QStringList arguments = app.arguments().mid(1);
       
   261     cfg.debug = arguments.contains("-debug");
       
   262     const bool testRules = arguments.contains("-test");
       
   263     cfg.cache = arguments.contains("-cache");
       
   264 
       
   265     foreach (const QString &arg, arguments)
       
   266         if (!arg.startsWith(QLatin1Char('-'))) {
       
   267             ruleFile = arg;
       
   268             break;
       
   269         }
       
   270 
       
   271     if (ruleFile.isEmpty()) {
       
   272         qWarning("usage: lexgen [-test rulefile");
       
   273         qWarning(" ");
       
   274         qWarning("    the -test option will cause lexgen to interpret standard input");
       
   275         qWarning("    according to the specified rules and print out pairs of token and");
       
   276         qWarning("    lexical element");
       
   277         return 1;
       
   278     }
       
   279 
       
   280     if (!loadConfig(ruleFile, &cfg))
       
   281         return 1;
       
   282 
       
   283     DFA machine = generateMachine(cfg);
       
   284     if (machine.isEmpty())
       
   285         return 1;
       
   286 
       
   287     if (testRules) {
       
   288         qWarning("Testing:");
       
   289         QString input = QTextStream(stdin).readAll();
       
   290         /*
       
   291         qDebug() << "NFA has" << machine.stateCount() << "states";
       
   292         qDebug() << "Converting to DFA... (this may take a while)";
       
   293         DFA dfa = machine.toDFA();
       
   294         qDebug() << "DFA has" << dfa.count() << "states";
       
   295         qDebug() << "Minimizing...";
       
   296         dfa = dfa.minimize();
       
   297         qDebug() << "Minimized DFA has" << dfa.count() << "states";
       
   298         */
       
   299         DFA dfa = machine;
       
   300         if (cfg.debug)
       
   301             qDebug() << "tokenizing" << input;
       
   302         bool ok = false;
       
   303         QList<Symbol> symbols = tokenize(dfa, input, &cfg, &ok);
       
   304         if (symbols.isEmpty()) {
       
   305             qWarning("No tokens produced!");
       
   306         } else {
       
   307             foreach (Symbol s, symbols)
       
   308                     qDebug() << s.token << ":" << s.lexem;
       
   309         }
       
   310         if (ok)
       
   311             qDebug() << symbols.count() << "tokens produced.";
       
   312         else
       
   313             qDebug() << "Error while tokenizing!";
       
   314     } else {
       
   315         Generator gen(machine, cfg);
       
   316         QTextStream(stdout) 
       
   317             << gen.generate();
       
   318     }
       
   319 
       
   320     return 0;
       
   321 }
       
   322 #endif
       
   323