util/lexgen/re2nfa.cpp
changeset 0 1918ee327afb
child 4 3b1da2848fc7
equal deleted inserted replaced
-1:000000000000 0:1918ee327afb
       
     1 /****************************************************************************
       
     2 **
       
     3 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
       
     4 ** All rights reserved.
       
     5 ** Contact: Nokia Corporation (qt-info@nokia.com)
       
     6 **
       
     7 ** This file is part of the utils of the Qt Toolkit.
       
     8 **
       
     9 ** $QT_BEGIN_LICENSE:LGPL$
       
    10 ** No Commercial Usage
       
    11 ** This file contains pre-release code and may not be distributed.
       
    12 ** You may use this file in accordance with the terms and conditions
       
    13 ** contained in the Technology Preview License Agreement accompanying
       
    14 ** this package.
       
    15 **
       
    16 ** GNU Lesser General Public License Usage
       
    17 ** Alternatively, this file may be used under the terms of the GNU Lesser
       
    18 ** General Public License version 2.1 as published by the Free Software
       
    19 ** Foundation and appearing in the file LICENSE.LGPL included in the
       
    20 ** packaging of this file.  Please review the following information to
       
    21 ** ensure the GNU Lesser General Public License version 2.1 requirements
       
    22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
       
    23 **
       
    24 ** In addition, as a special exception, Nokia gives you certain additional
       
    25 ** rights.  These rights are described in the Nokia Qt LGPL Exception
       
    26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
       
    27 **
       
    28 ** If you have questions regarding the use of this file, please contact
       
    29 ** Nokia at qt-info@nokia.com.
       
    30 **
       
    31 **
       
    32 **
       
    33 **
       
    34 **
       
    35 **
       
    36 **
       
    37 **
       
    38 ** $QT_END_LICENSE$
       
    39 **
       
    40 ****************************************************************************/
       
    41 
       
    42 #include "re2nfa.h"
       
    43 #include "tokenizer.cpp"
       
    44 
       
    45 RE2NFA::RE2NFA(const QMap<QString, NFA> &macros, const QSet<InputType> &maxInputSet, Qt::CaseSensitivity cs)
       
    46     : macros(macros), index(0), errorColumn(-1), maxInputSet(maxInputSet), caseSensitivity(cs)
       
    47 {
       
    48 }
       
    49 
       
    50 NFA RE2NFA::parse(const QString &expression, int *errCol)
       
    51 {
       
    52     tokenize(expression);
       
    53 
       
    54     if (symbols.isEmpty())
       
    55         return NFA();
       
    56 
       
    57     index = 0;
       
    58 
       
    59     NFA result = parseExpr();
       
    60     if (result.isEmpty()) {
       
    61         if (errCol)
       
    62             *errCol = errorColumn;
       
    63     }
       
    64     return result;
       
    65 }
       
    66 
       
    67 NFA RE2NFA::parseExpr()
       
    68 {
       
    69     NFA value = parseBranch();
       
    70     while (test(TOK_OR)) {
       
    71         NFA rhs = parseBranch();
       
    72         value = NFA::createAlternatingNFA(value, rhs);
       
    73     }
       
    74     return value;
       
    75 }
       
    76 
       
    77 NFA RE2NFA::parseBranch()
       
    78 {
       
    79     NFA value = parsePiece();
       
    80     if (!hasNext())
       
    81         return value;
       
    82     NFA next;
       
    83     do {
       
    84         next = parsePiece();
       
    85         if (!next.isEmpty())
       
    86             value = NFA::createConcatenatingNFA(value, next);
       
    87     } while (!next.isEmpty() && hasNext());
       
    88     return value;
       
    89 }
       
    90 
       
    91 NFA RE2NFA::parsePiece()
       
    92 {
       
    93     NFA atom = parseAtom();
       
    94     if (atom.isEmpty() || !hasNext())
       
    95         return atom;
       
    96     return parseMaybeQuantifier(atom);
       
    97 }
       
    98 
       
    99 NFA RE2NFA::parseAtom()
       
   100 {
       
   101     // ####
       
   102     switch (next()) {
       
   103         case TOK_STRING:
       
   104             return createCharNFA();
       
   105         case TOK_LPAREN: {
       
   106             NFA subExpr = parseExpr();
       
   107             next(TOK_RPAREN);
       
   108             return subExpr;
       
   109         }
       
   110         case TOK_LBRACE: {
       
   111             QString macroName = lexemUntil(TOK_RBRACE);
       
   112             QMap<QString, NFA>::ConstIterator macro = macros.find(macroName);
       
   113             if (macro == macros.end()) {
       
   114                 qWarning("Unknown macro '%s' - probably used before defined", qPrintable(macroName));
       
   115                 return NFA();
       
   116             }
       
   117             return *macro;
       
   118         }
       
   119         case TOK_LBRACKET: {
       
   120             NFA set = parseSet();
       
   121             next(TOK_RBRACKET);
       
   122             return set;
       
   123         }
       
   124         case TOK_SEQUENCE:
       
   125             return parseSet2();
       
   126         case TOK_DOT:
       
   127             return NFA::createSetNFA(maxInputSet);
       
   128         default:
       
   129             prev();
       
   130             return NFA();
       
   131     }
       
   132 }
       
   133 
       
   134 NFA RE2NFA::parseMaybeQuantifier(const NFA &nfa)
       
   135 {
       
   136     // ####
       
   137     switch (next()) {
       
   138         case TOK_STAR:
       
   139             return NFA::createOptionalNFA(nfa);
       
   140         case TOK_QUESTION:
       
   141             return NFA::createZeroOrOneNFA(nfa);
       
   142         case TOK_PLUS:
       
   143             return NFA::createConcatenatingNFA(nfa, NFA::createOptionalNFA(nfa));
       
   144         case TOK_LBRACE: {
       
   145               const int rewind = index - 1;
       
   146 
       
   147               QString lexemBeforeComma;
       
   148               QString lexemAfterComma;
       
   149               bool seenComma = false;
       
   150               forever {
       
   151                   if (test(TOK_COMMA)) {
       
   152                       if (seenComma) {
       
   153                           errorColumn = symbol().column;
       
   154                           return NFA();
       
   155                       }
       
   156                       seenComma = true;
       
   157                   } else if (test(TOK_RBRACE)) {
       
   158                       break;
       
   159                   } else {
       
   160                       next(TOK_STRING);
       
   161                       if (seenComma)
       
   162                           lexemAfterComma += symbol().lexem;
       
   163                       else
       
   164                           lexemBeforeComma += symbol().lexem;
       
   165                   }
       
   166               }
       
   167               bool isNumber = false;
       
   168               int min = lexemBeforeComma.toInt(&isNumber);
       
   169               if (!isNumber) {
       
   170                   index = rewind;
       
   171                   return nfa;
       
   172               }
       
   173               int max = min;
       
   174               if (seenComma) {
       
   175                   max = lexemAfterComma.toInt(&isNumber);
       
   176                   if (!isNumber) {
       
   177                       errorColumn = symbol().column;
       
   178                       return NFA();
       
   179                   }
       
   180               }
       
   181               return NFA::applyQuantity(nfa, min, max);
       
   182         }
       
   183         default:
       
   184             prev();
       
   185             return nfa;
       
   186     }
       
   187 }
       
   188 
       
   189 NFA RE2NFA::parseSet()
       
   190 {
       
   191     QSet<InputType> set;
       
   192     bool negate = false;
       
   193 
       
   194     next(TOK_STRING);
       
   195 
       
   196     do {
       
   197         Q_ASSERT(symbol().lexem.length() == 1);
       
   198         // ###
       
   199         QChar ch = symbol().lexem.at(0);
       
   200         if (set.isEmpty() && ch == QLatin1Char('^')) {
       
   201             negate = true;
       
   202             continue;
       
   203         }
       
   204 
       
   205         // look ahead for ranges like a-z
       
   206         bool rangeFound = false;
       
   207         if (test(TOK_STRING)) {
       
   208             if (symbol().lexem.length() == 1
       
   209                 && symbol().lexem.at(0) == QLatin1Char('-')) {
       
   210                 next(TOK_STRING);
       
   211                 Q_ASSERT(symbol().lexem.length() == 1);
       
   212                 QChar last = symbol().lexem.at(0);
       
   213 
       
   214                 if (ch.unicode() > last.unicode())
       
   215                     qSwap(ch, last);
       
   216 
       
   217                 for (ushort i = ch.unicode(); i <= last.unicode(); ++i) {
       
   218                     if (caseSensitivity == Qt::CaseInsensitive) {
       
   219                         set.insert(QChar(i).toLower().unicode());
       
   220                     } else {
       
   221                         set.insert(i);
       
   222                     }
       
   223                 }
       
   224 
       
   225                 rangeFound = true;
       
   226             } else {
       
   227                 prev();
       
   228             }
       
   229         }
       
   230 
       
   231         if (!rangeFound) {
       
   232             if (caseSensitivity == Qt::CaseInsensitive) {
       
   233                 set.insert(ch.toLower().unicode());
       
   234             } else {
       
   235                 set.insert(ch.unicode());
       
   236             }
       
   237         }
       
   238     } while (test(TOK_STRING));
       
   239 
       
   240     if (negate) {
       
   241         QSet<InputType> negatedSet = maxInputSet;
       
   242         negatedSet.subtract(set);
       
   243         set = negatedSet;
       
   244     }
       
   245 
       
   246     return NFA::createSetNFA(set);
       
   247 }
       
   248 
       
   249 NFA RE2NFA::parseSet2()
       
   250 {
       
   251     QSet<InputType> set;
       
   252     bool negate = false;
       
   253 
       
   254     QString str = symbol().lexem;
       
   255     // strip off brackets
       
   256     str.chop(1);
       
   257     str.remove(0, 1);
       
   258 
       
   259     int i = 0;
       
   260     while (i < str.length()) {
       
   261         // ###
       
   262         QChar ch = str.at(i++);
       
   263         if (set.isEmpty() && ch == QLatin1Char('^')) {
       
   264             negate = true;
       
   265             continue;
       
   266         }
       
   267 
       
   268         // look ahead for ranges like a-z
       
   269         bool rangeFound = false;
       
   270         if (i < str.length() - 1 && str.at(i) == QLatin1Char('-')) {
       
   271             ++i;
       
   272             QChar last = str.at(i++);
       
   273 
       
   274             if (ch.unicode() > last.unicode())
       
   275                 qSwap(ch, last);
       
   276 
       
   277             for (ushort i = ch.unicode(); i <= last.unicode(); ++i) {
       
   278                 if (caseSensitivity == Qt::CaseInsensitive) {
       
   279                     set.insert(QChar(i).toLower().unicode());
       
   280                 } else {
       
   281                     set.insert(i);
       
   282                 }
       
   283             }
       
   284 
       
   285             rangeFound = true;
       
   286         }
       
   287 
       
   288         if (!rangeFound) {
       
   289             if (caseSensitivity == Qt::CaseInsensitive) {
       
   290                 set.insert(ch.toLower().unicode());
       
   291             } else {
       
   292                 set.insert(ch.unicode());
       
   293             }
       
   294         }
       
   295     }
       
   296 
       
   297     if (negate) {
       
   298         QSet<InputType> negatedSet = maxInputSet;
       
   299         negatedSet.subtract(set);
       
   300         set = negatedSet;
       
   301     }
       
   302 
       
   303     return NFA::createSetNFA(set);
       
   304 }
       
   305 NFA RE2NFA::createCharNFA()
       
   306 {
       
   307     NFA nfa;
       
   308     // ####
       
   309     if (caseSensitivity == Qt::CaseInsensitive) {
       
   310         nfa = NFA::createStringNFA(symbol().lexem.toLower().toLatin1());
       
   311     } else {
       
   312         nfa = NFA::createStringNFA(symbol().lexem.toLatin1());
       
   313     }
       
   314     return nfa;
       
   315 }
       
   316 
       
   317 static inline int skipQuote(const QString &str, int pos)
       
   318 {
       
   319     while (pos < str.length()
       
   320            && str.at(pos) != QLatin1Char('"')) {
       
   321         if (str.at(pos) == QLatin1Char('\\')) {
       
   322             ++pos;
       
   323             if (pos >= str.length())
       
   324                 break;
       
   325         }
       
   326         ++pos;
       
   327     }
       
   328     if (pos < str.length())
       
   329         ++pos;
       
   330     return pos;
       
   331 }
       
   332 
       
   333 #if 0
       
   334 static const char*tokStr(Token t)
       
   335 {
       
   336     switch (t) {
       
   337         case TOK_INVALID: return "TOK_INVALID";
       
   338         case TOK_STRING: return "TOK_STRING";
       
   339         case TOK_LBRACE: return "TOK_LBRACE";
       
   340         case TOK_RBRACE: return "TOK_RBRACE";
       
   341         case TOK_LBRACKET: return "TOK_LBRACKET";
       
   342         case TOK_RBRACKET: return "TOK_RBRACKET";
       
   343         case TOK_LPAREN: return "TOK_LPAREN";
       
   344         case TOK_RPAREN: return "TOK_RPAREN";
       
   345         case TOK_COMMA: return "TOK_COMMA";
       
   346         case TOK_STAR: return "TOK_STAR";
       
   347         case TOK_OR: return "TOK_OR";
       
   348         case TOK_QUESTION: return "TOK_QUESTION";
       
   349         case TOK_DOT: return "TOK_DOT";
       
   350         case TOK_PLUS: return "TOK_PLUS";
       
   351         case TOK_SEQUENCE: return "TOK_SEQUENCE";
       
   352         case TOK_QUOTED_STRING: return "TOK_QUOTED_STRING";
       
   353     }
       
   354     return "";
       
   355 }
       
   356 #endif
       
   357 
       
   358 void RE2NFA::tokenize(const QString &input)
       
   359 {
       
   360     symbols.clear();
       
   361 #if 1
       
   362     RegExpTokenizer tokenizer(input);
       
   363     Symbol sym;
       
   364     int tok = tokenizer.lex();
       
   365     while (tok != -1) {
       
   366         Symbol sym;
       
   367         sym.token = static_cast<Token>(tok);
       
   368         sym.lexem = input.mid(tokenizer.lexemStart, tokenizer.lexemLength);
       
   369 
       
   370         if (sym.token == TOK_QUOTED_STRING) {
       
   371             sym.lexem.chop(1);
       
   372             sym.lexem.remove(0, 1);
       
   373             sym.token = TOK_STRING;
       
   374         }
       
   375 
       
   376         if (sym.token == TOK_STRING || sym.token == TOK_SEQUENCE) {
       
   377             for (int i = 0; i < sym.lexem.length(); ++i) {
       
   378                 if (sym.lexem.at(i) == '\\') {
       
   379                     if (i >= sym.lexem.length() - 1)
       
   380                         break;
       
   381                     QChar ch = sym.lexem.at(i + 1);
       
   382                     if (ch == QLatin1Char('n')) {
       
   383                         ch = '\n';
       
   384                     } else if (ch == QLatin1Char('r')) {
       
   385                         ch = '\r';
       
   386                     } else if (ch == QLatin1Char('t')) {
       
   387                         ch = '\t';
       
   388                     } else if (ch == QLatin1Char('f')) {
       
   389                         ch = '\f';
       
   390                     }
       
   391                     sym.lexem.replace(i, 2, ch);
       
   392                 }
       
   393             }
       
   394         }
       
   395 
       
   396         /*
       
   397         if (sym.token == TOK_SEQUENCE) {
       
   398             Symbol s;
       
   399             s.token = TOK_LBRACKET;
       
   400             s.lexem = "[";
       
   401             symbols.append(s);
       
   402 
       
   403             for (int i = 1; i < sym.lexem.length() - 1; ++i) {
       
   404                 s.token = TOK_STRING;
       
   405                 s.lexem = sym.lexem.at(i);
       
   406                 symbols.append(s);
       
   407             }
       
   408 
       
   409             s.token = TOK_RBRACKET;
       
   410             s.lexem = "]";
       
   411             symbols.append(s);
       
   412 
       
   413             tok = tokenizer.lex();
       
   414             continue;
       
   415         }
       
   416         */
       
   417 
       
   418         symbols.append(sym);
       
   419         tok = tokenizer.lex();
       
   420     }
       
   421 #else
       
   422     int pos = 0;
       
   423     bool insideSet = false;
       
   424     while (pos < input.length()) {
       
   425         QChar ch = input.at(pos);
       
   426 
       
   427         Symbol sym;
       
   428         sym.column = pos;
       
   429         sym.token = TOK_INVALID;
       
   430         sym.lexem = QString(ch);
       
   431         switch (ch.toLatin1()) {
       
   432             case '"': {
       
   433                 if (insideSet) {
       
   434                     sym.token = TOK_STRING;
       
   435                     sym.lexem = QString(ch);
       
   436                     symbols += sym;
       
   437                     ++pos;
       
   438                     continue;
       
   439                 }
       
   440                 if (pos + 1 >= input.length())
       
   441                     return;
       
   442                 int quoteEnd = skipQuote(input, pos + 1);
       
   443                 sym.token = TOK_STRING;
       
   444                 sym.lexem = input.mid(pos + 1, quoteEnd - pos - 2);
       
   445                 symbols += sym;
       
   446                 pos = quoteEnd;
       
   447                 continue;
       
   448             }
       
   449             case '{':
       
   450                 sym.token = (insideSet ? TOK_STRING : TOK_LBRACE);
       
   451                 break;
       
   452             case '}':
       
   453                 sym.token = (insideSet ? TOK_STRING : TOK_RBRACE);
       
   454                 break;
       
   455             case '[':
       
   456                 insideSet = true;
       
   457                 sym.token = TOK_LBRACKET;
       
   458                 break;
       
   459             case ']':
       
   460                 insideSet = false;
       
   461                 sym.token = TOK_RBRACKET;
       
   462                 break;
       
   463             case '(':
       
   464                 sym.token = (insideSet ? TOK_STRING : TOK_LPAREN);
       
   465                 break;
       
   466             case ')':
       
   467                 sym.token = (insideSet ? TOK_STRING : TOK_RPAREN);
       
   468                 break;
       
   469             case ',':
       
   470                 sym.token = (insideSet ? TOK_STRING : TOK_COMMA);
       
   471                 break;
       
   472             case '*':
       
   473                 sym.token = (insideSet ? TOK_STRING : TOK_STAR);
       
   474                 break;
       
   475             case '|':
       
   476                 sym.token = (insideSet ? TOK_STRING : TOK_OR);
       
   477                 break;
       
   478             case '?':
       
   479                 sym.token = (insideSet ? TOK_STRING : TOK_QUESTION);
       
   480                 break;
       
   481             case '.':
       
   482                 sym.token = (insideSet ? TOK_STRING : TOK_DOT);
       
   483                 break;
       
   484             case '+':
       
   485                 sym.token = (insideSet ? TOK_STRING : TOK_PLUS);
       
   486                 break;
       
   487             case '\\':
       
   488                 ++pos;
       
   489                 if (pos >= input.length())
       
   490                     return;
       
   491                 ch = input.at(pos);
       
   492                 if (ch == QLatin1Char('n')) {
       
   493                     ch = '\n';
       
   494                 } else if (ch == QLatin1Char('r')) {
       
   495                     ch = '\r';
       
   496                 } else if (ch == QLatin1Char('t')) {
       
   497                     ch = '\t';
       
   498                 } else if (ch == QLatin1Char('f')) {
       
   499                     ch = '\f';
       
   500                 }
       
   501                 // fall through
       
   502             default:
       
   503                 sym.token = TOK_STRING;
       
   504                 sym.lexem = QString(ch);
       
   505                 symbols += sym;
       
   506                 ++pos;
       
   507                 continue;
       
   508         }
       
   509         symbols += sym;
       
   510         ++pos;
       
   511     }
       
   512 #endif
       
   513 #if 0
       
   514     foreach (Symbol s, symbols) {
       
   515         qDebug() << "Tok" << tokStr(s.token) << "lexem" << s.lexem;
       
   516     }
       
   517 #endif
       
   518 }
       
   519 
       
   520 bool RE2NFA::next(Token t)
       
   521 {
       
   522     if (hasNext() && next() == t)
       
   523         return true;
       
   524     errorColumn = symbol().column;
       
   525     Q_ASSERT(false);
       
   526     return false;
       
   527 }
       
   528 
       
   529 bool RE2NFA::test(Token t)
       
   530 {
       
   531     if (index >= symbols.count())
       
   532         return false;
       
   533     if (symbols.at(index).token == t) {
       
   534         ++index;
       
   535         return true;
       
   536     }
       
   537     return false;
       
   538 }
       
   539 
       
   540 QString RE2NFA::lexemUntil(Token t)
       
   541 {
       
   542     QString lexem;
       
   543     while (hasNext() && next() != t)
       
   544         lexem += symbol().lexem;
       
   545     return lexem;
       
   546 }
       
   547