tools/qdoc3/tokenizer.cpp
changeset 0 1918ee327afb
child 4 3b1da2848fc7
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/qdoc3/tokenizer.cpp	Mon Jan 11 14:00:40 2010 +0000
@@ -0,0 +1,753 @@
+/****************************************************************************
+**
+** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
+** All rights reserved.
+** Contact: Nokia Corporation (qt-info@nokia.com)
+**
+** This file is part of the tools applications of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** No Commercial Usage
+** This file contains pre-release code and may not be distributed.
+** You may use this file in accordance with the terms and conditions
+** contained in the Technology Preview License Agreement accompanying
+** this package.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file.  Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Nokia gives you certain additional
+** rights.  These rights are described in the Nokia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** If you have questions regarding the use of this file, please contact
+** Nokia at qt-info@nokia.com.
+**
+**
+**
+**
+**
+**
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#include "config.h"
+#include "tokenizer.h"
+
+#include <qdebug.h>
+#include <qfile.h>
+#include <qhash.h>
+#include <qregexp.h>
+#include <qstring.h>
+
+#include <ctype.h>
+#include <string.h>
+
+QT_BEGIN_NAMESPACE
+
+#define LANGUAGE_CPP                        "Cpp"
+
+/* qmake ignore Q_OBJECT */
+
+/*
+  Keep in sync with tokenizer.h.
+*/
+static const char *kwords[] = {
+    "char", "class", "const", "double", "enum", "explicit",
+    "friend", "inline", "int", "long", "namespace", "operator",
+    "private", "protected", "public", "short", "signals", "signed",
+    "slots", "static", "struct", "template", "typedef", "typename",
+    "union", "unsigned", "using", "virtual", "void", "volatile",
+    "__int64", "Q_OBJECT", "Q_OVERRIDE", "Q_PROPERTY",
+    "Q_DECLARE_SEQUENTIAL_ITERATOR",
+    "Q_DECLARE_MUTABLE_SEQUENTIAL_ITERATOR",
+    "Q_DECLARE_ASSOCIATIVE_ITERATOR",
+    "Q_DECLARE_MUTABLE_ASSOCIATIVE_ITERATOR",
+    "Q_DECLARE_FLAGS",
+    "Q_SIGNALS",
+    "Q_SLOTS",
+    "QT_COMPAT",
+    "QT_COMPAT_CONSTRUCTOR",
+    "QT_DEPRECATED",
+    "QT_MOC_COMPAT",
+    "QT_MODULE",
+    "QT3_SUPPORT",
+    "QT3_SUPPORT_CONSTRUCTOR",
+    "QT3_MOC_SUPPORT",
+    "QDOC_PROPERTY"
+};
+
+static const int KwordHashTableSize = 4096;
+static int kwordHashTable[KwordHashTableSize];
+
+static QHash<QByteArray, bool> *ignoredTokensAndDirectives = 0;
+
+static QRegExp *comment = 0;
+static QRegExp *versionX = 0;
+static QRegExp *definedX = 0;
+
+static QRegExp *defines = 0;
+static QRegExp *falsehoods = 0;
+
+/*
+  This function is a perfect hash function for the 37 keywords of C99
+  (with a hash table size of 512). It should perform well on our
+  Qt-enhanced C++ subset.
+*/
+static int hashKword(const char *s, int len)
+{
+    return (((uchar) s[0]) + (((uchar) s[2]) << 5) +
+             (((uchar) s[len - 1]) << 3)) % KwordHashTableSize;
+}
+
+static void insertKwordIntoHash(const char *s, int number)
+{
+    int k = hashKword(s, strlen(s));
+    while (kwordHashTable[k]) {
+        if (++k == KwordHashTableSize)
+            k = 0;
+    }
+    kwordHashTable[k] = number;
+}
+
+Tokenizer::Tokenizer(const Location& loc, FILE *in)
+{
+    init();
+    QFile file;
+    file.open(in, QIODevice::ReadOnly);
+    yyIn = file.readAll();
+    file.close();
+    yyPos = 0;
+    start(loc);
+}
+
+Tokenizer::Tokenizer(const Location& loc, const QByteArray &in)
+  : yyIn(in)
+{
+    init();
+    yyPos = 0;
+    start(loc);
+}
+
+Tokenizer::~Tokenizer()
+{
+    delete[] yyLexBuf1;
+    delete[] yyLexBuf2;
+}
+
+int Tokenizer::getToken()
+{
+    char *t = yyPrevLex;
+    yyPrevLex = yyLex;
+    yyLex = t;
+
+    while (yyCh != EOF) {
+        yyTokLoc = yyCurLoc;
+        yyLexLen = 0;
+
+        if (isspace(yyCh)) {
+            do {
+                yyCh = getChar();
+            } while (isspace(yyCh));
+        }
+        else if (isalpha(yyCh) || yyCh == '_') {
+            do {
+                yyCh = getChar();
+            } while (isalnum(yyCh) || yyCh == '_');
+
+            int k = hashKword(yyLex, yyLexLen);
+            for (;;) {
+                int i = kwordHashTable[k];
+                if (i == 0) {
+                    return Tok_Ident;
+                }
+                else if (i == -1) {
+                    if (!parsingMacro && ignoredTokensAndDirectives->contains(yyLex)) {
+                        if (ignoredTokensAndDirectives->value(yyLex)) { // it's a directive
+                            int parenDepth = 0;
+                            while (yyCh != EOF && (yyCh != ')' || parenDepth > 1)) {
+                                if (yyCh == '(')
+                                    ++parenDepth;
+                                else if (yyCh == ')')
+                                    --parenDepth;
+                                yyCh = getChar();
+                            }
+                            if (yyCh == ')')
+                                yyCh = getChar();
+                        }
+                        break;
+                    }
+                }
+                else if (strcmp(yyLex, kwords[i - 1]) == 0) {
+                    int ret = (int) Tok_FirstKeyword + i - 1;
+                    if (ret != Tok_explicit && ret != Tok_inline && ret != Tok_typename)
+                        return ret;
+                    break;
+                }
+
+                if (++k == KwordHashTableSize)
+                    k = 0;
+            }
+        }
+        else if (isdigit(yyCh)) {
+            do {
+                yyCh = getChar();
+            } while (isalnum(yyCh) || yyCh == '.' || yyCh == '+' ||
+                      yyCh == '-');
+            return Tok_Number;
+        }
+        else {
+            switch (yyCh) {
+            case '!':
+            case '%':
+                yyCh = getChar();
+                if (yyCh == '=')
+                    yyCh = getChar();
+                return Tok_SomeOperator;
+            case '"':
+                yyCh = getChar();
+
+                while (yyCh != EOF && yyCh != '"') {
+                    if (yyCh == '\\')
+                        yyCh = getChar();
+                    yyCh = getChar();
+                }
+                yyCh = getChar();
+
+                if (yyCh == EOF)
+                    yyTokLoc.warning(tr("Unterminated C++ string literal"),
+                                     tr("Maybe you forgot '/*!' at the beginning of the file?"));
+                else
+                    return Tok_String;
+                break;
+            case '#':
+                return getTokenAfterPreprocessor();
+            case '&':
+                yyCh = getChar();
+                if (yyCh == '&' || yyCh == '=') {
+                    yyCh = getChar();
+                    return Tok_SomeOperator;
+                }
+                else {
+                    return Tok_Ampersand;
+                }
+            case '\'':
+                yyCh = getChar();
+                if (yyCh == '\\')
+                    yyCh = getChar();
+                do {
+                    yyCh = getChar();
+                } while (yyCh != EOF && yyCh != '\'');
+
+                if (yyCh == EOF) {
+                    yyTokLoc.warning(tr("Unterminated C++ character"
+                                         " literal"));
+                }
+                else {
+                    yyCh = getChar();
+                    return Tok_Number;
+                }
+                break;
+            case '(':
+                yyCh = getChar();
+                if (yyNumPreprocessorSkipping == 0)
+                    yyParenDepth++;
+                if (isspace(yyCh)) {
+                    do {
+                        yyCh = getChar();
+                    } while (isspace(yyCh));
+                    yyLexLen = 1;
+                    yyLex[1] = '\0';
+                }
+                if (yyCh == '*') {
+                    yyCh = getChar();
+                    return Tok_LeftParenAster;
+                }
+                return Tok_LeftParen;
+            case ')':
+                yyCh = getChar();
+                if (yyNumPreprocessorSkipping == 0)
+                    yyParenDepth--;
+                return Tok_RightParen;
+            case '*':
+                yyCh = getChar();
+                if (yyCh == '=') {
+                    yyCh = getChar();
+                    return Tok_SomeOperator;
+                } else {
+                    return Tok_Aster;
+                }
+            case '^':
+                yyCh = getChar();
+                if (yyCh == '=') {
+                    yyCh = getChar();
+                    return Tok_SomeOperator;
+                } else {
+                    return Tok_Caret;
+                }
+            case '+':
+                yyCh = getChar();
+                if (yyCh == '+' || yyCh == '=')
+                    yyCh = getChar();
+                return Tok_SomeOperator;
+            case ',':
+                yyCh = getChar();
+                return Tok_Comma;
+            case '-':
+                yyCh = getChar();
+                if (yyCh == '-' || yyCh == '=') {
+                    yyCh = getChar();
+                } else if (yyCh == '>') {
+                    yyCh = getChar();
+                    if (yyCh == '*')
+                        yyCh = getChar();
+                }
+                return Tok_SomeOperator;
+            case '.':
+                yyCh = getChar();
+                if (yyCh == '*') {
+                    yyCh = getChar();
+                } else if (yyCh == '.') {
+                    do {
+                        yyCh = getChar();
+                    } while (yyCh == '.');
+                    return Tok_Ellipsis;
+                } else if (isdigit(yyCh)) {
+                    do {
+                        yyCh = getChar();
+                    } while (isalnum(yyCh) || yyCh == '.' || yyCh == '+' ||
+                              yyCh == '-');
+                    return Tok_Number;
+                }
+                return Tok_SomeOperator;
+            case '/':
+                yyCh = getChar();
+                if (yyCh == '/') {
+                    do {
+                        yyCh = getChar();
+                    } while (yyCh != EOF && yyCh != '\n');
+                } else if (yyCh == '*') {
+                    bool metDoc = false; // empty doc is no doc
+                    bool metSlashAsterBang = false;
+                    bool metAster = false;
+                    bool metAsterSlash = false;
+
+                    yyCh = getChar();
+                    if (yyCh == '!')
+                        metSlashAsterBang = true;
+
+                    while (!metAsterSlash) {
+                        if (yyCh == EOF) {
+                            yyTokLoc.warning(tr("Unterminated C++ comment"));
+                            break;
+                        } else {
+                            if (yyCh == '*') {
+                                metAster = true;
+                            } else if (metAster && yyCh == '/') {
+                                metAsterSlash = true;
+                            } else {
+                                metAster = false;
+                                if (isgraph(yyCh))
+                                    metDoc = true;
+                            }
+                        }
+                        yyCh = getChar();
+                    }
+                    if (metSlashAsterBang && metDoc)
+                        return Tok_Doc;
+                    else if (yyParenDepth > 0)
+                        return Tok_Comment;
+                } else {
+                    if (yyCh == '=')
+                        yyCh = getChar();
+                    return Tok_SomeOperator;
+                }
+                break;
+            case ':':
+                yyCh = getChar();
+                if (yyCh == ':') {
+                    yyCh = getChar();
+                    return Tok_Gulbrandsen;
+                } else {
+                    return Tok_Colon;
+                }
+            case ';':
+                yyCh = getChar();
+                return Tok_Semicolon;
+            case '<':
+                yyCh = getChar();
+                if (yyCh == '<') {
+                    yyCh = getChar();
+                    if (yyCh == '=')
+                        yyCh = getChar();
+                    return Tok_SomeOperator;
+                } else if (yyCh == '=') {
+                    yyCh = getChar();
+                    return Tok_SomeOperator;
+                } else {
+                    return Tok_LeftAngle;
+                }
+            case '=':
+                yyCh = getChar();
+                if (yyCh == '=') {
+                    yyCh = getChar();
+                    return Tok_SomeOperator;
+                } else {
+                    return Tok_Equal;
+                }
+            case '>':
+                yyCh = getChar();
+                if (yyCh == '>') {
+                    yyCh = getChar();
+                    if (yyCh == '=')
+                        yyCh = getChar();
+                    return Tok_SomeOperator;
+                } else if (yyCh == '=') {
+                    yyCh = getChar();
+                    return Tok_SomeOperator;
+                } else {
+                    return Tok_RightAngle;
+                }
+            case '?':
+                yyCh = getChar();
+                return Tok_SomeOperator;
+            case '[':
+                yyCh = getChar();
+                if (yyNumPreprocessorSkipping == 0)
+                    yyBracketDepth++;
+                return Tok_LeftBracket;
+            case '\\':
+                yyCh = getChar();
+                yyCh = getChar(); // skip one character
+                break;
+            case ']':
+                yyCh = getChar();
+                if (yyNumPreprocessorSkipping == 0)
+                    yyBracketDepth--;
+                return Tok_RightBracket;
+            case '{':
+                yyCh = getChar();
+                if (yyNumPreprocessorSkipping == 0)
+                    yyBraceDepth++;
+                return Tok_LeftBrace;
+            case '}':
+                yyCh = getChar();
+                if (yyNumPreprocessorSkipping == 0)
+                    yyBraceDepth--;
+                return Tok_RightBrace;
+            case '|':
+                yyCh = getChar();
+                if (yyCh == '|' || yyCh == '=')
+                    yyCh = getChar();
+                return Tok_SomeOperator;
+            case '~':
+                yyCh = getChar();
+                return Tok_Tilde;
+            case '@':
+                yyCh = getChar();
+                return Tok_At;
+            default:
+                // ### We should really prevent qdoc from looking at snippet files rather than
+                // ### suppress warnings when reading them.
+                if (yyNumPreprocessorSkipping == 0 && !yyTokLoc.fileName().endsWith(".qdoc")) {
+                    yyTokLoc.warning(tr("Hostile character 0x%1 in C++ source")
+                                      .arg((uchar)yyCh, 1, 16));
+                }
+                yyCh = getChar();
+            }
+        }
+    }
+
+    if (yyPreprocessorSkipping.count() > 1) {
+        yyTokLoc.warning(tr("Expected #endif before end of file"));
+        // clear it out or we get an infinite loop!
+        while (!yyPreprocessorSkipping.isEmpty()) {
+            popSkipping();
+        }
+    }
+
+    strcpy(yyLex, "end-of-input");
+    yyLexLen = strlen(yyLex);
+    return Tok_Eoi;
+}
+
+void Tokenizer::initialize(const Config &config)
+{
+    QString versionSym = config.getString(CONFIG_VERSIONSYM);
+
+    comment = new QRegExp("/(?:\\*.*\\*/|/.*\n|/[^\n]*$)");
+    comment->setMinimal(true);
+    versionX = new QRegExp("$cannot possibly match^");
+    if (!versionSym.isEmpty())
+        versionX->setPattern("[ \t]*(?:" + QRegExp::escape(versionSym)
+                             + ")[ \t]+\"([^\"]*)\"[ \t]*");
+    definedX = new QRegExp("defined ?\\(?([A-Z_0-9a-z]+) ?\\)");
+
+    QStringList d = config.getStringList(CONFIG_DEFINES);
+    d += "qdoc";
+    defines = new QRegExp(d.join("|"));
+    falsehoods = new QRegExp(config.getStringList(CONFIG_FALSEHOODS).join("|"));
+
+    memset(kwordHashTable, 0, sizeof(kwordHashTable));
+    for (int i = 0; i < Tok_LastKeyword - Tok_FirstKeyword + 1; i++)
+        insertKwordIntoHash(kwords[i], i + 1);
+
+    ignoredTokensAndDirectives = new QHash<QByteArray, bool>;
+
+    QStringList tokens = config.getStringList(LANGUAGE_CPP + Config::dot + CONFIG_IGNORETOKENS);
+    foreach (const QString &t, tokens) {
+        const QByteArray tb = t.toAscii();
+        ignoredTokensAndDirectives->insert(tb, false);
+        insertKwordIntoHash(tb.data(), -1);
+    }
+
+    QStringList directives = config.getStringList(LANGUAGE_CPP + Config::dot
+                                                  + CONFIG_IGNOREDIRECTIVES);
+    foreach (const QString &d, directives) {
+        const QByteArray db = d.toAscii();
+        ignoredTokensAndDirectives->insert(db, true);
+        insertKwordIntoHash(db.data(), -1);
+    }
+}
+
+void Tokenizer::terminate()
+{
+    delete comment;
+    comment = 0;
+    delete versionX;
+    versionX = 0;
+    delete definedX;
+    definedX = 0;
+    delete defines;
+    defines = 0;
+    delete falsehoods;
+    falsehoods = 0;
+    delete ignoredTokensAndDirectives;
+    ignoredTokensAndDirectives = 0;
+}
+
+void Tokenizer::init()
+{
+    yyLexBuf1 = new char[(int) yyLexBufSize];
+    yyLexBuf2 = new char[(int) yyLexBufSize];
+    yyPrevLex = yyLexBuf1;
+    yyPrevLex[0] = '\0';
+    yyLex = yyLexBuf2;
+    yyLex[0] = '\0';
+    yyLexLen = 0;
+    yyPreprocessorSkipping.push(false);
+    yyNumPreprocessorSkipping = 0;
+    yyBraceDepth = 0;
+    yyParenDepth = 0;
+    yyBracketDepth = 0;
+    yyCh = '\0';
+    parsingMacro = false;
+}
+
+void Tokenizer::start(const Location& loc)
+{
+    yyTokLoc = loc;
+    yyCurLoc = loc;
+    yyCurLoc.start();
+    strcpy(yyPrevLex, "beginning-of-input");
+    strcpy(yyLex, "beginning-of-input");
+    yyLexLen = strlen(yyLex);
+    yyBraceDepth = 0;
+    yyParenDepth = 0;
+    yyBracketDepth = 0;
+    yyCh = '\0';
+    yyCh = getChar();
+}
+
+/*
+  Returns the next token, if # was met.  This function interprets the
+  preprocessor directive, skips over any #ifdef'd out tokens, and returns the
+  token after all of that.
+*/
+int Tokenizer::getTokenAfterPreprocessor()
+{
+    yyCh = getChar();
+    while (isspace(yyCh) && yyCh != '\n')
+        yyCh = getChar();
+
+    /*
+      #directive condition
+    */
+    QString directive;
+    QString condition;
+
+    while (isalpha(yyCh)) {
+        directive += QChar(yyCh);
+        yyCh = getChar();
+    }
+    if (!directive.isEmpty()) {
+        while (yyCh != EOF && yyCh != '\n') {
+            if (yyCh == '\\')
+                yyCh = getChar();
+            condition += yyCh;
+            yyCh = getChar();
+        }
+        condition.replace(*comment, "");
+        condition = condition.simplified();
+
+        /*
+          The #if, #ifdef, #ifndef, #elif, #else, and #endif
+          directives have an effect on the skipping stack.  For
+          instance, if the code processed so far is
+
+              #if 1
+              #if 0
+              #if 1
+              // ...
+              #else
+
+          the skipping stack contains, from bottom to top, false true
+          true (assuming 0 is false and 1 is true).  If at least one
+          entry of the stack is true, the tokens are skipped.
+
+          This mechanism is simple yet hard to understand.
+        */
+        if (directive[0] == QChar('i')) {
+            if (directive == QString("if"))
+                pushSkipping(!isTrue(condition));
+            else if (directive == QString("ifdef"))
+                pushSkipping(!defines->exactMatch(condition));
+            else if (directive == QString("ifndef"))
+                pushSkipping(defines->exactMatch(condition));
+        } else if (directive[0] == QChar('e')) {
+            if (directive == QString("elif")) {
+                bool old = popSkipping();
+                if (old)
+                    pushSkipping(!isTrue(condition));
+                else
+                    pushSkipping(true);
+            } else if (directive == QString("else")) {
+                pushSkipping(!popSkipping());
+            } else if (directive == QString("endif")) {
+                popSkipping();
+            }
+        } else if (directive == QString("define")) {
+            if (versionX->exactMatch(condition))
+                yyVersion = versionX->cap(1);
+        }
+    }
+
+    int tok;
+    do {
+        /*
+          We set yyLex now, and after getToken() this will be
+          yyPrevLex. This way, we skip over the preprocessor
+          directive.
+        */
+        qstrcpy(yyLex, yyPrevLex);
+
+        /*
+          If getToken() meets another #, it will call
+          getTokenAfterPreprocessor() once again, which could in turn
+          call getToken() again, etc. Unless there are 10,000 or so
+          preprocessor directives in a row, this shouldn't overflow
+          the stack.
+        */
+        tok = getToken();
+    } while (yyNumPreprocessorSkipping > 0);
+    return tok;
+}
+
+/*
+  Pushes a new skipping value onto the stack.  This corresponds to entering a
+  new #if block.
+*/
+void Tokenizer::pushSkipping(bool skip)
+{
+    yyPreprocessorSkipping.push(skip);
+    if (skip)
+        yyNumPreprocessorSkipping++;
+}
+
+/*
+  Pops a skipping value from the stack.  This corresponds to reaching a #endif.
+*/
+bool Tokenizer::popSkipping()
+{
+    if (yyPreprocessorSkipping.isEmpty()) {
+        yyTokLoc.warning(tr("Unexpected #elif, #else or #endif"));
+        return true;
+    }
+
+    bool skip = yyPreprocessorSkipping.pop();
+    if (skip)
+        yyNumPreprocessorSkipping--;
+    return skip;
+}
+
+/*
+  Returns true if the condition evaluates as true, otherwise false.  The
+  condition is represented by a string.  Unsophisticated parsing techniques are
+  used.  The preprocessing method could be named StriNg-Oriented PreProcessing,
+  as SNOBOL stands for StriNg-Oriented symBOlic Language.
+*/
+bool Tokenizer::isTrue(const QString &condition)
+{
+    int firstOr = -1;
+    int firstAnd = -1;
+    int parenDepth = 0;
+
+    /*
+      Find the first logical operator at top level, but be careful
+      about precedence. Examples:
+
+          X || Y          // the or
+          X || Y || Z     // the leftmost or
+          X || Y && Z     // the or
+          X && Y || Z     // the or
+          (X || Y) && Z   // the and
+    */
+    for (int i = 0; i < (int) condition.length() - 1; i++) {
+        QChar ch = condition[i];
+        if (ch == QChar('(')) {
+            parenDepth++;
+        } else if (ch == QChar(')')) {
+            parenDepth--;
+        } else if (parenDepth == 0) {
+            if (condition[i + 1] == ch) {
+                if (ch == QChar('|')) {
+                    firstOr = i;
+                    break;
+                } else if (ch == QChar('&')) {
+                    if (firstAnd == -1)
+                        firstAnd = i;
+                }
+            }
+        }
+    }
+    if (firstOr != -1)
+        return isTrue(condition.left(firstOr)) ||
+               isTrue(condition.mid(firstOr + 2));
+    if (firstAnd != -1)
+        return isTrue(condition.left(firstAnd)) &&
+               isTrue(condition.mid(firstAnd + 2));
+
+    QString t = condition.simplified();
+    if (t.isEmpty())
+        return true;
+
+    if (t[0] == QChar('!'))
+        return !isTrue(t.mid(1));
+    if (t[0] == QChar('(') && t.right(1)[0] == QChar(')'))
+        return isTrue(t.mid(1, t.length() - 2));
+
+    if (definedX->exactMatch(t))
+        return defines->exactMatch(definedX->cap(1));
+    else
+        return !falsehoods->exactMatch(t);
+}
+
+QT_END_NAMESPACE