diff -r 000000000000 -r 1918ee327afb tools/qdoc3/tokenizer.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/qdoc3/tokenizer.cpp Mon Jan 11 14:00:40 2010 +0000 @@ -0,0 +1,753 @@ +/**************************************************************************** +** +** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). +** All rights reserved. +** Contact: Nokia Corporation (qt-info@nokia.com) +** +** This file is part of the tools applications of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the Technology Preview License Agreement accompanying +** this package. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain additional +** rights. These rights are described in the Nokia Qt LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. +** +** If you have questions regarding the use of this file, please contact +** Nokia at qt-info@nokia.com. +** +** +** +** +** +** +** +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include "config.h" +#include "tokenizer.h" + +#include +#include +#include +#include +#include + +#include +#include + +QT_BEGIN_NAMESPACE + +#define LANGUAGE_CPP "Cpp" + +/* qmake ignore Q_OBJECT */ + +/* + Keep in sync with tokenizer.h. +*/ +static const char *kwords[] = { + "char", "class", "const", "double", "enum", "explicit", + "friend", "inline", "int", "long", "namespace", "operator", + "private", "protected", "public", "short", "signals", "signed", + "slots", "static", "struct", "template", "typedef", "typename", + "union", "unsigned", "using", "virtual", "void", "volatile", + "__int64", "Q_OBJECT", "Q_OVERRIDE", "Q_PROPERTY", + "Q_DECLARE_SEQUENTIAL_ITERATOR", + "Q_DECLARE_MUTABLE_SEQUENTIAL_ITERATOR", + "Q_DECLARE_ASSOCIATIVE_ITERATOR", + "Q_DECLARE_MUTABLE_ASSOCIATIVE_ITERATOR", + "Q_DECLARE_FLAGS", + "Q_SIGNALS", + "Q_SLOTS", + "QT_COMPAT", + "QT_COMPAT_CONSTRUCTOR", + "QT_DEPRECATED", + "QT_MOC_COMPAT", + "QT_MODULE", + "QT3_SUPPORT", + "QT3_SUPPORT_CONSTRUCTOR", + "QT3_MOC_SUPPORT", + "QDOC_PROPERTY" +}; + +static const int KwordHashTableSize = 4096; +static int kwordHashTable[KwordHashTableSize]; + +static QHash *ignoredTokensAndDirectives = 0; + +static QRegExp *comment = 0; +static QRegExp *versionX = 0; +static QRegExp *definedX = 0; + +static QRegExp *defines = 0; +static QRegExp *falsehoods = 0; + +/* + This function is a perfect hash function for the 37 keywords of C99 + (with a hash table size of 512). It should perform well on our + Qt-enhanced C++ subset. +*/ +static int hashKword(const char *s, int len) +{ + return (((uchar) s[0]) + (((uchar) s[2]) << 5) + + (((uchar) s[len - 1]) << 3)) % KwordHashTableSize; +} + +static void insertKwordIntoHash(const char *s, int number) +{ + int k = hashKword(s, strlen(s)); + while (kwordHashTable[k]) { + if (++k == KwordHashTableSize) + k = 0; + } + kwordHashTable[k] = number; +} + +Tokenizer::Tokenizer(const Location& loc, FILE *in) +{ + init(); + QFile file; + file.open(in, QIODevice::ReadOnly); + yyIn = file.readAll(); + file.close(); + yyPos = 0; + start(loc); +} + +Tokenizer::Tokenizer(const Location& loc, const QByteArray &in) + : yyIn(in) +{ + init(); + yyPos = 0; + start(loc); +} + +Tokenizer::~Tokenizer() +{ + delete[] yyLexBuf1; + delete[] yyLexBuf2; +} + +int Tokenizer::getToken() +{ + char *t = yyPrevLex; + yyPrevLex = yyLex; + yyLex = t; + + while (yyCh != EOF) { + yyTokLoc = yyCurLoc; + yyLexLen = 0; + + if (isspace(yyCh)) { + do { + yyCh = getChar(); + } while (isspace(yyCh)); + } + else if (isalpha(yyCh) || yyCh == '_') { + do { + yyCh = getChar(); + } while (isalnum(yyCh) || yyCh == '_'); + + int k = hashKword(yyLex, yyLexLen); + for (;;) { + int i = kwordHashTable[k]; + if (i == 0) { + return Tok_Ident; + } + else if (i == -1) { + if (!parsingMacro && ignoredTokensAndDirectives->contains(yyLex)) { + if (ignoredTokensAndDirectives->value(yyLex)) { // it's a directive + int parenDepth = 0; + while (yyCh != EOF && (yyCh != ')' || parenDepth > 1)) { + if (yyCh == '(') + ++parenDepth; + else if (yyCh == ')') + --parenDepth; + yyCh = getChar(); + } + if (yyCh == ')') + yyCh = getChar(); + } + break; + } + } + else if (strcmp(yyLex, kwords[i - 1]) == 0) { + int ret = (int) Tok_FirstKeyword + i - 1; + if (ret != Tok_explicit && ret != Tok_inline && ret != Tok_typename) + return ret; + break; + } + + if (++k == KwordHashTableSize) + k = 0; + } + } + else if (isdigit(yyCh)) { + do { + yyCh = getChar(); + } while (isalnum(yyCh) || yyCh == '.' || yyCh == '+' || + yyCh == '-'); + return Tok_Number; + } + else { + switch (yyCh) { + case '!': + case '%': + yyCh = getChar(); + if (yyCh == '=') + yyCh = getChar(); + return Tok_SomeOperator; + case '"': + yyCh = getChar(); + + while (yyCh != EOF && yyCh != '"') { + if (yyCh == '\\') + yyCh = getChar(); + yyCh = getChar(); + } + yyCh = getChar(); + + if (yyCh == EOF) + yyTokLoc.warning(tr("Unterminated C++ string literal"), + tr("Maybe you forgot '/*!' at the beginning of the file?")); + else + return Tok_String; + break; + case '#': + return getTokenAfterPreprocessor(); + case '&': + yyCh = getChar(); + if (yyCh == '&' || yyCh == '=') { + yyCh = getChar(); + return Tok_SomeOperator; + } + else { + return Tok_Ampersand; + } + case '\'': + yyCh = getChar(); + if (yyCh == '\\') + yyCh = getChar(); + do { + yyCh = getChar(); + } while (yyCh != EOF && yyCh != '\''); + + if (yyCh == EOF) { + yyTokLoc.warning(tr("Unterminated C++ character" + " literal")); + } + else { + yyCh = getChar(); + return Tok_Number; + } + break; + case '(': + yyCh = getChar(); + if (yyNumPreprocessorSkipping == 0) + yyParenDepth++; + if (isspace(yyCh)) { + do { + yyCh = getChar(); + } while (isspace(yyCh)); + yyLexLen = 1; + yyLex[1] = '\0'; + } + if (yyCh == '*') { + yyCh = getChar(); + return Tok_LeftParenAster; + } + return Tok_LeftParen; + case ')': + yyCh = getChar(); + if (yyNumPreprocessorSkipping == 0) + yyParenDepth--; + return Tok_RightParen; + case '*': + yyCh = getChar(); + if (yyCh == '=') { + yyCh = getChar(); + return Tok_SomeOperator; + } else { + return Tok_Aster; + } + case '^': + yyCh = getChar(); + if (yyCh == '=') { + yyCh = getChar(); + return Tok_SomeOperator; + } else { + return Tok_Caret; + } + case '+': + yyCh = getChar(); + if (yyCh == '+' || yyCh == '=') + yyCh = getChar(); + return Tok_SomeOperator; + case ',': + yyCh = getChar(); + return Tok_Comma; + case '-': + yyCh = getChar(); + if (yyCh == '-' || yyCh == '=') { + yyCh = getChar(); + } else if (yyCh == '>') { + yyCh = getChar(); + if (yyCh == '*') + yyCh = getChar(); + } + return Tok_SomeOperator; + case '.': + yyCh = getChar(); + if (yyCh == '*') { + yyCh = getChar(); + } else if (yyCh == '.') { + do { + yyCh = getChar(); + } while (yyCh == '.'); + return Tok_Ellipsis; + } else if (isdigit(yyCh)) { + do { + yyCh = getChar(); + } while (isalnum(yyCh) || yyCh == '.' || yyCh == '+' || + yyCh == '-'); + return Tok_Number; + } + return Tok_SomeOperator; + case '/': + yyCh = getChar(); + if (yyCh == '/') { + do { + yyCh = getChar(); + } while (yyCh != EOF && yyCh != '\n'); + } else if (yyCh == '*') { + bool metDoc = false; // empty doc is no doc + bool metSlashAsterBang = false; + bool metAster = false; + bool metAsterSlash = false; + + yyCh = getChar(); + if (yyCh == '!') + metSlashAsterBang = true; + + while (!metAsterSlash) { + if (yyCh == EOF) { + yyTokLoc.warning(tr("Unterminated C++ comment")); + break; + } else { + if (yyCh == '*') { + metAster = true; + } else if (metAster && yyCh == '/') { + metAsterSlash = true; + } else { + metAster = false; + if (isgraph(yyCh)) + metDoc = true; + } + } + yyCh = getChar(); + } + if (metSlashAsterBang && metDoc) + return Tok_Doc; + else if (yyParenDepth > 0) + return Tok_Comment; + } else { + if (yyCh == '=') + yyCh = getChar(); + return Tok_SomeOperator; + } + break; + case ':': + yyCh = getChar(); + if (yyCh == ':') { + yyCh = getChar(); + return Tok_Gulbrandsen; + } else { + return Tok_Colon; + } + case ';': + yyCh = getChar(); + return Tok_Semicolon; + case '<': + yyCh = getChar(); + if (yyCh == '<') { + yyCh = getChar(); + if (yyCh == '=') + yyCh = getChar(); + return Tok_SomeOperator; + } else if (yyCh == '=') { + yyCh = getChar(); + return Tok_SomeOperator; + } else { + return Tok_LeftAngle; + } + case '=': + yyCh = getChar(); + if (yyCh == '=') { + yyCh = getChar(); + return Tok_SomeOperator; + } else { + return Tok_Equal; + } + case '>': + yyCh = getChar(); + if (yyCh == '>') { + yyCh = getChar(); + if (yyCh == '=') + yyCh = getChar(); + return Tok_SomeOperator; + } else if (yyCh == '=') { + yyCh = getChar(); + return Tok_SomeOperator; + } else { + return Tok_RightAngle; + } + case '?': + yyCh = getChar(); + return Tok_SomeOperator; + case '[': + yyCh = getChar(); + if (yyNumPreprocessorSkipping == 0) + yyBracketDepth++; + return Tok_LeftBracket; + case '\\': + yyCh = getChar(); + yyCh = getChar(); // skip one character + break; + case ']': + yyCh = getChar(); + if (yyNumPreprocessorSkipping == 0) + yyBracketDepth--; + return Tok_RightBracket; + case '{': + yyCh = getChar(); + if (yyNumPreprocessorSkipping == 0) + yyBraceDepth++; + return Tok_LeftBrace; + case '}': + yyCh = getChar(); + if (yyNumPreprocessorSkipping == 0) + yyBraceDepth--; + return Tok_RightBrace; + case '|': + yyCh = getChar(); + if (yyCh == '|' || yyCh == '=') + yyCh = getChar(); + return Tok_SomeOperator; + case '~': + yyCh = getChar(); + return Tok_Tilde; + case '@': + yyCh = getChar(); + return Tok_At; + default: + // ### We should really prevent qdoc from looking at snippet files rather than + // ### suppress warnings when reading them. + if (yyNumPreprocessorSkipping == 0 && !yyTokLoc.fileName().endsWith(".qdoc")) { + yyTokLoc.warning(tr("Hostile character 0x%1 in C++ source") + .arg((uchar)yyCh, 1, 16)); + } + yyCh = getChar(); + } + } + } + + if (yyPreprocessorSkipping.count() > 1) { + yyTokLoc.warning(tr("Expected #endif before end of file")); + // clear it out or we get an infinite loop! + while (!yyPreprocessorSkipping.isEmpty()) { + popSkipping(); + } + } + + strcpy(yyLex, "end-of-input"); + yyLexLen = strlen(yyLex); + return Tok_Eoi; +} + +void Tokenizer::initialize(const Config &config) +{ + QString versionSym = config.getString(CONFIG_VERSIONSYM); + + comment = new QRegExp("/(?:\\*.*\\*/|/.*\n|/[^\n]*$)"); + comment->setMinimal(true); + versionX = new QRegExp("$cannot possibly match^"); + if (!versionSym.isEmpty()) + versionX->setPattern("[ \t]*(?:" + QRegExp::escape(versionSym) + + ")[ \t]+\"([^\"]*)\"[ \t]*"); + definedX = new QRegExp("defined ?\\(?([A-Z_0-9a-z]+) ?\\)"); + + QStringList d = config.getStringList(CONFIG_DEFINES); + d += "qdoc"; + defines = new QRegExp(d.join("|")); + falsehoods = new QRegExp(config.getStringList(CONFIG_FALSEHOODS).join("|")); + + memset(kwordHashTable, 0, sizeof(kwordHashTable)); + for (int i = 0; i < Tok_LastKeyword - Tok_FirstKeyword + 1; i++) + insertKwordIntoHash(kwords[i], i + 1); + + ignoredTokensAndDirectives = new QHash; + + QStringList tokens = config.getStringList(LANGUAGE_CPP + Config::dot + CONFIG_IGNORETOKENS); + foreach (const QString &t, tokens) { + const QByteArray tb = t.toAscii(); + ignoredTokensAndDirectives->insert(tb, false); + insertKwordIntoHash(tb.data(), -1); + } + + QStringList directives = config.getStringList(LANGUAGE_CPP + Config::dot + + CONFIG_IGNOREDIRECTIVES); + foreach (const QString &d, directives) { + const QByteArray db = d.toAscii(); + ignoredTokensAndDirectives->insert(db, true); + insertKwordIntoHash(db.data(), -1); + } +} + +void Tokenizer::terminate() +{ + delete comment; + comment = 0; + delete versionX; + versionX = 0; + delete definedX; + definedX = 0; + delete defines; + defines = 0; + delete falsehoods; + falsehoods = 0; + delete ignoredTokensAndDirectives; + ignoredTokensAndDirectives = 0; +} + +void Tokenizer::init() +{ + yyLexBuf1 = new char[(int) yyLexBufSize]; + yyLexBuf2 = new char[(int) yyLexBufSize]; + yyPrevLex = yyLexBuf1; + yyPrevLex[0] = '\0'; + yyLex = yyLexBuf2; + yyLex[0] = '\0'; + yyLexLen = 0; + yyPreprocessorSkipping.push(false); + yyNumPreprocessorSkipping = 0; + yyBraceDepth = 0; + yyParenDepth = 0; + yyBracketDepth = 0; + yyCh = '\0'; + parsingMacro = false; +} + +void Tokenizer::start(const Location& loc) +{ + yyTokLoc = loc; + yyCurLoc = loc; + yyCurLoc.start(); + strcpy(yyPrevLex, "beginning-of-input"); + strcpy(yyLex, "beginning-of-input"); + yyLexLen = strlen(yyLex); + yyBraceDepth = 0; + yyParenDepth = 0; + yyBracketDepth = 0; + yyCh = '\0'; + yyCh = getChar(); +} + +/* + Returns the next token, if # was met. This function interprets the + preprocessor directive, skips over any #ifdef'd out tokens, and returns the + token after all of that. +*/ +int Tokenizer::getTokenAfterPreprocessor() +{ + yyCh = getChar(); + while (isspace(yyCh) && yyCh != '\n') + yyCh = getChar(); + + /* + #directive condition + */ + QString directive; + QString condition; + + while (isalpha(yyCh)) { + directive += QChar(yyCh); + yyCh = getChar(); + } + if (!directive.isEmpty()) { + while (yyCh != EOF && yyCh != '\n') { + if (yyCh == '\\') + yyCh = getChar(); + condition += yyCh; + yyCh = getChar(); + } + condition.replace(*comment, ""); + condition = condition.simplified(); + + /* + The #if, #ifdef, #ifndef, #elif, #else, and #endif + directives have an effect on the skipping stack. For + instance, if the code processed so far is + + #if 1 + #if 0 + #if 1 + // ... + #else + + the skipping stack contains, from bottom to top, false true + true (assuming 0 is false and 1 is true). If at least one + entry of the stack is true, the tokens are skipped. + + This mechanism is simple yet hard to understand. + */ + if (directive[0] == QChar('i')) { + if (directive == QString("if")) + pushSkipping(!isTrue(condition)); + else if (directive == QString("ifdef")) + pushSkipping(!defines->exactMatch(condition)); + else if (directive == QString("ifndef")) + pushSkipping(defines->exactMatch(condition)); + } else if (directive[0] == QChar('e')) { + if (directive == QString("elif")) { + bool old = popSkipping(); + if (old) + pushSkipping(!isTrue(condition)); + else + pushSkipping(true); + } else if (directive == QString("else")) { + pushSkipping(!popSkipping()); + } else if (directive == QString("endif")) { + popSkipping(); + } + } else if (directive == QString("define")) { + if (versionX->exactMatch(condition)) + yyVersion = versionX->cap(1); + } + } + + int tok; + do { + /* + We set yyLex now, and after getToken() this will be + yyPrevLex. This way, we skip over the preprocessor + directive. + */ + qstrcpy(yyLex, yyPrevLex); + + /* + If getToken() meets another #, it will call + getTokenAfterPreprocessor() once again, which could in turn + call getToken() again, etc. Unless there are 10,000 or so + preprocessor directives in a row, this shouldn't overflow + the stack. + */ + tok = getToken(); + } while (yyNumPreprocessorSkipping > 0); + return tok; +} + +/* + Pushes a new skipping value onto the stack. This corresponds to entering a + new #if block. +*/ +void Tokenizer::pushSkipping(bool skip) +{ + yyPreprocessorSkipping.push(skip); + if (skip) + yyNumPreprocessorSkipping++; +} + +/* + Pops a skipping value from the stack. This corresponds to reaching a #endif. +*/ +bool Tokenizer::popSkipping() +{ + if (yyPreprocessorSkipping.isEmpty()) { + yyTokLoc.warning(tr("Unexpected #elif, #else or #endif")); + return true; + } + + bool skip = yyPreprocessorSkipping.pop(); + if (skip) + yyNumPreprocessorSkipping--; + return skip; +} + +/* + Returns true if the condition evaluates as true, otherwise false. The + condition is represented by a string. Unsophisticated parsing techniques are + used. The preprocessing method could be named StriNg-Oriented PreProcessing, + as SNOBOL stands for StriNg-Oriented symBOlic Language. +*/ +bool Tokenizer::isTrue(const QString &condition) +{ + int firstOr = -1; + int firstAnd = -1; + int parenDepth = 0; + + /* + Find the first logical operator at top level, but be careful + about precedence. Examples: + + X || Y // the or + X || Y || Z // the leftmost or + X || Y && Z // the or + X && Y || Z // the or + (X || Y) && Z // the and + */ + for (int i = 0; i < (int) condition.length() - 1; i++) { + QChar ch = condition[i]; + if (ch == QChar('(')) { + parenDepth++; + } else if (ch == QChar(')')) { + parenDepth--; + } else if (parenDepth == 0) { + if (condition[i + 1] == ch) { + if (ch == QChar('|')) { + firstOr = i; + break; + } else if (ch == QChar('&')) { + if (firstAnd == -1) + firstAnd = i; + } + } + } + } + if (firstOr != -1) + return isTrue(condition.left(firstOr)) || + isTrue(condition.mid(firstOr + 2)); + if (firstAnd != -1) + return isTrue(condition.left(firstAnd)) && + isTrue(condition.mid(firstAnd + 2)); + + QString t = condition.simplified(); + if (t.isEmpty()) + return true; + + if (t[0] == QChar('!')) + return !isTrue(t.mid(1)); + if (t[0] == QChar('(') && t.right(1)[0] == QChar(')')) + return isTrue(t.mid(1, t.length() - 2)); + + if (definedX->exactMatch(t)) + return defines->exactMatch(definedX->cap(1)); + else + return !falsehoods->exactMatch(t); +} + +QT_END_NAMESPACE