src/xmlpatterns/parser/qxquerytokenizer_p.h
changeset 0 1918ee327afb
child 4 3b1da2848fc7
equal deleted inserted replaced
-1:000000000000 0:1918ee327afb
       
     1 /****************************************************************************
       
     2 **
       
     3 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
       
     4 ** All rights reserved.
       
     5 ** Contact: Nokia Corporation (qt-info@nokia.com)
       
     6 **
       
     7 ** This file is part of the QtXmlPatterns module of the Qt Toolkit.
       
     8 **
       
     9 ** $QT_BEGIN_LICENSE:LGPL$
       
    10 ** No Commercial Usage
       
    11 ** This file contains pre-release code and may not be distributed.
       
    12 ** You may use this file in accordance with the terms and conditions
       
    13 ** contained in the Technology Preview License Agreement accompanying
       
    14 ** this package.
       
    15 **
       
    16 ** GNU Lesser General Public License Usage
       
    17 ** Alternatively, this file may be used under the terms of the GNU Lesser
       
    18 ** General Public License version 2.1 as published by the Free Software
       
    19 ** Foundation and appearing in the file LICENSE.LGPL included in the
       
    20 ** packaging of this file.  Please review the following information to
       
    21 ** ensure the GNU Lesser General Public License version 2.1 requirements
       
    22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
       
    23 **
       
    24 ** In addition, as a special exception, Nokia gives you certain additional
       
    25 ** rights.  These rights are described in the Nokia Qt LGPL Exception
       
    26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
       
    27 **
       
    28 ** If you have questions regarding the use of this file, please contact
       
    29 ** Nokia at qt-info@nokia.com.
       
    30 **
       
    31 **
       
    32 **
       
    33 **
       
    34 **
       
    35 **
       
    36 **
       
    37 **
       
    38 ** $QT_END_LICENSE$
       
    39 **
       
    40 ****************************************************************************/
       
    41 
       
    42 //
       
    43 //  W A R N I N G
       
    44 //  -------------
       
    45 //
       
    46 // This file is not part of the Qt API.  It exists purely as an
       
    47 // implementation detail.  This header file may change from version to
       
    48 // version without notice, or even be removed.
       
    49 //
       
    50 // We mean it.
       
    51 #ifndef Patternist_XQueryTokenizer_H
       
    52 #define Patternist_XQueryTokenizer_H
       
    53 
       
    54 #include <QHash>
       
    55 #include <QSet>
       
    56 #include <QStack>
       
    57 #include <QString>
       
    58 #include <QUrl>
       
    59 
       
    60 #include "qtokenizer_p.h"
       
    61 
       
    62 QT_BEGIN_HEADER
       
    63 
       
    64 QT_BEGIN_NAMESPACE
       
    65 
       
    66 namespace QPatternist
       
    67 {
       
    68     struct TokenMap;
       
    69 
       
    70     /**
       
    71      * @short A hand-written tokenizer which tokenizes XQuery 1.0 & XPath 2.0,
       
    72      * and delivers tokens to the Bison generated parser.
       
    73      *
       
    74      * @author Frans Englich <frans.englich@nokia.com>
       
    75      */
       
     76     class XQueryTokenizer : public Tokenizer
       
     77     {
       
     78     public:
       
     79         /**
       
     80          * Tokenizer states. Organized alphabetically.
       
     81          */
       
     82         enum State
       
     83         {
       
     84             AfterAxisSeparator,
       
     85             AposAttributeContent,
       
     86             Axis,
       
     87             Default,
       
     88             ElementContent,
       
     89             EndTag,
       
     90             ItemType,
       
     91             KindTest,
       
     92             KindTestForPI,
       
     93             NamespaceDecl,
       
     94             NamespaceKeyword,
       
     95             OccurrenceIndicator,
       
     96             Operator,
       
     97             Pragma,
       
     98             PragmaContent,
       
     99             ProcessingInstructionContent,
       
    100             ProcessingInstructionName,
       
    101             QuotAttributeContent,
       
    102             StartTag,
       
    103             VarName,
       
    104             XMLComment,
       
    105             XMLSpaceDecl,
       
    106             XQueryVersion
       
    107         };
       
    108 
       
    109         XQueryTokenizer(const QString &query,
       
    110                         const QUrl &location,
       
    111                         const State startingState = Default);
       
    112 
       
    113         virtual Token nextToken(YYLTYPE *const sourceLocator);
       
    114         virtual int commenceScanOnly();
       
    115         virtual void resumeTokenizationFrom(const int position);
       
    116 
       
    117         /**
       
    118          * Does nothing.
       
    119          */
       
    120         virtual void setParserContext(const ParserContext::Ptr &parseInfo);
       
    121 
       
    122     private:
       
    123 
       
    124         /**
       
    125          * Returns the character corresponding to the builtin reference @p
       
    126          * reference. For instance, passing @c gt will give you '>' in return.
       
    127          *
       
    128          * If @p reference is an invalid character reference, a null QChar is
       
    129          * returned.
       
    130          *
       
    131          * @see QChar::isNull()
       
    132          */
       
    133         QChar charForReference(const QString &reference);
       
    134 
       
    135         inline Token tokenAndChangeState(const TokenType code,
       
    136                                          const State state,
       
    137                                          const int advance = 1);
       
    138         inline Token tokenAndChangeState(const TokenType code,
       
    139                                          const QString &value,
       
    140                                          const State state);
       
    141         inline Token tokenAndAdvance(const TokenType code,
       
    142                                      const int advance = 1);
       
    143         QString tokenizeCharacterReference();
       
    144 
       
    145         inline Token tokenizeStringLiteral();
       
    146         inline Token tokenizeNumberLiteral();
       
    147 
       
    148         /**
       
    149          * @returns the character @p length characters from the current
       
    150          * position.
       
    151          */
       
    152         inline char peekAhead(const int length = 1) const;
       
    153 
       
    154         /**
       
    155          * @returns whether the stream, starting from @p offset from the
       
    156          * current position, matches @p chs. The length of @p chs is @p len.
       
    157          */
       
    158         inline bool aheadEquals(const char *const chs,
       
    159                                 const int len,
       
    160                                 const int offset = 1) const;
       
    161 
       
    162         inline Token tokenizeNCName();
       
    163         static inline bool isOperatorKeyword(const TokenType);
       
    164 
       
    165         static inline bool isDigit(const char ch);
       
    166         static inline Token error();
       
    167         inline TokenType consumeWhitespace();
       
    168 
       
    169         /**
       
    170          * @short Returns the character at the current position, converted to
       
    171          * @c ASCII.
       
    172          *
       
    173          * Equivalent to calling:
       
    174          *
       
    175          * @code
       
    176          * current().toAscii();
       
    177          * @endcode
       
    178          */
       
    179         inline char peekCurrent() const;
       
    180 
       
    181         /**
       
    182          * Disregarding encoding conversion, equivalent to calling:
       
    183          *
       
    184          * @code
       
    185          * peekAhead(0);
       
    186          * @endcode
       
    187          */
       
    188         inline const QChar current() const;
       
    189 
       
    190         /**
       
    191          * Looks ahead for "::", allowing whitespace in front of it.
       
    192          *
       
    193          * @returns the length of whitespace scanned before reaching "::", or
       
    194          * -1 if something else was found.
       
    195          */
       
    196         int peekForColonColon() const;
       
    197 
       
    198         static inline bool isNCNameStart(const QChar ch);
       
    199         static inline bool isNCNameBody(const QChar ch);
       
    200         static inline const TokenMap *lookupKeyword(const QString &keyword);
       
    201         inline void popState();
       
    202         inline void pushState(const State state);
       
    203         inline State state() const;
       
    204         inline void setState(const State s);
       
    205         static bool isTypeToken(const TokenType t);
       
    206 
       
    207         inline Token tokenizeNCNameOrQName();
       
    208         /**
       
    209          * Advances m_pos until @p content is encountered.
       
    210          *
       
    211          * Returned is the length stretching from m_pos when starting, until
       
    212          * @p content is encountered. @p content is not included in the length.
       
    213          */
       
    214         int scanUntil(const char *const content);
       
    215 
       
    216         /**
       
    217          * Same as calling:
       
    218          * @code
       
    219          * pushState(state());
       
    220          * @endcode
       
    221          */
       
    222         inline void pushState();
       
    223 
       
    224         /**
       
    225          * Consumes only whitespace, in the traditional sense. The function exits
       
    226          * if non-whitespace is encountered, such as the start of a comment.
       
    227          *
       
    228          * @returns @c true if the end was reached, otherwise @c false
       
    229          */
       
    230         inline bool consumeRawWhitespace();
       
    231 
       
    232         /**
       
    233          * @short Parses comments: <tt>(: comment content :)</tt>. It recurses for
       
    234          * parsing nested comments.
       
    235          *
       
    236          * It is assumed that the start token for the comment, "(:", has
       
    237          * already been parsed.
       
    238          *
       
    239          * Typically, don't call this function, but consumeWhitespace().
       
    240          *
       
    241          * @see <a href="http://www.w3.org/TR/xpath20/#comments">XML Path Language (XPath)
       
    242          * 2.0, 2.6 Comments</a>
       
    243          * @returns
       
    244          * - SUCCESS if everything went ok
       
    245          * - ERROR if there was an error in parsing one or more comments
       
    246          * - END_OF_FILE if the end was reached
       
    247          */
       
    248         Tokenizer::TokenType consumeComment();
       
    249 
       
    250         /**
       
    251          * Determines whether @p code is a keyword
       
    252          * that is followed by a second keyword. For instance <tt>declare
       
    253          * function</tt>.
       
    254          */
       
    255         static inline bool isPhraseKeyword(const TokenType code);
       
    256 
       
    257         /**
       
    258          * A set of indexes into a QString, the one being passed to
       
    259          * normalizeEOL() whose characters shouldn't be normalized. */
       
    260         typedef QSet<int> CharacterSkips;
       
    261 
       
    262         /**
       
    263          * Returns @p input, normalized according to
       
    264          * <a href="http://www.w3.org/TR/xquery/#id-eol-handling">XQuery 1.0:
       
    265          * An XML Query Language, A.2.3 End-of-Line Handling</a>
       
    266          */
       
    267         static QString normalizeEOL(const QString &input,
       
    268                                     const CharacterSkips &characterSkips);
       
    269 
       
    270         inline bool atEnd() const // True once m_pos equals m_length.
       
    271         {
       
    272             return m_pos == m_length;
       
    273         }
       
    274 
       
    275         Token nextToken();
       
    276         /**
       
    277          * Instead of recognizing and tokenizing embedded expressions in
       
    278          * direct attribute constructors, this function is essentially a mini
       
    279          * recursive-descent parser that has the necessary logic to recognize
       
    280          * embedded expressions and their potentially interfering string literals, in
       
    281          * order to scan to the very end of the attribute value, and return the
       
    282          * whole as a string.
       
    283          *
       
    284          * There are of course syntax errors this function will not detect, but
       
    285          * that is ok since the attributes will be parsed once more.
       
    286          *
       
    287          * An inelegant solution, but which gets the job done.
       
    288          *
       
    289          * @see commenceScanOnly(), resumeTokenizationFrom()
       
    290          */
       
    291         Token attributeAsRaw(const QChar separator,
       
    292                              int &stack,
       
    293                              const int startPos,
       
    294                              const bool inLiteral,
       
    295                              QString &result);
       
    296 
       
    297         const QString           m_data;
       
    298         const int               m_length;
       
    299         State                   m_state;
       
    300         QStack<State>           m_stateStack;
       
    301         int                     m_pos;
       
    302 
       
    303         /**
       
    304          * The current line number.
       
    305          *
       
    306          * The line number and column number both start at 1.
       
    307          */
       
    308         int                     m_line;
       
    309 
       
    310         /**
       
    311          * The offset for where the current column starts.
       
    312          * NOTE(review): presumably m_pos - m_columnOffset is the current
       
    313          * column, since m_length is const; confirm against the .cpp.
       
    314          *
       
    315          * The line number and column number both start at 1.
       
    316          */
       
    317         int                     m_columnOffset;
       
    318 
       
    319         const NamePool::Ptr     m_namePool;
       
    320         QStack<Token>           m_tokenStack;
       
    321         QHash<QString, QChar>   m_charRefs;
       
    322         bool                    m_scanOnly;
       
    323 
       
    324         Q_DISABLE_COPY(XQueryTokenizer)
       
    325     };
       
   326 }
       
   327 
       
   328 QT_END_NAMESPACE
       
   329 
       
   330 QT_END_HEADER
       
   331 
       
   332 #endif