src/xmlpatterns/parser/qxquerytokenizer.cpp
changeset 0 1918ee327afb
child 4 3b1da2848fc7
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/xmlpatterns/parser/qxquerytokenizer.cpp	Mon Jan 11 14:00:40 2010 +0000
@@ -0,0 +1,2249 @@
+/****************************************************************************
+**
+** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
+** All rights reserved.
+** Contact: Nokia Corporation (qt-info@nokia.com)
+**
+** This file is part of the QtXmlPatterns module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** No Commercial Usage
+** This file contains pre-release code and may not be distributed.
+** You may use this file in accordance with the terms and conditions
+** contained in the Technology Preview License Agreement accompanying
+** this package.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file.  Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Nokia gives you certain additional
+** rights.  These rights are described in the Nokia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** If you have questions regarding the use of this file, please contact
+** Nokia at qt-info@nokia.com.
+**
+**
+**
+**
+**
+**
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#include <QByteArray>
+
+#include "qquerytransformparser_p.h"
+
+#include "qxquerytokenizer_p.h"
+
+#include "qtokenlookup.cpp"
+
+QT_BEGIN_NAMESPACE
+
+namespace QPatternist
+{
+
+#define handleWhitespace()                      \
+{                                               \
+    const TokenType t = consumeWhitespace();    \
+    if(t != SUCCESS)                            \
+        return Token(t);                        \
+}
+
+XQueryTokenizer::XQueryTokenizer(const QString &query,
+                                 const QUrl &location,
+                                 const State startingState) : Tokenizer(location)
+                                                            , m_data(query)
+                                                            , m_length(query.length())
+                                                            , m_state(startingState)
+                                                            , m_pos(0)
+                                                            , m_line(1)
+                                                            , m_columnOffset(0)
+                                                            , m_scanOnly(false)
+{
+    Q_ASSERT(location.isValid() || location.isEmpty());
+}
+
+const QChar XQueryTokenizer::current() const
+{
+    if(m_pos < m_length)
+        return m_data.at(m_pos);
+    else
+        return QChar();
+}
+
+char XQueryTokenizer::peekCurrent() const
+{
+    return current().toAscii();
+}
+
+int XQueryTokenizer::peekForColonColon() const
+{
+    /* Note, we don't modify m_pos in this function, so we need to do offset
+     * calculations. */
+    int pos = m_pos;
+
+    while(pos < m_length)
+    {
+        switch(m_data.at(pos).toAscii())
+        {
+            /* Fallthrough these four. */
+            case ' ':
+            case '\t':
+            case '\n':
+            case '\r':
+                break;
+            case ':':
+            {
+                if(peekAhead((pos - m_pos) + 1) == ':')
+                    return pos - m_pos;
+                /* Fallthrough. */
+            }
+            default:
+                return -1;
+        }
+        ++pos;
+    }
+
+    return -1;
+}
+
+Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
+                                                      const State s,
+                                                      const int advance)
+{
+    Q_ASSERT(advance >= 0);
+    m_pos += advance;
+    setState(s);
+    return Token(code);
+}
+
+Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
+                                                      const QString &value,
+                                                      const State s)
+{
+    setState(s);
+    return Token(code, value);
+}
+
+Tokenizer::Token XQueryTokenizer::tokenAndAdvance(const TokenType code,
+                                                  const int advance)
+{
+    Q_ASSERT(advance >= 0);
+    m_pos += advance;
+    return Token(code);
+}
+
+QString XQueryTokenizer::normalizeEOL(const QString &input,
+                                      const CharacterSkips &characterSkips)
+{
+    const int len = input.count();
+    QString result;
+
+    /* The likely hood is rather high it'll be the same content. */
+    result.reserve(len);
+
+    for(int i = 0; i < len; ++i)
+    {
+        const QChar &at = input.at(i);
+
+        if(characterSkips.contains(i))
+        {
+            result.append(at);
+            continue;
+        }
+        switch(input.at(i).unicode())
+        {
+            case '\r':
+            {
+                if(i + 1 < len && input.at(i + 1) == QLatin1Char('\n'))
+                    ++i;
+
+                /* Else, fallthrough. */
+            }
+            case '\n':
+            {
+                result.append(QLatin1Char('\n'));
+                continue;
+            }
+            default:
+            {
+                result.append(at);
+            }
+        }
+    }
+
+    return result;
+}
+
+Tokenizer::TokenType XQueryTokenizer::consumeComment()
+{
+    /* Below, we return ERROR instead of END_OF_FILE such that the parser
+     * sees an invalid comment. */
+    while(m_pos < m_length)
+    {
+        switch(peekCurrent())
+        {
+            case ':':
+            {
+                ++m_pos; /* Consume ':' */
+                if(atEnd())
+                    return ERROR;
+
+                if(peekCurrent() == ')')
+                {
+                    ++m_pos; /* Consume ')' */
+                    return SUCCESS; /* The comment closed nicely. */
+                }
+                continue; /* We don't want to increment m_pos twice. */
+            }
+            case '(':
+            { /* It looks like the start of a comment. */
+                ++m_pos;
+
+                if(atEnd())
+                    return END_OF_FILE;
+                else if(peekCurrent() == ':')
+                {
+                    /* And it is a nested comment -- parse it. */
+                    const TokenType retval = consumeComment();
+                    if(retval == SUCCESS)
+                        continue; /* Continue with our "own" comment. */
+                    else
+                        return retval; /* Return the error in the nested comment. */
+                }
+                break;
+            }
+            case '\n':
+            /* Fallthrough. */
+            case '\r':
+            {
+                /* We want to count \r\n as a single line break. */
+                if(peekAhead() == '\n')
+                    ++m_pos;
+
+                m_columnOffset = m_pos;
+                ++m_line;
+
+                break;
+            }
+        }
+        ++m_pos;
+    }
+
+    return ERROR; /* Error: we reached the end while inside a comment. */
+}
+
+bool XQueryTokenizer::consumeRawWhitespace()
+{
+    while(m_pos < m_length)
+    {
+        switch(peekCurrent())
+        {
+            case ' ':
+            case '\t':
+                break;
+            case '\n':
+            case '\r':
+            {
+                if(peekAhead() == '\n')
+                    ++m_pos;
+
+                m_columnOffset = m_pos;
+                ++m_line;
+
+                break;
+            }
+            default:
+                return false;
+        }
+        ++m_pos;
+    }
+    return true;
+}
+
+Tokenizer::TokenType XQueryTokenizer::consumeWhitespace()
+{
+    while(m_pos < m_length)
+    {
+        switch(peekCurrent())
+        {
+            case ' ':
+            case '\t':
+                break;
+            case '\n':
+            case '\r':
+            {
+                /* We want to count \r\n as a single line break. */
+                if(peekAhead() == '\n')
+                    ++m_pos;
+
+                m_columnOffset = m_pos;
+                ++m_line;
+
+                break;
+            }
+            case '(':
+            {
+                if(peekAhead() == ':')
+                {
+                    m_pos += 2; /* Consume "(:" */
+
+                    const TokenType comment = consumeComment();
+                    if(comment == SUCCESS)
+                        continue;
+                    else
+                        return comment;
+                }
+            }
+            default:
+                return SUCCESS;
+        }
+        ++m_pos;
+    }
+
+    return END_OF_FILE;
+}
+
+char XQueryTokenizer::peekAhead(const int length) const
+{
+    if(m_pos + length < m_length)
+        return m_data.at(m_pos + length).toAscii();
+    else
+        return 0;
+}
+
+Tokenizer::Token XQueryTokenizer::error()
+{
+    return Token(ERROR);
+}
+
+bool XQueryTokenizer::isDigit(const char ch)
+{
+    return ch >= '0' && ch <= '9';
+}
+
+/* Replace with function in QXmlUtils. Write test cases for this. */
+bool XQueryTokenizer::isNCNameStart(const QChar ch)
+{
+    if(ch == QLatin1Char('_'))
+        return true;
+
+    switch(ch.category())
+    {
+        case QChar::Letter_Lowercase:
+        case QChar::Letter_Uppercase:
+        case QChar::Letter_Other:
+        case QChar::Letter_Titlecase:
+        case QChar::Number_Letter:
+            return true;
+        default:
+            return false;
+    }
+}
+
+bool XQueryTokenizer::isNCNameBody(const QChar ch)
+{
+    switch(ch.unicode())
+    {
+        case '.':
+        case '_':
+        case '-':
+            return true;
+    }
+
+    switch(ch.category())
+    {
+        case QChar::Letter_Lowercase:
+        case QChar::Letter_Uppercase:
+        case QChar::Letter_Other:
+        case QChar::Letter_Titlecase:
+        case QChar::Number_Letter:
+        case QChar::Mark_SpacingCombining:
+        case QChar::Mark_Enclosing:
+        case QChar::Mark_NonSpacing:
+        case QChar::Letter_Modifier:
+        case QChar::Number_DecimalDigit:
+            return true;
+        default:
+            return false;
+    }
+}
+
+bool XQueryTokenizer::isPhraseKeyword(const TokenType code)
+{
+    switch(code)
+    {
+        /* Fallthrough all these. */
+        case CASTABLE:
+        case CAST:
+        case COPY_NAMESPACES:
+        case DECLARE:
+        case EMPTY:
+        case MODULE:
+        case IMPORT:
+        case INSTANCE:
+        case ORDER:
+        case ORDERING:
+        case XQUERY:
+        case STABLE:
+        case TREAT:
+            return true;
+        default:
+            return false;
+    }
+}
+
+bool XQueryTokenizer::isOperatorKeyword(const TokenType code)
+{
+    switch(code)
+    {
+        /* Fallthrough all these. */
+        case AS:
+        case ASCENDING:
+        case AT:
+        case CASE:
+        case CAST:
+        case CASTABLE:
+        case EQ:
+        case EXTERNAL:
+        case GE:
+        case G_EQ:
+        case G_GT:
+        case G_LT:
+        case G_NE:
+        case GT:
+        case IN:
+        case INHERIT:
+        case INSTANCE:
+        case IS:
+        case ITEM:
+        case LE:
+        case LT:
+        case NE:
+        case NO_INHERIT:
+        case NO_PRESERVE:
+        case OF:
+        case PRESERVE:
+        case RETURN:
+        case STABLE:
+        case TO:
+        case TREAT:
+            return true;
+        default:
+            return false;
+    };
+}
+
+bool XQueryTokenizer::isTypeToken(const TokenType t)
+{
+    switch(t)
+    {
+        /* Fallthrough all these. */
+        case ATTRIBUTE:
+        case COMMENT:
+        case DOCUMENT:
+        case DOCUMENT_NODE:
+        case ELEMENT:
+        case ITEM:
+        case NODE:
+        case PROCESSING_INSTRUCTION:
+        case SCHEMA_ATTRIBUTE:
+        case SCHEMA_ELEMENT:
+        case TEXT:
+            return true;
+        default:
+            return false;
+    }
+}
+
+Tokenizer::Token XQueryTokenizer::tokenizeNCNameOrQName()
+{
+    const int start = m_pos;
+
+    const Token t1 = tokenizeNCName();
+    if(t1.hasError())
+        return t1;
+
+    if(peekCurrent() != ':' || peekAhead() == '=')
+        return t1;
+
+    ++m_pos;
+
+    const Token t2 = tokenizeNCName();
+    if(t2.hasError())
+        return t2;
+    else
+        return Token(QNAME, m_data.mid(start, m_pos - start));
+}
+
+Tokenizer::Token XQueryTokenizer::tokenizeNumberLiteral()
+{
+    setState(Operator);
+    const int startPos = m_pos;
+    bool hasDot = false;
+    bool isXPath20 = false;
+
+    for(; m_pos < m_length; ++m_pos)
+    {
+        QChar ch(current());
+
+        char cell = ch.cell();
+
+        if(cell == 'e' || cell == 'E')
+        {
+            isXPath20 = true;
+            ++m_pos;
+            ch = current();
+
+            if(ch.row() != 0)
+                break;
+
+            cell = ch.cell();
+
+            if(cell == '+' || cell == '-')
+                continue;
+        }
+
+        if(isNCNameStart(ch))
+            return error();
+
+        if(cell < '0' || cell > '9')
+        {
+            if(cell == '.' && !hasDot)
+                hasDot = true;
+            else
+                break;
+        }
+    }
+
+    return Token(isXPath20 ? XPATH2_NUMBER : NUMBER, m_data.mid(startPos, m_pos - startPos));
+}
+
+QString XQueryTokenizer::tokenizeCharacterReference()
+{
+    Q_ASSERT(peekCurrent() == '&');
+
+    const int theEnd = m_data.indexOf(QLatin1Char(';'), m_pos + 1);
+
+    if(theEnd == -1) /* No ';' found, a syntax error. i18n. */
+        return QString();
+
+    QString content(m_data.mid(m_pos + 1, (theEnd - m_pos) - 1));
+    m_pos = theEnd;
+
+    const QChar charRef(charForReference(content));
+
+    if(!charRef.isNull())
+        return charRef;
+    else if(content.startsWith(QLatin1Char('#')))
+    {
+        int base;
+
+        /* It is only '#' or '#x'. */
+        if(content.length() < 2)
+            return QString();
+
+        /* We got a hex number if it starts with 'x', otherwise it's a decimal. */
+        if(content.at(1) == QLatin1Char('x'))
+        {
+            base = 16;
+            content = content.mid(2); /* Remove "#x". */
+        }
+        else
+        {
+            base = 10;
+            content = content.mid(1); /* Remove "#". */
+        }
+
+        bool conversionOK = false;
+        const int codepoint = content.toInt(&conversionOK, base);
+
+        if(conversionOK)
+        {
+            const QChar ch(codepoint);
+
+            if(ch.isNull())
+            {
+                /* We likely have something which require surrogate pairs. */
+                QString result;
+                result += QChar(QChar::highSurrogate(codepoint));
+                result += QChar(QChar::lowSurrogate(codepoint));
+                return result;
+            }
+            else
+                return ch;
+        }
+        else
+            return QString();
+    }
+    else
+        return QString();
+}
+
+int XQueryTokenizer::scanUntil(const char *const content)
+{
+    const int end = m_data.indexOf(QString::fromLatin1(content), m_pos);
+
+    if(end == -1)
+        return -1;
+    else
+    {
+        const int len = end - m_pos;
+        m_pos += len;
+        return len;
+    }
+}
+
+QChar XQueryTokenizer::charForReference(const QString &reference)
+{
+    if(m_charRefs.isEmpty())
+    {
+        /* Initialize. */
+        m_charRefs.reserve(5);
+        m_charRefs.insert(QLatin1String("lt"),     QLatin1Char('<'));
+        m_charRefs.insert(QLatin1String("gt"),     QLatin1Char('>'));
+        m_charRefs.insert(QLatin1String("amp"),    QLatin1Char('&'));
+        m_charRefs.insert(QLatin1String("quot"),   QLatin1Char('"'));
+        m_charRefs.insert(QLatin1String("apos"),   QLatin1Char('\''));
+    }
+
+    return m_charRefs.value(reference);
+}
+
+Tokenizer::Token XQueryTokenizer::tokenizeStringLiteral()
+{
+    const QChar delimiter(current());
+    /* We cannot unfortunately just scan and then do mid(),
+     * since we can encounter character references. */
+    QString result;
+
+    /* This is more likely than QString's default allocation. */
+    result.reserve(8);
+
+    CharacterSkips skipEOLNormalization;
+
+    /* Advance over the initial quote character. */
+    ++m_pos;
+
+    for(; m_pos < m_length; ++m_pos)
+    {
+        const QChar c(current());
+
+        if(c == QLatin1Char('&'))
+        {
+            const QString charRef(tokenizeCharacterReference());
+
+            if(charRef.isNull())
+                return error();
+            else
+            {
+                skipEOLNormalization.insert(result.count());
+                result.append(charRef);
+            }
+
+        }
+        else if(c == delimiter)
+        {
+            /* Maybe the escaping mechanism is used. For instance, "s""s"
+             * has the value `s"s'. */
+            ++m_pos;
+
+            if(current() == delimiter) /* Double quote. */
+                result += delimiter;
+            else
+                return Token(STRING_LITERAL, normalizeEOL(result, skipEOLNormalization));
+        }
+        else
+            result += c;
+    }
+
+    return error();
+}
+
+Tokenizer::Token XQueryTokenizer::tokenizeNCName()
+{
+    const int startPos = m_pos;
+
+    if(m_pos < m_length && isNCNameStart(current()))
+    {
+        ++m_pos;
+
+        for(; m_pos < m_length; ++m_pos)
+        {
+            if(!isNCNameBody(current()))
+                break;
+        }
+
+        return Token(NCNAME, m_data.mid(startPos, m_pos - startPos));
+    }
+    else
+        return error();
+}
+
+bool XQueryTokenizer::aheadEquals(const char *const chs,
+                                  const int len,
+                                  const int offset) const
+{
+    Q_ASSERT(len > 0);
+    Q_ASSERT(qstrlen(chs) == uint(len));
+
+    if(m_pos + len >= m_length)
+        return false;
+
+    for(int i = offset; i < (len + offset); ++i)
+    {
+        if(m_data.at(m_pos + i).toAscii() != chs[i - offset])
+            return false;
+    }
+
+    return true;
+}
+
+const TokenMap *XQueryTokenizer::lookupKeyword(const QString &keyword)
+{
+    return TokenLookup::value(keyword.toAscii().constData(), keyword.length());
+}
+
+XQueryTokenizer::State XQueryTokenizer::state() const
+{
+    return m_state;
+}
+
+void XQueryTokenizer::setState(const State s)
+{
+    m_state = s;
+}
+
+void XQueryTokenizer::pushState(const State s)
+{
+    m_stateStack.push(s);
+}
+
+void XQueryTokenizer::pushState()
+{
+    m_stateStack.push(m_state);
+}
+
+void XQueryTokenizer::popState()
+{
+    /* QStack::pop() asserts if it's empty, so we need to check
+     * it, since we might receive unbalanced curlies. */
+    if(!m_stateStack.isEmpty())
+        m_state = m_stateStack.pop();
+}
+
+Tokenizer::Token XQueryTokenizer::nextToken()
+{
+    switch(state())
+    {
+        /* We want to skip or do special whitespace handling for these
+         * states. So fallthrough all of the following. */
+        case AposAttributeContent:
+        case Axis:
+        case ElementContent:
+        case EndTag:
+        case Pragma:
+        case PragmaContent:
+        case ProcessingInstructionName:
+        case QuotAttributeContent:
+        case StartTag:
+        case XMLComment:
+            break;
+        default:
+            handleWhitespace();
+    }
+
+    switch(state())
+    {
+        case XMLSpaceDecl:
+        /* Fallthrough. */
+        case NamespaceKeyword:
+        {
+            switch(peekCurrent())
+            {
+                case ',':
+                    return tokenAndAdvance(COMMA);
+                case '"':
+                /* Fallthrough. */
+                case '\'':
+                {
+                    setState(NamespaceDecl);
+                    return tokenizeStringLiteral();
+                }
+            }
+
+            const Token id(tokenizeNCName());
+
+            if(id.type != NCNAME)
+                return id;
+
+            const TokenMap *const keyword = lookupKeyword(id.value);
+            if(keyword)
+            {
+                switch(keyword->token)
+                {
+                    case INHERIT:
+                    /* Fallthrough. */
+                    case NO_INHERIT:
+                    {
+                        setState(Default);
+                        break;
+                    }
+                    case NAMESPACE:
+                    {
+                        setState(NamespaceDecl);
+                        break;
+                    }
+                    case ORDERED:
+                    /* Fallthrough. */
+                    case UNORDERED:
+                    /* Fallthrough. */
+                    case STRIP:
+                    {
+                        setState(Default);
+                        break;
+                    }
+                    case PRESERVE:
+                    {
+                        if(state() != NamespaceKeyword)
+                            setState(Default);
+                    }
+                    default:
+                        break;
+                }
+
+                return Token(keyword->token);
+            }
+            else
+                return id;
+
+            Q_ASSERT(false);
+        }
+        case NamespaceDecl:
+        {
+            switch(peekCurrent())
+            {
+                case '=':
+                    return tokenAndAdvance(G_EQ);
+                case ';':
+                    return tokenAndChangeState(SEMI_COLON, Default);
+                case '\'':
+                /* Fallthrough. */
+                case '\"':
+                    return tokenizeStringLiteral();
+            }
+
+            const Token nc(tokenizeNCName());
+
+            handleWhitespace();
+
+            const char pc = peekCurrent();
+            const TokenMap* const t = lookupKeyword(nc.value);
+
+            if(pc == '\'' || (pc == '"' && t))
+                return tokenAndChangeState(t->token, Default, 0);
+            else
+                return nc;
+
+            Q_ASSERT(false);
+        }
+        case Axis:
+        {
+            if(peekCurrent() == ':')
+            {
+                Q_ASSERT(peekAhead() == ':');
+                m_pos += 2;
+                setState(AfterAxisSeparator);
+                return Token(COLONCOLON);
+            }
+            /* Fallthrough. */
+        }
+        case AfterAxisSeparator:
+        /* Fallthrough. */
+        case Default:
+           /* State Operator and state Default have a lot of tokens in common except
+            * for minor differences. So we treat them the same way, and sprinkles logic
+            * here and there to handle the small differences. */
+        /* Fallthrough. */
+        case Operator:
+        {
+            switch(peekCurrent())
+            {
+                case '=':
+                    return tokenAndChangeState(G_EQ, Default);
+                case '-':
+                    return tokenAndChangeState(MINUS, Default);
+                case '+':
+                    return tokenAndChangeState(PLUS, Default);
+                case '[':
+                    return tokenAndChangeState(LBRACKET, Default);
+                case ']':
+                    return tokenAndChangeState(RBRACKET, Operator);
+                case ',':
+                    return tokenAndChangeState(COMMA, Default);
+                case ';':
+                    return tokenAndChangeState(SEMI_COLON, Default);
+                case '$':
+                    return tokenAndChangeState(DOLLAR, VarName);
+                case '|':
+                    return tokenAndChangeState(BAR, Default);
+                case '?':
+                    return tokenAndChangeState(QUESTION, Operator);
+                case ')':
+                    return tokenAndChangeState(RPAREN, Operator);
+                case '@':
+                    return tokenAndChangeState(AT_SIGN, Default);
+                /* Fallthrough all these. */
+                case '1':
+                case '2':
+                case '3':
+                case '4':
+                case '5':
+                case '6':
+                case '7':
+                case '8':
+                case '9':
+                case '0':
+                    return tokenizeNumberLiteral();
+                case '.':
+                {
+                    const char next = peekAhead();
+                    if(next == '.')
+                        return tokenAndChangeState(DOTDOT, Operator, 2);
+                    /* .5 is allowed, as short form for 0.5:
+                     * <tt>[142]     DecimalLiteral     ::=     ("." Digits) | (Digits "." [0-9]*)</tt>
+                     */
+                    else if(isDigit(next))
+                        return tokenizeNumberLiteral();
+                    else
+                        return tokenAndChangeState(DOT, Operator);
+                }
+                case '\'':
+                /* Fallthrough. */
+                case '"':
+                {
+                    setState(Operator);
+                    return tokenizeStringLiteral();
+
+                }
+                case '(':
+                {
+                    if(peekAhead() == '#')
+                        return tokenAndChangeState(PRAGMA_START, Pragma, 2);
+                    else
+                        return tokenAndChangeState(LPAREN, Default);
+                }
+                case '*':
+                {
+                    if(peekAhead() == ':')
+                    {
+                        m_pos += 2; /* Consume *:. */
+                        const Token nc = tokenizeNCName();
+
+                        if(nc.hasError())
+                            return error();
+                        else
+                            return tokenAndChangeState(ANY_PREFIX, nc.value, Operator);
+                    }
+                    else
+                        return tokenAndChangeState(STAR, state() == Default ? Operator : Default);
+                }
+                case ':':
+                {
+                    switch(peekAhead())
+                    {
+                        case '=':
+                            return tokenAndChangeState(ASSIGN, Default, 2);
+                        case ':':
+                            return tokenAndChangeState(COLONCOLON, Default, 2);
+                        default:
+                            return error();
+                    }
+                }
+                case '!':
+                {
+                    if(peekAhead() == '=')
+                        return tokenAndChangeState(G_NE, Default, 2);
+                    else
+                        return error();
+                }
+                case '<':
+                {
+                    switch(peekAhead())
+                    {
+                        case '=':
+                            return tokenAndChangeState(G_LE, Default, 2);
+                        case '<':
+                            return tokenAndChangeState(PRECEDES, Default, 2);
+                        case '?':
+                        {
+                            pushState(Operator);
+                            return tokenAndChangeState(PI_START, ProcessingInstructionName, 2);
+                        }
+                        case '!':
+                        {
+                            if(aheadEquals("!--", 3))
+                            {
+                                m_pos += 3; /* Consume "!--". */
+                                pushState(Operator);
+                                return tokenAndChangeState(COMMENT_START, XMLComment);
+                            }
+                            /* Fallthrough. It's a syntax error, and this is a good way to report it. */
+                        }
+                        default:
+                        {
+                            if((m_pos + 1) < m_length && isNCNameStart(m_data.at(m_pos + 1)))
+                            {
+                                /* We assume it's an element constructor. */
+                                pushState(Operator);
+                            }
+
+                            return tokenAndChangeState(G_LT, state() == Operator ? Default : StartTag);
+                        }
+                    }
+                }
+                case '>':
+                {
+                    switch(peekAhead())
+                    {
+                        case '=':
+                            return tokenAndChangeState(G_GE, Default, 2);
+                        case '>':
+                            return tokenAndChangeState(FOLLOWS, Default, 2);
+                        default:
+                            return tokenAndChangeState(G_GT, Default);
+                    }
+                }
+                case '/':
+                {
+                    if(peekAhead() == '/')
+                        return tokenAndChangeState(SLASHSLASH, Default, 2);
+                    else
+                        return tokenAndChangeState(SLASH, Default);
+                }
+                case '{':
+                {
+                    pushState(Operator);
+                    return tokenAndChangeState(CURLY_LBRACE, Default);
+                }
+                case '}':
+                {
+                    popState();
+
+                    return tokenAndAdvance(CURLY_RBRACE);
+                }
+            }
+
+            /* Ok. We're in state Default or Operator, and it wasn't a simple
+             * character. */
+
+            const Token id(tokenizeNCName());
+
+            if(id.type != NCNAME)
+                return id;
+
+            const TokenMap *const keyword = lookupKeyword(id.value);
+
+            if(state() == Operator)
+            {
+                if(keyword)
+                {
+                    if(keyword->token == DEFAULT || keyword->token == ASCENDING || keyword->token == DESCENDING)
+                        setState(Operator);
+                    else if(keyword->token == RETURN)
+                        setState(Default);
+                    else if(isPhraseKeyword(keyword->token))
+                    {
+                        const TokenType ws = consumeWhitespace();
+                        if(ws == ERROR)
+                            return error();
+
+                        const Token id2(tokenizeNCName());
+                        const TokenMap *const keyword2 = lookupKeyword(id2.value);
+
+                        if(keyword2)
+                        {
+                            if(keyword->token == TREAT && keyword2->token == AS)
+                                setState(ItemType);
+                            else if (keyword->token == CAST || (keyword->token == CASTABLE && keyword2->token == AS) || keyword2->token == BY)
+                                setState(Default);
+
+                            m_tokenStack.push(Token(keyword2->token));
+                        }
+                        else
+                            m_tokenStack.push(id2);
+
+                        return Token(keyword->token);
+                    }
+                    else
+                    {
+                        /* Such that we tokenize the second token in "empty greatest". */
+                        if(keyword->token != EMPTY)
+                            setState(Default);
+                    }
+
+                    if(keyword->token == AS || keyword->token == CASE)
+                        setState(ItemType);
+
+                    return Token(keyword->token);
+                }
+                else
+                    return id;
+            }
+
+            Q_ASSERT(state() == Default || state() == Axis || state() == AfterAxisSeparator);
+
+            /*
+             * This is hard. Consider this:
+             *
+             * Valid:           child       ::nameTest
+             * Valid:           child::     nameTest
+             * Syntax Error:    child       :localName
+             * Syntax Error:    child:      localName
+             *
+             * Consider "child ::name". Right now, we're here:
+             *                ^
+             * We don't know whether "child" is a prefix and hence the whitespace is invalid,
+             * or whether it's an axis and hence skippable. */
+            {
+                const int wsLength = peekForColonColon();
+                /* We cannot call handleWhitespace() because it returns on
+                 * END_OF_FILE, and we have parsed up keyword, and we need to
+                 * deal with that.
+                 *
+                 * If we have a colon colon, which means the whitespace is
+                 * allowed, we skip it. */
+                if(wsLength != -1)
+                    m_pos += wsLength;
+            }
+
+            /* Handle name tests. */
+            if(peekCurrent() == ':')
+            {
+                switch(peekAhead())
+                {
+                    case '=':
+                        return id;
+                    case '*':
+                    {
+                        m_pos += 2;
+                        return tokenAndChangeState(ANY_LOCAL_NAME, id.value, Operator);
+                    }
+                    case ':':
+                    {
+                        /* We have an axis. */
+                        setState(Axis);
+                        return keyword ? Token(keyword->token) : id;
+                    }
+                    default:
+                    {
+                        /* It's a QName. */
+                        ++m_pos; /* Consume the colon. */
+
+                        const Token id2(tokenizeNCName());
+
+                        if(id2.type != NCNAME)
+                        {
+                            --m_pos;
+                            return id;
+                        }
+
+                        setState(Operator);
+                        const int qNameLen = id.value.length() + id2.value.length() + 1;
+                        return Token(QNAME, m_data.mid(m_pos - qNameLen, qNameLen));
+                    }
+                }
+            }
+
+            if(!keyword || isOperatorKeyword(keyword->token))
+            {
+                setState(Operator);
+                return id;
+            }
+
+            const TokenType ws = consumeWhitespace();
+            if(ws == ERROR) // TODO this should test for success. Write test.
+                return Token(ERROR);
+
+            if(atEnd())
+            {
+                setState(Operator);
+                return id;
+            }
+
+            /* Let the if-body apply for constructors, and node type tests. */
+            if(isTypeToken(keyword->token) ||
+               keyword->token == TYPESWITCH ||
+               keyword->token == ORDERED ||
+               keyword->token == UNORDERED ||
+               keyword->token == IF)
+            {
+                switch(peekCurrent())
+                {
+                    case '(':
+                    {
+                        // TODO See if we can remove DOCUMENT from isTypeToken.
+                        if(isTypeToken(keyword->token) && keyword->token != DOCUMENT)
+                        {
+                            m_tokenStack.push(Token(LPAREN));
+                            ++m_pos; /* Consume '('. */
+                            pushState(Operator);
+
+                            if(keyword->token == PROCESSING_INSTRUCTION)
+                                setState(KindTestForPI);
+                            else
+                                setState(KindTest);
+
+                            return Token(keyword->token);
+                        }
+                        else if(keyword->token == TYPESWITCH || keyword->token == IF)
+                            return Token(keyword->token);
+                        else /* It's a function call. */
+                            return id;
+                    }
+                    case '{':
+                    {
+                        m_tokenStack.push(Token(CURLY_LBRACE));
+                        ++m_pos; /* Consume '{'. */
+                        pushState(Operator);
+                        /* Stay in state Default. */
+                        return Token(keyword->token);
+                    }
+                    default:
+                    {
+                        /* We have read in a token which is for instance
+                         * "return", and now it can be an element
+                         * test("element") a node kind test("element()"), or a
+                         * computed element constructor("element name {...").
+                         * We need to do a two-token lookahead here, because
+                         * "element return" can be an element test followed by
+                         * the return keyword, but it can also be an element
+                         * constructor("element return {"). */
+                        if(isNCNameStart(current()))
+                        {
+                            const int currentPos = m_pos;
+                            const Token token2 = tokenizeNCNameOrQName();
+
+                            if(token2.hasError())
+                                return token2;
+
+                            handleWhitespace();
+
+                            if(peekCurrent() == '{')
+                            {
+                                /* An element constructor. */
+                                m_tokenStack.push(token2);
+                                return Token(keyword->token);
+                            }
+
+                            /* We jump back in the stream, we need to tokenize token2 according
+                             * to the state. */
+                            m_pos = currentPos;
+                            setState(Operator);
+                            return Token(NCNAME, QLatin1String(keyword->name));
+                        }
+                    }
+                }
+            }
+
+            if(peekCurrent() == '$')
+            {
+                setState(VarName);
+                return Token(keyword->token);
+            }
+
+            /* It's not a node type, it's not the typeswitch expression, but it is a function callsite. */
+            if(peekCurrent() == '(')
+                return id;
+            else if(peekCurrent() == '{' && keyword->token == VALIDATE)
+                return Token(keyword->token);
+
+            if(!isNCNameStart(current()))
+            {
+                setState(Operator);
+                return id;
+            }
+
+            const Token id2(tokenizeNCName());
+            const TokenMap *const keyword2 = lookupKeyword(id2.value);
+
+            if(!keyword2)
+            {
+                /* It's a syntax error. All cases of two subsequent ncnames are keywords(e.g, declarations). */
+                setState(Operator);
+                return id;
+            }
+
+            switch(keyword->token)
+            {
+                case DECLARE:
+                {
+                    switch(keyword2->token)
+                    {
+                        case VARIABLE:
+                        /* Fallthrough. */
+                        case FUNCTION:
+                        {
+                            m_tokenStack.push(Token(keyword2->token));
+                            setState(Default);
+                            return Token(keyword->token);
+                        }
+                        case OPTION:
+                        {
+                            m_tokenStack.push(Token(keyword2->token));
+                            setState(Default);
+                            return Token(keyword->token);
+                        }
+                        case COPY_NAMESPACES:
+                        /* Fallthrough. */
+                        case ORDERING:
+                        {
+                            m_tokenStack.push(Token(keyword2->token));
+                            setState(NamespaceKeyword);
+                            return Token(keyword->token);
+                        }
+                        case CONSTRUCTION:
+                        {
+                            // TODO identical to CONSTRUCTION?
+                            m_tokenStack.push(Token(keyword2->token));
+                            setState(Operator);
+                            return Token(keyword->token);
+                        }
+                        case NAMESPACE:
+                        /* Fallthrough. */
+                        case BASEURI:
+                        {
+                            m_tokenStack.push(Token(keyword2->token));
+                            setState(NamespaceDecl);
+                            return Token(keyword->token);
+                        }
+                        case BOUNDARY_SPACE:
+                        {
+                            m_tokenStack.push(Token(keyword2->token));
+                            setState(XMLSpaceDecl);
+                            return Token(keyword->token);
+                        }
+                        case DEFAULT:
+                        {
+                            m_tokenStack.push(Token(keyword2->token));
+
+                            const TokenType ws2 = consumeWhitespace();
+                            if(ws2 != SUCCESS)
+                            {
+                                m_tokenStack.prepend(Token(ws2));
+                                return Token(keyword->token);
+                            }
+
+                            const Token id3(tokenizeNCName());
+
+                            if(id3.type != NCNAME)
+                            {
+                                m_tokenStack.prepend(id3);
+                                return Token(keyword->token);
+                            }
+
+                            const TokenMap *const keyword3 = lookupKeyword(id3.value);
+                            if(!keyword3)
+                            {
+                                m_tokenStack.prepend(id3);
+                                return Token(keyword->token);
+                            }
+                            else
+                            {
+                                m_tokenStack.prepend(Token(keyword3->token));
+
+                                if(keyword3->token == ORDER)
+                                    setState(Operator);
+                                else
+                                    setState(NamespaceDecl);
+                            }
+
+                            return Token(keyword->token);
+                        }
+                        default:
+                        {
+                            m_tokenStack.push(Token(keyword2->token));
+                            setState(Default);
+                            return id;
+                        }
+                    }
+                }
+                case XQUERY:
+                {
+                    m_tokenStack.push(Token(keyword2->token));
+
+                    if(keyword2->token == VERSION)
+                    {
+                        setState(NamespaceDecl);
+                        return Token(keyword->token);
+                    }
+                    else
+                    {
+                        setState(Operator);
+                        return id;
+                    }
+                }
+                case IMPORT:
+                {
+                    m_tokenStack.push(Token(keyword2->token));
+
+                    switch(keyword2->token)
+                    {
+                        case SCHEMA:
+                        /* Fallthrough. */
+                        case MODULE:
+                        {
+                            setState(NamespaceKeyword);
+                            return Token(keyword->token);
+                        }
+                        default:
+                        {
+                            setState(Operator);
+                            return id;
+                        }
+                    }
+                }
+                case VALIDATE:
+                {
+                    m_tokenStack.push(Token(keyword2->token));
+
+                    switch(keyword2->token)
+                    {
+                        case LAX:
+                        case STRICT:
+                        {
+                            pushState(Operator);
+                            return Token(keyword->token);
+                        }
+                        default:
+                        {
+                            setState(Operator);
+                            return id;
+                        }
+                    }
+                }
+                default:
+                {
+                    m_tokenStack.push(Token(keyword2->token));
+                    setState(Operator);
+                    return id;
+                }
+            }
+
+            Q_ASSERT(false);
+
+        }
+        case VarName:
+        {
+            if(peekCurrent() == '$')
+                return tokenAndAdvance(DOLLAR);
+
+            setState(Operator);
+            return tokenizeNCNameOrQName();
+            Q_ASSERT(false);
+        }
+        case ItemType:
+        {
+            switch(peekCurrent())
+            {
+                case '(':
+                    return tokenAndChangeState(LPAREN, KindTest);
+                case '$':
+                    return tokenAndChangeState(DOLLAR, VarName);
+            }
+
+            const Token name(tokenizeNCNameOrQName());
+
+            if(name.hasError())
+                return error();
+
+            else if(name.type == QNAME)
+            {
+                setState(OccurrenceIndicator);
+                return name;
+            }
+            else
+            {
+                const TokenMap *const keyword = lookupKeyword(name.value);
+
+                if(keyword)
+                {
+                    pushState(OccurrenceIndicator);
+                    return Token(keyword->token);
+                }
+                else
+                {
+                    setState(Default);
+                    return name;
+                }
+            }
+            Q_ASSERT(false);
+        }
+        case KindTest:
+        {
+            switch(peekCurrent())
+            {
+                case ')':
+                {
+                    popState();
+                    return tokenAndAdvance(RPAREN);
+                }
+                case '(':
+                    return tokenAndAdvance(LPAREN);
+                case ',':
+                    return tokenAndAdvance(COMMA);
+                case '*':
+                    return tokenAndAdvance(STAR);
+                case '?':
+                    return tokenAndAdvance(QUESTION);
+                case '\'':
+                /* Fallthrough. */
+                case '"':
+                    return tokenizeStringLiteral();
+            }
+
+            const Token nc(tokenizeNCNameOrQName());
+            if(nc.hasError())
+                return nc;
+
+            const TokenType ws = consumeWhitespace();
+            if(ws == ERROR)
+                return error();
+
+            if(peekCurrent() == '(')
+            {
+                const TokenMap *const keyword = lookupKeyword(nc.value);
+                if(keyword)
+                {
+                    pushState(KindTest);
+                    return Token(keyword->token);
+                }
+                else
+                    return nc;
+            }
+            else
+                return nc;
+            Q_ASSERT(false);
+        }
+        case KindTestForPI:
+        {
+            switch(peekCurrent())
+            {
+                case ')':
+                {
+                    popState();
+                    return tokenAndAdvance(RPAREN);
+                }
+                case '\'':
+                /* Fallthrough. */
+                case '"':
+                    return tokenizeStringLiteral();
+                default:
+                    return tokenizeNCName();
+            }
+            Q_ASSERT(false);
+        }
+        case OccurrenceIndicator:
+        {
+            switch(peekCurrent())
+            {
+                case '?':
+                    return tokenAndChangeState(QUESTION, Operator);
+                case '*':
+                    return tokenAndChangeState(STAR, Operator);
+                case '+':
+                    return tokenAndChangeState(PLUS, Operator);
+                default:
+                {
+                    setState(Operator);
+                    return nextToken();
+                }
+            }
+            Q_ASSERT(false);
+        }
+        case XQueryVersion:
+        {
+            switch(peekCurrent())
+            {
+                case '\'':
+                /* Fallthrough. */
+                case '"':
+                    return tokenizeStringLiteral();
+                case ';':
+                    return tokenAndChangeState(SEMI_COLON, Default);
+            }
+
+            const Token id(tokenizeNCName());
+
+            if(id.type != NCNAME)
+                return id;
+
+            const TokenMap *const keyword = lookupKeyword(id.value);
+            if(keyword)
+                return tokenAndChangeState(keyword->token, Default);
+            else
+                return id;
+            Q_ASSERT(false);
+        }
+        case StartTag:
+        {
+            if(peekAhead(-1) == '<')
+            {
+                if(current().isSpace())
+                    return Token(ERROR);
+            }
+            else
+            {
+                if(consumeRawWhitespace())
+                    return Token(END_OF_FILE);
+            }
+
+            switch(peekCurrent())
+            {
+                case '/':
+                {
+                    if(peekAhead() == '>')
+                    {
+                        m_pos += 2;
+
+                        if(m_scanOnly)
+                            return Token(POSITION_SET);
+                        else
+                        {
+                            popState();
+                            return Token(QUICK_TAG_END);
+                        }
+                    }
+                    else
+                        return error();
+                }
+                case '>':
+                {
+                    if(m_scanOnly)
+                        return tokenAndChangeState(POSITION_SET, StartTag);
+                    else
+                        return tokenAndChangeState(G_GT, ElementContent);
+                }
+                case '=':
+                    return tokenAndAdvance(G_EQ);
+                case '\'':
+                    return tokenAndChangeState(APOS, AposAttributeContent);
+                case '"':
+                    return tokenAndChangeState(QUOTE, QuotAttributeContent);
+                default:
+                    return tokenizeNCNameOrQName();
+            }
+            Q_ASSERT(false);
+        }
+        case AposAttributeContent:
+        /* Fallthrough. */
+        case QuotAttributeContent:
+        {
+            const QChar sep(state() == AposAttributeContent ? QLatin1Char('\'') : QLatin1Char('"'));
+            QString result;
+            result.reserve(20);
+
+            if(m_scanOnly)
+            {
+                int stack = 0;
+                return attributeAsRaw(sep, stack, m_pos, true, result);
+            }
+
+            Q_ASSERT(!m_scanOnly);
+            while(true)
+            {
+                if(atEnd())
+                {
+                    /* In the case that the XSL-T tokenizer invokes us with
+                     * default state QuotAttributeContent, we need to be able
+                     * to return a single string, in case that is all we have
+                     * accumulated. */
+                    if(result.isEmpty())
+                        return Token(END_OF_FILE);
+                    else
+                        return Token(STRING_LITERAL, result);
+                }
+
+                const QChar curr(current());
+
+                if(curr == sep)
+                {
+                    if(m_pos + 1 == m_length)
+                        return Token(END_OF_FILE);
+
+                    if(m_data.at(m_pos + 1) == sep)
+                    {
+                        /* The quoting mechanism was used. */
+                        m_pos += 2;
+                        result.append(sep);
+                        continue;
+                    }
+
+                    const QChar next(m_data.at(m_pos + 1));
+                    if(!next.isSpace() && next != QLatin1Char('/') && next != QLatin1Char('>'))
+                        return Token(ERROR); // i18n Space must separate attributes
+                    else if(result.isEmpty())
+                    {
+                        return tokenAndChangeState(state() == AposAttributeContent ? APOS : QUOTE,
+                                                   StartTag, 1);
+                    }
+                    else
+                    {
+                        /* Don't consume the sep, but leave it so we next time return a token for it. */
+                        return Token(STRING_LITERAL, result);
+                    }
+
+                    ++m_pos;
+                    continue;
+                }
+                else if(curr == QLatin1Char('{'))
+                {
+                    if(m_pos + 1 == m_length)
+                        return Token(END_OF_FILE);
+                    else if(peekAhead() == '{')
+                    {
+                        ++m_pos;
+                        result.append(QLatin1Char('{'));
+                    }
+                    else
+                    {
+                        if(result.isEmpty())
+                        {
+                            /* The Attribute Value Template appeared directly in the attribute. */
+                            pushState();
+                            return tokenAndChangeState(CURLY_LBRACE, Default);
+                        }
+                        else
+                        {
+                            /* We don't advance, keep '{' as next token. */
+                            return Token(STRING_LITERAL, result);
+                        }
+                    }
+                }
+                else if(curr == QLatin1Char('}'))
+                {
+                    if(m_pos + 1 == m_length)
+                        return Token(END_OF_FILE);
+                    else if(peekAhead() == '}')
+                    {
+                        ++m_pos;
+                        result.append(QLatin1Char('}'));
+                    }
+                    else
+                        return Token(ERROR);
+                }
+                else if(curr == QLatin1Char('&'))
+                {
+                    const QString ret(tokenizeCharacterReference());
+                    if(ret.isNull())
+                        return Token(ERROR);
+                    else
+                        result.append(ret);
+                }
+                else if(curr == QLatin1Char('<'))
+                    return Token(STRING_LITERAL, result);
+                else
+                {
+                    /* See Extensible Markup Language (XML) 1.0 (Fourth Edition),
+                     * 3.3.3 Attribute-Value Normalization.
+                     *
+                     * However, it is complicated a bit by that AVN is defined on top of
+                     * EOL normalization and we do those two in one go here. */
+                    switch(curr.unicode())
+                    {
+                        case 0xD:
+                        {
+                            if(peekAhead() == '\n')
+                            {
+                                result.append(QLatin1Char(' '));
+                                ++m_pos;
+                                break;
+                            }
+                        }
+                        case 0xA:
+                        /* Fallthrough. */
+                        case 0x9:
+                        {
+                            result.append(QLatin1Char(' '));
+                            break;
+                        }
+                        default:
+                            result.append(curr);
+                    }
+                }
+
+                ++m_pos;
+            }
+            Q_ASSERT(false);
+        }
+        case ElementContent:
+        {
+            QString result;
+            result.reserve(20);
+
+            /* Whether the text node, result, may be whitespace only. Character references
+             * and CDATA sections disables that. */
+            bool mayBeWS = true;
+
+            CharacterSkips skipEOLNormalization;
+
+            while(true)
+            {
+                if(atEnd())
+                    return Token(END_OF_FILE);
+
+                switch(peekCurrent())
+                {
+                    case '<':
+                    {
+                        if(!result.isEmpty() && peekAhead(2) != '[')
+                        {
+                            /* We encountered the end, and it was not a CDATA section. */
+                            /* We don't advance. Next time we'll handle the <... stuff. */
+                            return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
+                        }
+
+                        ++m_pos;
+                        if(atEnd())
+                            return Token(END_OF_FILE);
+
+                        const QChar ahead(current());
+                        if(ahead.isSpace())
+                            return error();
+                        else if(ahead == QLatin1Char('/'))
+                        {
+                            if(m_pos + 1 == m_length)
+                                return Token(END_OF_FILE);
+                            else if(m_data.at(m_pos + 1).isSpace())
+                                return error();
+                            else
+                                return tokenAndChangeState(BEGIN_END_TAG, EndTag);
+                        }
+                        else if(isNCNameStart(ahead))
+                        {
+                            pushState();
+                            return tokenAndChangeState(G_LT, StartTag, 0);
+                        }
+                        else if(aheadEquals("!--", 3, 0))
+                        {
+                            pushState();
+                            m_pos += 3;
+                            return tokenAndChangeState(COMMENT_START, XMLComment, 0);
+                        }
+                        else if(aheadEquals("![CDATA[", 8, 0))
+                        {
+                            mayBeWS = false;
+                            m_pos += 8;
+                            const int start = m_pos;
+                            const int len = scanUntil("]]>");
+
+                            if(len == -1)
+                                return Token(END_OF_FILE);
+
+                            m_pos += 2; /* Consume "]]>". Note that m_pos is on '!'. */
+                            result.append(m_data.mid(start, len));
+                            break;
+                        }
+                        else if(ahead == QLatin1Char('?'))
+                        {
+                            pushState();
+                            return tokenAndChangeState(PI_START, ProcessingInstructionName);
+                        }
+                        else
+                            return Token(G_LT);
+                    }
+                    case '&':
+                    {
+                        const QString ret(tokenizeCharacterReference());
+                        if(ret.isNull())
+                            return Token(ERROR);
+                        else
+                        {
+                            skipEOLNormalization.insert(result.count());
+                            result.append(ret);
+                            mayBeWS = false;
+                            break;
+                        }
+                    }
+                    case '{':
+                    {
+                        // TODO remove this check, also below.
+                        if(m_pos + 1 == m_length)
+                            return Token(END_OF_FILE);
+                        else if(peekAhead() == '{')
+                        {
+                            ++m_pos;
+                            result.append(QLatin1Char('{'));
+                        }
+                        else
+                        {
+                            if(result.isEmpty())
+                            {
+                                pushState();
+                                return tokenAndChangeState(CURLY_LBRACE, Default);
+                            }
+                            else
+                            {
+                                /* We don't advance here. */
+                                return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
+                            }
+                        }
+                        break;
+                    }
+                    case '}':
+                    {
+                        if(m_pos + 1 == m_length)
+                            return Token(END_OF_FILE);
+                        else if(peekAhead() == '}')
+                        {
+                            ++m_pos;
+                            result.append(QLatin1Char('}'));
+                        }
+                        else
+                        {
+                            /* This is a parse error, and the grammar won't be able
+                             * to reduce this CURLY_RBRACE. */
+                            return tokenAndChangeState(CURLY_RBRACE, Default);
+                        }
+                        break;
+                    }
+                    case '\n':
+                    {
+                        /* We want to translate \r\n into \n. */
+                        if(peekAhead(-1) == '\r')
+                            break;
+                        /* else, fallthrough. */
+                    }
+                    case '\r':
+                    {
+                        result.append(QLatin1Char('\n'));
+                        break;
+                    }
+                    default:
+                    {
+                        result.append(current());
+                        break;
+                    }
+                }
+                ++m_pos;
+            }
+            Q_ASSERT(false);
+        }
+        case ProcessingInstructionName:
+        {
+            const int start = m_pos;
+
+            while(true)
+            {
+                ++m_pos;
+                if(m_pos >= m_length)
+                    return Token(END_OF_FILE);
+
+                const QChar next(current());
+                if(next.isSpace() || next == QLatin1Char('?'))
+                {
+                    return tokenAndChangeState(PI_TARGET, m_data.mid(start, m_pos - start),
+                                               ProcessingInstructionContent);
+                }
+            }
+            Q_ASSERT(false);
+        }
+        case ProcessingInstructionContent:
+        {
+            /* Consume whitespace between the name and the content. */
+            if(consumeRawWhitespace())
+                return Token(END_OF_FILE);
+
+            const int start = m_pos;
+            const int len = scanUntil("?>");
+
+            if(len == -1)
+                return Token(END_OF_FILE);
+            else
+            {
+                m_pos += 2; /* Consume "?>" */
+                popState();
+                return Token(PI_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
+            }
+            Q_ASSERT(false);
+        }
+        case EndTag:
+        {
+            if(consumeRawWhitespace())
+                return END_OF_FILE;
+
+            if(peekCurrent() == '>')
+            {
+                popState();
+                return tokenAndAdvance(G_GT);
+            }
+            else
+                return tokenizeNCNameOrQName();
+            Q_ASSERT(false);
+        }
+        case XMLComment:
+        {
+            const int start = m_pos;
+            const int len = scanUntil("--");
+
+            if(len == -1)
+                return END_OF_FILE;
+            else
+            {
+                m_pos += 2; /* Consume "--". */
+                popState();
+
+                if(peekCurrent() == '>')
+                {
+                    ++m_pos;
+                    return Token(COMMENT_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
+                }
+                else
+                    return error();
+            }
+            Q_ASSERT(false);
+        }
+        case Pragma:
+        {
+            /* Consume whitespace. */
+            if(consumeRawWhitespace())
+                return Token(END_OF_FILE);
+
+            setState(PragmaContent);
+            return tokenizeNCNameOrQName();
+        }
+        case PragmaContent:
+        {
+            QString result;
+            result.reserve(20);
+
+            const bool hasWS = m_pos < m_length && current().isSpace();
+
+            /* Consume all whitespace up to the pragma content(if any). */
+            if(consumeRawWhitespace())
+                return Token(END_OF_FILE);
+
+            if(peekCurrent() == '#' && peekAhead() == ')')
+            {
+                /* We reached the end, and there's no pragma content. */
+                return tokenAndChangeState(PRAGMA_END, Default, 2);
+            }
+            else if(!hasWS)
+            {
+                /* A separating space is required if there's pragma content. */
+                return error(); /* i18n */
+            }
+
+            const int start = m_pos;
+            const int len = scanUntil("#)");
+            if(len == -1)
+                return Token(END_OF_FILE);
+
+            return Token(STRING_LITERAL, m_data.mid(start, len));
+            Q_ASSERT(false);
+        }
+    }
+
+    Q_ASSERT(false);
+    return error();
+}
+
+Tokenizer::Token XQueryTokenizer::attributeAsRaw(const QChar sep,
+                                                 int &sepStack,
+                                                 const int startPos,
+                                                 const bool aInLiteral,
+                                                 QString &result)
+{
+    bool inLiteral = aInLiteral;
+    const char otherSep = (sep == QLatin1Char('"') ? '\'' : '"');
+
+    while(true)
+    {
+        if(atEnd())
+            return END_OF_FILE;
+
+        if(peekCurrent() == sep.unicode())
+        {
+            if(inLiteral)
+                inLiteral = false;
+            else
+                inLiteral = true;
+
+            if(peekAhead() == sep.unicode())
+            {
+                /* The quoting mechanism was used. */
+                result.append(current());
+                m_pos += 2;
+                continue;
+            }
+            else
+            {
+                /* Don't consume the separator, such that we
+                 * return a token for it next time. */
+                if(m_pos == startPos)
+                {
+                    ++m_pos;
+                    setState(StartTag);
+                    return Token(sep == QLatin1Char('"') ? QUOTE : APOS);
+                }
+
+
+                if(sepStack == 0)
+                {
+                    return Token(STRING_LITERAL, result);
+                }
+                else
+                {
+                    result.append(current());
+                    ++m_pos;
+                    continue;
+                }
+            }
+        }
+        else if(peekCurrent() == '&')
+        {
+            const QString ret(tokenizeCharacterReference());
+            if(ret.isNull())
+                return Token(ERROR);
+            else
+            {
+                result.append(ret);
+                ++m_pos;
+                continue;
+            }
+        }
+        else if(peekCurrent() == otherSep)
+        {
+            result.append(current());
+            ++m_pos;
+
+            if(peekCurrent() == otherSep)
+                ++m_pos;
+
+            if(inLiteral)
+                inLiteral = false;
+            else
+                inLiteral = true;
+
+            continue;
+        }
+        else if(peekCurrent() == '{')
+        {
+            result.append(current());
+
+            if(peekAhead() == '{')
+            {
+                m_pos += 2;
+                continue;
+            }
+            else
+            {
+                ++m_pos;
+                ++sepStack;
+                const Token t(attributeAsRaw(sep, sepStack, startPos, false, result));
+                if(t.type != SUCCESS)
+                    return t;
+            }
+
+        }
+        else if(peekCurrent() == '}')
+        {
+            if(inLiteral && peekAhead() == '}')
+            {
+                result.append(current());
+                m_pos += 2;
+                continue;
+            }
+            else
+            {
+                ++m_pos;
+                --sepStack;
+                return Token(SUCCESS); /* The return value is arbitrary. */
+            }
+        }
+        else
+        {
+            result.append(current());
+            ++m_pos;
+        }
+    }
+}
+
+Tokenizer::Token XQueryTokenizer::nextToken(YYLTYPE *const sourceLocator)
+{
+    sourceLocator->first_line = m_line;
+    sourceLocator->first_column = m_pos - m_columnOffset + 1; /* Plus 1, since m_pos is 0-based. */
+
+    if(m_tokenStack.isEmpty())
+        return nextToken();
+    else
+    {
+        const Token retval(m_tokenStack.pop());
+
+        switch(retval.type)
+        {
+            case MODULE:
+            /* Fallthrough.*/
+            case SCHEMA:
+            /* Fallthrough.*/
+            case COPY_NAMESPACES:
+            {
+                setState(NamespaceKeyword);
+                break;
+            }
+            case VERSION:
+            {
+                setState(XQueryVersion);
+                break;
+            }
+            case AS:
+            /* Fallthrough. */
+            case OF:
+            {
+                setState(ItemType);
+                break;
+            }
+            default:
+            {
+                if(isOperatorKeyword(retval.type))
+                    setState(Default);
+
+                break;
+            }
+        };
+
+        return retval;
+    }
+}
+
+int XQueryTokenizer::commenceScanOnly()
+{
+    m_scanOnly = true;
+    return m_pos;
+}
+
+void XQueryTokenizer::resumeTokenizationFrom(const int pos)
+{
+    m_scanOnly = false;
+    m_pos = pos;
+}
+
+void XQueryTokenizer::setParserContext(const ParserContext::Ptr &)
+{
+}
+
+#undef handleWhitespace
+
+} // namespace QPatternist
+
+QT_END_NAMESPACE