src/xmlpatterns/parser/qxquerytokenizer.cpp
changeset 0 1918ee327afb
child 4 3b1da2848fc7
equal deleted inserted replaced
-1:000000000000 0:1918ee327afb
       
     1 /****************************************************************************
       
     2 **
       
     3 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
       
     4 ** All rights reserved.
       
     5 ** Contact: Nokia Corporation (qt-info@nokia.com)
       
     6 **
       
     7 ** This file is part of the QtXmlPatterns module of the Qt Toolkit.
       
     8 **
       
     9 ** $QT_BEGIN_LICENSE:LGPL$
       
    10 ** No Commercial Usage
       
    11 ** This file contains pre-release code and may not be distributed.
       
    12 ** You may use this file in accordance with the terms and conditions
       
    13 ** contained in the Technology Preview License Agreement accompanying
       
    14 ** this package.
       
    15 **
       
    16 ** GNU Lesser General Public License Usage
       
    17 ** Alternatively, this file may be used under the terms of the GNU Lesser
       
    18 ** General Public License version 2.1 as published by the Free Software
       
    19 ** Foundation and appearing in the file LICENSE.LGPL included in the
       
    20 ** packaging of this file.  Please review the following information to
       
    21 ** ensure the GNU Lesser General Public License version 2.1 requirements
       
    22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
       
    23 **
       
    24 ** In addition, as a special exception, Nokia gives you certain additional
       
    25 ** rights.  These rights are described in the Nokia Qt LGPL Exception
       
    26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
       
    27 **
       
    28 ** If you have questions regarding the use of this file, please contact
       
    29 ** Nokia at qt-info@nokia.com.
       
    30 **
       
    31 **
       
    32 **
       
    33 **
       
    34 **
       
    35 **
       
    36 **
       
    37 **
       
    38 ** $QT_END_LICENSE$
       
    39 **
       
    40 ****************************************************************************/
       
    41 
       
    42 #include <QByteArray>
       
    43 
       
    44 #include "qquerytransformparser_p.h"
       
    45 
       
    46 #include "qxquerytokenizer_p.h"
       
    47 
       
    48 #include "qtokenlookup.cpp"
       
    49 
       
    50 QT_BEGIN_NAMESPACE
       
    51 
       
    52 namespace QPatternist
       
    53 {
       
    54 
       
    55 #define handleWhitespace()                      \
       
    56 {                                               \
       
    57     const TokenType t = consumeWhitespace();    \
       
    58     if(t != SUCCESS)                            \
       
    59         return Token(t);                        \
       
    60 }
       
    61 
       
    62 XQueryTokenizer::XQueryTokenizer(const QString &query,
       
    63                                  const QUrl &location,
       
    64                                  const State startingState) : Tokenizer(location)
       
    65                                                             , m_data(query)
       
    66                                                             , m_length(query.length())
       
    67                                                             , m_state(startingState)
       
    68                                                             , m_pos(0)
       
    69                                                             , m_line(1)
       
    70                                                             , m_columnOffset(0)
       
    71                                                             , m_scanOnly(false)
       
    72 {
       
    73     Q_ASSERT(location.isValid() || location.isEmpty());
       
    74 }
       
    75 
       
    76 const QChar XQueryTokenizer::current() const
       
    77 {
       
    78     if(m_pos < m_length)
       
    79         return m_data.at(m_pos);
       
    80     else
       
    81         return QChar();
       
    82 }
       
    83 
       
    84 char XQueryTokenizer::peekCurrent() const
       
    85 {
       
    86     return current().toAscii();
       
    87 }
       
    88 
       
    89 int XQueryTokenizer::peekForColonColon() const
       
    90 {
       
    91     /* Note, we don't modify m_pos in this function, so we need to do offset
       
    92      * calculations. */
       
    93     int pos = m_pos;
       
    94 
       
    95     while(pos < m_length)
       
    96     {
       
    97         switch(m_data.at(pos).toAscii())
       
    98         {
       
    99             /* Fallthrough these four. */
       
   100             case ' ':
       
   101             case '\t':
       
   102             case '\n':
       
   103             case '\r':
       
   104                 break;
       
   105             case ':':
       
   106             {
       
   107                 if(peekAhead((pos - m_pos) + 1) == ':')
       
   108                     return pos - m_pos;
       
   109                 /* Fallthrough. */
       
   110             }
       
   111             default:
       
   112                 return -1;
       
   113         }
       
   114         ++pos;
       
   115     }
       
   116 
       
   117     return -1;
       
   118 }
       
   119 
       
   120 Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
       
   121                                                       const State s,
       
   122                                                       const int advance)
       
   123 {
       
   124     Q_ASSERT(advance >= 0);
       
   125     m_pos += advance;
       
   126     setState(s);
       
   127     return Token(code);
       
   128 }
       
   129 
       
   130 Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
       
   131                                                       const QString &value,
       
   132                                                       const State s)
       
   133 {
       
   134     setState(s);
       
   135     return Token(code, value);
       
   136 }
       
   137 
       
   138 Tokenizer::Token XQueryTokenizer::tokenAndAdvance(const TokenType code,
       
   139                                                   const int advance)
       
   140 {
       
   141     Q_ASSERT(advance >= 0);
       
   142     m_pos += advance;
       
   143     return Token(code);
       
   144 }
       
   145 
       
   146 QString XQueryTokenizer::normalizeEOL(const QString &input,
       
   147                                       const CharacterSkips &characterSkips)
       
   148 {
       
   149     const int len = input.count();
       
   150     QString result;
       
   151 
       
   152     /* The likely hood is rather high it'll be the same content. */
       
   153     result.reserve(len);
       
   154 
       
   155     for(int i = 0; i < len; ++i)
       
   156     {
       
   157         const QChar &at = input.at(i);
       
   158 
       
   159         if(characterSkips.contains(i))
       
   160         {
       
   161             result.append(at);
       
   162             continue;
       
   163         }
       
   164         switch(input.at(i).unicode())
       
   165         {
       
   166             case '\r':
       
   167             {
       
   168                 if(i + 1 < len && input.at(i + 1) == QLatin1Char('\n'))
       
   169                     ++i;
       
   170 
       
   171                 /* Else, fallthrough. */
       
   172             }
       
   173             case '\n':
       
   174             {
       
   175                 result.append(QLatin1Char('\n'));
       
   176                 continue;
       
   177             }
       
   178             default:
       
   179             {
       
   180                 result.append(at);
       
   181             }
       
   182         }
       
   183     }
       
   184 
       
   185     return result;
       
   186 }
       
   187 
       
   188 Tokenizer::TokenType XQueryTokenizer::consumeComment()
       
   189 {
       
   190     /* Below, we return ERROR instead of END_OF_FILE such that the parser
       
   191      * sees an invalid comment. */
       
   192     while(m_pos < m_length)
       
   193     {
       
   194         switch(peekCurrent())
       
   195         {
       
   196             case ':':
       
   197             {
       
   198                 ++m_pos; /* Consume ':' */
       
   199                 if(atEnd())
       
   200                     return ERROR;
       
   201 
       
   202                 if(peekCurrent() == ')')
       
   203                 {
       
   204                     ++m_pos; /* Consume ')' */
       
   205                     return SUCCESS; /* The comment closed nicely. */
       
   206                 }
       
   207                 continue; /* We don't want to increment m_pos twice. */
       
   208             }
       
   209             case '(':
       
   210             { /* It looks like the start of a comment. */
       
   211                 ++m_pos;
       
   212 
       
   213                 if(atEnd())
       
   214                     return END_OF_FILE;
       
   215                 else if(peekCurrent() == ':')
       
   216                 {
       
   217                     /* And it is a nested comment -- parse it. */
       
   218                     const TokenType retval = consumeComment();
       
   219                     if(retval == SUCCESS)
       
   220                         continue; /* Continue with our "own" comment. */
       
   221                     else
       
   222                         return retval; /* Return the error in the nested comment. */
       
   223                 }
       
   224                 break;
       
   225             }
       
   226             case '\n':
       
   227             /* Fallthrough. */
       
   228             case '\r':
       
   229             {
       
   230                 /* We want to count \r\n as a single line break. */
       
   231                 if(peekAhead() == '\n')
       
   232                     ++m_pos;
       
   233 
       
   234                 m_columnOffset = m_pos;
       
   235                 ++m_line;
       
   236 
       
   237                 break;
       
   238             }
       
   239         }
       
   240         ++m_pos;
       
   241     }
       
   242 
       
   243     return ERROR; /* Error: we reached the end while inside a comment. */
       
   244 }
       
   245 
       
   246 bool XQueryTokenizer::consumeRawWhitespace()
       
   247 {
       
   248     while(m_pos < m_length)
       
   249     {
       
   250         switch(peekCurrent())
       
   251         {
       
   252             case ' ':
       
   253             case '\t':
       
   254                 break;
       
   255             case '\n':
       
   256             case '\r':
       
   257             {
       
   258                 if(peekAhead() == '\n')
       
   259                     ++m_pos;
       
   260 
       
   261                 m_columnOffset = m_pos;
       
   262                 ++m_line;
       
   263 
       
   264                 break;
       
   265             }
       
   266             default:
       
   267                 return false;
       
   268         }
       
   269         ++m_pos;
       
   270     }
       
   271     return true;
       
   272 }
       
   273 
       
   274 Tokenizer::TokenType XQueryTokenizer::consumeWhitespace()
       
   275 {
       
   276     while(m_pos < m_length)
       
   277     {
       
   278         switch(peekCurrent())
       
   279         {
       
   280             case ' ':
       
   281             case '\t':
       
   282                 break;
       
   283             case '\n':
       
   284             case '\r':
       
   285             {
       
   286                 /* We want to count \r\n as a single line break. */
       
   287                 if(peekAhead() == '\n')
       
   288                     ++m_pos;
       
   289 
       
   290                 m_columnOffset = m_pos;
       
   291                 ++m_line;
       
   292 
       
   293                 break;
       
   294             }
       
   295             case '(':
       
   296             {
       
   297                 if(peekAhead() == ':')
       
   298                 {
       
   299                     m_pos += 2; /* Consume "(:" */
       
   300 
       
   301                     const TokenType comment = consumeComment();
       
   302                     if(comment == SUCCESS)
       
   303                         continue;
       
   304                     else
       
   305                         return comment;
       
   306                 }
       
   307             }
       
   308             default:
       
   309                 return SUCCESS;
       
   310         }
       
   311         ++m_pos;
       
   312     }
       
   313 
       
   314     return END_OF_FILE;
       
   315 }
       
   316 
       
   317 char XQueryTokenizer::peekAhead(const int length) const
       
   318 {
       
   319     if(m_pos + length < m_length)
       
   320         return m_data.at(m_pos + length).toAscii();
       
   321     else
       
   322         return 0;
       
   323 }
       
   324 
       
   325 Tokenizer::Token XQueryTokenizer::error()
       
   326 {
       
   327     return Token(ERROR);
       
   328 }
       
   329 
       
   330 bool XQueryTokenizer::isDigit(const char ch)
       
   331 {
       
   332     return ch >= '0' && ch <= '9';
       
   333 }
       
   334 
       
   335 /* Replace with function in QXmlUtils. Write test cases for this. */
       
   336 bool XQueryTokenizer::isNCNameStart(const QChar ch)
       
   337 {
       
   338     if(ch == QLatin1Char('_'))
       
   339         return true;
       
   340 
       
   341     switch(ch.category())
       
   342     {
       
   343         case QChar::Letter_Lowercase:
       
   344         case QChar::Letter_Uppercase:
       
   345         case QChar::Letter_Other:
       
   346         case QChar::Letter_Titlecase:
       
   347         case QChar::Number_Letter:
       
   348             return true;
       
   349         default:
       
   350             return false;
       
   351     }
       
   352 }
       
   353 
       
   354 bool XQueryTokenizer::isNCNameBody(const QChar ch)
       
   355 {
       
   356     switch(ch.unicode())
       
   357     {
       
   358         case '.':
       
   359         case '_':
       
   360         case '-':
       
   361             return true;
       
   362     }
       
   363 
       
   364     switch(ch.category())
       
   365     {
       
   366         case QChar::Letter_Lowercase:
       
   367         case QChar::Letter_Uppercase:
       
   368         case QChar::Letter_Other:
       
   369         case QChar::Letter_Titlecase:
       
   370         case QChar::Number_Letter:
       
   371         case QChar::Mark_SpacingCombining:
       
   372         case QChar::Mark_Enclosing:
       
   373         case QChar::Mark_NonSpacing:
       
   374         case QChar::Letter_Modifier:
       
   375         case QChar::Number_DecimalDigit:
       
   376             return true;
       
   377         default:
       
   378             return false;
       
   379     }
       
   380 }
       
   381 
       
   382 bool XQueryTokenizer::isPhraseKeyword(const TokenType code)
       
   383 {
       
   384     switch(code)
       
   385     {
       
   386         /* Fallthrough all these. */
       
   387         case CASTABLE:
       
   388         case CAST:
       
   389         case COPY_NAMESPACES:
       
   390         case DECLARE:
       
   391         case EMPTY:
       
   392         case MODULE:
       
   393         case IMPORT:
       
   394         case INSTANCE:
       
   395         case ORDER:
       
   396         case ORDERING:
       
   397         case XQUERY:
       
   398         case STABLE:
       
   399         case TREAT:
       
   400             return true;
       
   401         default:
       
   402             return false;
       
   403     }
       
   404 }
       
   405 
       
   406 bool XQueryTokenizer::isOperatorKeyword(const TokenType code)
       
   407 {
       
   408     switch(code)
       
   409     {
       
   410         /* Fallthrough all these. */
       
   411         case AS:
       
   412         case ASCENDING:
       
   413         case AT:
       
   414         case CASE:
       
   415         case CAST:
       
   416         case CASTABLE:
       
   417         case EQ:
       
   418         case EXTERNAL:
       
   419         case GE:
       
   420         case G_EQ:
       
   421         case G_GT:
       
   422         case G_LT:
       
   423         case G_NE:
       
   424         case GT:
       
   425         case IN:
       
   426         case INHERIT:
       
   427         case INSTANCE:
       
   428         case IS:
       
   429         case ITEM:
       
   430         case LE:
       
   431         case LT:
       
   432         case NE:
       
   433         case NO_INHERIT:
       
   434         case NO_PRESERVE:
       
   435         case OF:
       
   436         case PRESERVE:
       
   437         case RETURN:
       
   438         case STABLE:
       
   439         case TO:
       
   440         case TREAT:
       
   441             return true;
       
   442         default:
       
   443             return false;
       
   444     };
       
   445 }
       
   446 
       
   447 bool XQueryTokenizer::isTypeToken(const TokenType t)
       
   448 {
       
   449     switch(t)
       
   450     {
       
   451         /* Fallthrough all these. */
       
   452         case ATTRIBUTE:
       
   453         case COMMENT:
       
   454         case DOCUMENT:
       
   455         case DOCUMENT_NODE:
       
   456         case ELEMENT:
       
   457         case ITEM:
       
   458         case NODE:
       
   459         case PROCESSING_INSTRUCTION:
       
   460         case SCHEMA_ATTRIBUTE:
       
   461         case SCHEMA_ELEMENT:
       
   462         case TEXT:
       
   463             return true;
       
   464         default:
       
   465             return false;
       
   466     }
       
   467 }
       
   468 
       
   469 Tokenizer::Token XQueryTokenizer::tokenizeNCNameOrQName()
       
   470 {
       
   471     const int start = m_pos;
       
   472 
       
   473     const Token t1 = tokenizeNCName();
       
   474     if(t1.hasError())
       
   475         return t1;
       
   476 
       
   477     if(peekCurrent() != ':' || peekAhead() == '=')
       
   478         return t1;
       
   479 
       
   480     ++m_pos;
       
   481 
       
   482     const Token t2 = tokenizeNCName();
       
   483     if(t2.hasError())
       
   484         return t2;
       
   485     else
       
   486         return Token(QNAME, m_data.mid(start, m_pos - start));
       
   487 }
       
   488 
       
   489 Tokenizer::Token XQueryTokenizer::tokenizeNumberLiteral()
       
   490 {
       
   491     setState(Operator);
       
   492     const int startPos = m_pos;
       
   493     bool hasDot = false;
       
   494     bool isXPath20 = false;
       
   495 
       
   496     for(; m_pos < m_length; ++m_pos)
       
   497     {
       
   498         QChar ch(current());
       
   499 
       
   500         char cell = ch.cell();
       
   501 
       
   502         if(cell == 'e' || cell == 'E')
       
   503         {
       
   504             isXPath20 = true;
       
   505             ++m_pos;
       
   506             ch = current();
       
   507 
       
   508             if(ch.row() != 0)
       
   509                 break;
       
   510 
       
   511             cell = ch.cell();
       
   512 
       
   513             if(cell == '+' || cell == '-')
       
   514                 continue;
       
   515         }
       
   516 
       
   517         if(isNCNameStart(ch))
       
   518             return error();
       
   519 
       
   520         if(cell < '0' || cell > '9')
       
   521         {
       
   522             if(cell == '.' && !hasDot)
       
   523                 hasDot = true;
       
   524             else
       
   525                 break;
       
   526         }
       
   527     }
       
   528 
       
   529     return Token(isXPath20 ? XPATH2_NUMBER : NUMBER, m_data.mid(startPos, m_pos - startPos));
       
   530 }
       
   531 
       
   532 QString XQueryTokenizer::tokenizeCharacterReference()
       
   533 {
       
   534     Q_ASSERT(peekCurrent() == '&');
       
   535 
       
   536     const int theEnd = m_data.indexOf(QLatin1Char(';'), m_pos + 1);
       
   537 
       
   538     if(theEnd == -1) /* No ';' found, a syntax error. i18n. */
       
   539         return QString();
       
   540 
       
   541     QString content(m_data.mid(m_pos + 1, (theEnd - m_pos) - 1));
       
   542     m_pos = theEnd;
       
   543 
       
   544     const QChar charRef(charForReference(content));
       
   545 
       
   546     if(!charRef.isNull())
       
   547         return charRef;
       
   548     else if(content.startsWith(QLatin1Char('#')))
       
   549     {
       
   550         int base;
       
   551 
       
   552         /* It is only '#' or '#x'. */
       
   553         if(content.length() < 2)
       
   554             return QString();
       
   555 
       
   556         /* We got a hex number if it starts with 'x', otherwise it's a decimal. */
       
   557         if(content.at(1) == QLatin1Char('x'))
       
   558         {
       
   559             base = 16;
       
   560             content = content.mid(2); /* Remove "#x". */
       
   561         }
       
   562         else
       
   563         {
       
   564             base = 10;
       
   565             content = content.mid(1); /* Remove "#". */
       
   566         }
       
   567 
       
   568         bool conversionOK = false;
       
   569         const int codepoint = content.toInt(&conversionOK, base);
       
   570 
       
   571         if(conversionOK)
       
   572         {
       
   573             const QChar ch(codepoint);
       
   574 
       
   575             if(ch.isNull())
       
   576             {
       
   577                 /* We likely have something which require surrogate pairs. */
       
   578                 QString result;
       
   579                 result += QChar(QChar::highSurrogate(codepoint));
       
   580                 result += QChar(QChar::lowSurrogate(codepoint));
       
   581                 return result;
       
   582             }
       
   583             else
       
   584                 return ch;
       
   585         }
       
   586         else
       
   587             return QString();
       
   588     }
       
   589     else
       
   590         return QString();
       
   591 }
       
   592 
       
   593 int XQueryTokenizer::scanUntil(const char *const content)
       
   594 {
       
   595     const int end = m_data.indexOf(QString::fromLatin1(content), m_pos);
       
   596 
       
   597     if(end == -1)
       
   598         return -1;
       
   599     else
       
   600     {
       
   601         const int len = end - m_pos;
       
   602         m_pos += len;
       
   603         return len;
       
   604     }
       
   605 }
       
   606 
       
   607 QChar XQueryTokenizer::charForReference(const QString &reference)
       
   608 {
       
   609     if(m_charRefs.isEmpty())
       
   610     {
       
   611         /* Initialize. */
       
   612         m_charRefs.reserve(5);
       
   613         m_charRefs.insert(QLatin1String("lt"),     QLatin1Char('<'));
       
   614         m_charRefs.insert(QLatin1String("gt"),     QLatin1Char('>'));
       
   615         m_charRefs.insert(QLatin1String("amp"),    QLatin1Char('&'));
       
   616         m_charRefs.insert(QLatin1String("quot"),   QLatin1Char('"'));
       
   617         m_charRefs.insert(QLatin1String("apos"),   QLatin1Char('\''));
       
   618     }
       
   619 
       
   620     return m_charRefs.value(reference);
       
   621 }
       
   622 
       
   623 Tokenizer::Token XQueryTokenizer::tokenizeStringLiteral()
       
   624 {
       
   625     const QChar delimiter(current());
       
   626     /* We cannot unfortunately just scan and then do mid(),
       
   627      * since we can encounter character references. */
       
   628     QString result;
       
   629 
       
   630     /* This is more likely than QString's default allocation. */
       
   631     result.reserve(8);
       
   632 
       
   633     CharacterSkips skipEOLNormalization;
       
   634 
       
   635     /* Advance over the initial quote character. */
       
   636     ++m_pos;
       
   637 
       
   638     for(; m_pos < m_length; ++m_pos)
       
   639     {
       
   640         const QChar c(current());
       
   641 
       
   642         if(c == QLatin1Char('&'))
       
   643         {
       
   644             const QString charRef(tokenizeCharacterReference());
       
   645 
       
   646             if(charRef.isNull())
       
   647                 return error();
       
   648             else
       
   649             {
       
   650                 skipEOLNormalization.insert(result.count());
       
   651                 result.append(charRef);
       
   652             }
       
   653 
       
   654         }
       
   655         else if(c == delimiter)
       
   656         {
       
   657             /* Maybe the escaping mechanism is used. For instance, "s""s"
       
   658              * has the value `s"s'. */
       
   659             ++m_pos;
       
   660 
       
   661             if(current() == delimiter) /* Double quote. */
       
   662                 result += delimiter;
       
   663             else
       
   664                 return Token(STRING_LITERAL, normalizeEOL(result, skipEOLNormalization));
       
   665         }
       
   666         else
       
   667             result += c;
       
   668     }
       
   669 
       
   670     return error();
       
   671 }
       
   672 
       
   673 Tokenizer::Token XQueryTokenizer::tokenizeNCName()
       
   674 {
       
   675     const int startPos = m_pos;
       
   676 
       
   677     if(m_pos < m_length && isNCNameStart(current()))
       
   678     {
       
   679         ++m_pos;
       
   680 
       
   681         for(; m_pos < m_length; ++m_pos)
       
   682         {
       
   683             if(!isNCNameBody(current()))
       
   684                 break;
       
   685         }
       
   686 
       
   687         return Token(NCNAME, m_data.mid(startPos, m_pos - startPos));
       
   688     }
       
   689     else
       
   690         return error();
       
   691 }
       
   692 
       
   693 bool XQueryTokenizer::aheadEquals(const char *const chs,
       
   694                                   const int len,
       
   695                                   const int offset) const
       
   696 {
       
   697     Q_ASSERT(len > 0);
       
   698     Q_ASSERT(qstrlen(chs) == uint(len));
       
   699 
       
   700     if(m_pos + len >= m_length)
       
   701         return false;
       
   702 
       
   703     for(int i = offset; i < (len + offset); ++i)
       
   704     {
       
   705         if(m_data.at(m_pos + i).toAscii() != chs[i - offset])
       
   706             return false;
       
   707     }
       
   708 
       
   709     return true;
       
   710 }
       
   711 
       
   712 const TokenMap *XQueryTokenizer::lookupKeyword(const QString &keyword)
       
   713 {
       
   714     return TokenLookup::value(keyword.toAscii().constData(), keyword.length());
       
   715 }
       
   716 
       
   717 XQueryTokenizer::State XQueryTokenizer::state() const
       
   718 {
       
   719     return m_state;
       
   720 }
       
   721 
       
   722 void XQueryTokenizer::setState(const State s)
       
   723 {
       
   724     m_state = s;
       
   725 }
       
   726 
       
   727 void XQueryTokenizer::pushState(const State s)
       
   728 {
       
   729     m_stateStack.push(s);
       
   730 }
       
   731 
       
   732 void XQueryTokenizer::pushState()
       
   733 {
       
   734     m_stateStack.push(m_state);
       
   735 }
       
   736 
       
   737 void XQueryTokenizer::popState()
       
   738 {
       
   739     /* QStack::pop() asserts if it's empty, so we need to check
       
   740      * it, since we might receive unbalanced curlies. */
       
   741     if(!m_stateStack.isEmpty())
       
   742         m_state = m_stateStack.pop();
       
   743 }
       
   744 
       
   745 Tokenizer::Token XQueryTokenizer::nextToken()
       
   746 {
       
   747     switch(state())
       
   748     {
       
   749         /* We want to skip or do special whitespace handling for these
       
   750          * states. So fallthrough all of the following. */
       
   751         case AposAttributeContent:
       
   752         case Axis:
       
   753         case ElementContent:
       
   754         case EndTag:
       
   755         case Pragma:
       
   756         case PragmaContent:
       
   757         case ProcessingInstructionName:
       
   758         case QuotAttributeContent:
       
   759         case StartTag:
       
   760         case XMLComment:
       
   761             break;
       
   762         default:
       
   763             handleWhitespace();
       
   764     }
       
   765 
       
   766     switch(state())
       
   767     {
       
   768         case XMLSpaceDecl:
       
   769         /* Fallthrough. */
       
   770         case NamespaceKeyword:
       
   771         {
       
   772             switch(peekCurrent())
       
   773             {
       
   774                 case ',':
       
   775                     return tokenAndAdvance(COMMA);
       
   776                 case '"':
       
   777                 /* Fallthrough. */
       
   778                 case '\'':
       
   779                 {
       
   780                     setState(NamespaceDecl);
       
   781                     return tokenizeStringLiteral();
       
   782                 }
       
   783             }
       
   784 
       
   785             const Token id(tokenizeNCName());
       
   786 
       
   787             if(id.type != NCNAME)
       
   788                 return id;
       
   789 
       
   790             const TokenMap *const keyword = lookupKeyword(id.value);
       
   791             if(keyword)
       
   792             {
       
   793                 switch(keyword->token)
       
   794                 {
       
   795                     case INHERIT:
       
   796                     /* Fallthrough. */
       
   797                     case NO_INHERIT:
       
   798                     {
       
   799                         setState(Default);
       
   800                         break;
       
   801                     }
       
   802                     case NAMESPACE:
       
   803                     {
       
   804                         setState(NamespaceDecl);
       
   805                         break;
       
   806                     }
       
   807                     case ORDERED:
       
   808                     /* Fallthrough. */
       
   809                     case UNORDERED:
       
   810                     /* Fallthrough. */
       
   811                     case STRIP:
       
   812                     {
       
   813                         setState(Default);
       
   814                         break;
       
   815                     }
       
   816                     case PRESERVE:
       
   817                     {
       
   818                         if(state() != NamespaceKeyword)
       
   819                             setState(Default);
       
   820                     }
       
   821                     default:
       
   822                         break;
       
   823                 }
       
   824 
       
   825                 return Token(keyword->token);
       
   826             }
       
   827             else
       
   828                 return id;
       
   829 
       
   830             Q_ASSERT(false);
       
   831         }
       
   832         case NamespaceDecl:
       
   833         {
       
   834             switch(peekCurrent())
       
   835             {
       
   836                 case '=':
       
   837                     return tokenAndAdvance(G_EQ);
       
   838                 case ';':
       
   839                     return tokenAndChangeState(SEMI_COLON, Default);
       
   840                 case '\'':
       
   841                 /* Fallthrough. */
       
   842                 case '\"':
       
   843                     return tokenizeStringLiteral();
       
   844             }
       
   845 
       
   846             const Token nc(tokenizeNCName());
       
   847 
       
   848             handleWhitespace();
       
   849 
       
   850             const char pc = peekCurrent();
       
   851             const TokenMap* const t = lookupKeyword(nc.value);
       
   852 
       
   853             if(pc == '\'' || (pc == '"' && t))
       
   854                 return tokenAndChangeState(t->token, Default, 0);
       
   855             else
       
   856                 return nc;
       
   857 
       
   858             Q_ASSERT(false);
       
   859         }
       
   860         case Axis:
       
   861         {
       
   862             if(peekCurrent() == ':')
       
   863             {
       
   864                 Q_ASSERT(peekAhead() == ':');
       
   865                 m_pos += 2;
       
   866                 setState(AfterAxisSeparator);
       
   867                 return Token(COLONCOLON);
       
   868             }
       
   869             /* Fallthrough. */
       
   870         }
       
   871         case AfterAxisSeparator:
       
   872         /* Fallthrough. */
       
   873         case Default:
       
   874            /* State Operator and state Default have a lot of tokens in common except
       
   875             * for minor differences. So we treat them the same way, and sprinkle logic
       
   876             * here and there to handle the small differences. */
       
   877         /* Fallthrough. */
       
   878         case Operator:
       
   879         {
       
   880             switch(peekCurrent())
       
   881             {
       
   882                 case '=':
       
   883                     return tokenAndChangeState(G_EQ, Default);
       
   884                 case '-':
       
   885                     return tokenAndChangeState(MINUS, Default);
       
   886                 case '+':
       
   887                     return tokenAndChangeState(PLUS, Default);
       
   888                 case '[':
       
   889                     return tokenAndChangeState(LBRACKET, Default);
       
   890                 case ']':
       
   891                     return tokenAndChangeState(RBRACKET, Operator);
       
   892                 case ',':
       
   893                     return tokenAndChangeState(COMMA, Default);
       
   894                 case ';':
       
   895                     return tokenAndChangeState(SEMI_COLON, Default);
       
   896                 case '$':
       
   897                     return tokenAndChangeState(DOLLAR, VarName);
       
   898                 case '|':
       
   899                     return tokenAndChangeState(BAR, Default);
       
   900                 case '?':
       
   901                     return tokenAndChangeState(QUESTION, Operator);
       
   902                 case ')':
       
   903                     return tokenAndChangeState(RPAREN, Operator);
       
   904                 case '@':
       
   905                     return tokenAndChangeState(AT_SIGN, Default);
       
   906                 /* Fallthrough all these. */
       
   907                 case '1':
       
   908                 case '2':
       
   909                 case '3':
       
   910                 case '4':
       
   911                 case '5':
       
   912                 case '6':
       
   913                 case '7':
       
   914                 case '8':
       
   915                 case '9':
       
   916                 case '0':
       
   917                     return tokenizeNumberLiteral();
       
   918                 case '.':
       
   919                 {
       
   920                     const char next = peekAhead();
       
   921                     if(next == '.')
       
   922                         return tokenAndChangeState(DOTDOT, Operator, 2);
       
   923                     /* .5 is allowed, as short form for 0.5:
       
   924                      * <tt>[142]     DecimalLiteral     ::=     ("." Digits) | (Digits "." [0-9]*)</tt>
       
   925                      */
       
   926                     else if(isDigit(next))
       
   927                         return tokenizeNumberLiteral();
       
   928                     else
       
   929                         return tokenAndChangeState(DOT, Operator);
       
   930                 }
       
   931                 case '\'':
       
   932                 /* Fallthrough. */
       
   933                 case '"':
       
   934                 {
       
   935                     setState(Operator);
       
   936                     return tokenizeStringLiteral();
       
   937 
       
   938                 }
       
   939                 case '(':
       
   940                 {
       
   941                     if(peekAhead() == '#')
       
   942                         return tokenAndChangeState(PRAGMA_START, Pragma, 2);
       
   943                     else
       
   944                         return tokenAndChangeState(LPAREN, Default);
       
   945                 }
       
   946                 case '*':
       
   947                 {
       
   948                     if(peekAhead() == ':')
       
   949                     {
       
   950                         m_pos += 2; /* Consume *:. */
       
   951                         const Token nc = tokenizeNCName();
       
   952 
       
   953                         if(nc.hasError())
       
   954                             return error();
       
   955                         else
       
   956                             return tokenAndChangeState(ANY_PREFIX, nc.value, Operator);
       
   957                     }
       
   958                     else
       
   959                         return tokenAndChangeState(STAR, state() == Default ? Operator : Default);
       
   960                 }
       
   961                 case ':':
       
   962                 {
       
   963                     switch(peekAhead())
       
   964                     {
       
   965                         case '=':
       
   966                             return tokenAndChangeState(ASSIGN, Default, 2);
       
   967                         case ':':
       
   968                             return tokenAndChangeState(COLONCOLON, Default, 2);
       
   969                         default:
       
   970                             return error();
       
   971                     }
       
   972                 }
       
   973                 case '!':
       
   974                 {
       
   975                     if(peekAhead() == '=')
       
   976                         return tokenAndChangeState(G_NE, Default, 2);
       
   977                     else
       
   978                         return error();
       
   979                 }
       
   980                 case '<':
       
   981                 {
       
   982                     switch(peekAhead())
       
   983                     {
       
   984                         case '=':
       
   985                             return tokenAndChangeState(G_LE, Default, 2);
       
   986                         case '<':
       
   987                             return tokenAndChangeState(PRECEDES, Default, 2);
       
   988                         case '?':
       
   989                         {
       
   990                             pushState(Operator);
       
   991                             return tokenAndChangeState(PI_START, ProcessingInstructionName, 2);
       
   992                         }
       
   993                         case '!':
       
   994                         {
       
   995                             if(aheadEquals("!--", 3))
       
   996                             {
       
   997                                 m_pos += 3; /* Consume "!--". */
       
   998                                 pushState(Operator);
       
   999                                 return tokenAndChangeState(COMMENT_START, XMLComment);
       
  1000                             }
       
  1001                             /* Fallthrough. It's a syntax error, and this is a good way to report it. */
       
  1002                         }
       
  1003                         default:
       
  1004                         {
       
  1005                             if((m_pos + 1) < m_length && isNCNameStart(m_data.at(m_pos + 1)))
       
  1006                             {
       
  1007                                 /* We assume it's an element constructor. */
       
  1008                                 pushState(Operator);
       
  1009                             }
       
  1010 
       
  1011                             return tokenAndChangeState(G_LT, state() == Operator ? Default : StartTag);
       
  1012                         }
       
  1013                     }
       
  1014                 }
       
  1015                 case '>':
       
  1016                 {
       
  1017                     switch(peekAhead())
       
  1018                     {
       
  1019                         case '=':
       
  1020                             return tokenAndChangeState(G_GE, Default, 2);
       
  1021                         case '>':
       
  1022                             return tokenAndChangeState(FOLLOWS, Default, 2);
       
  1023                         default:
       
  1024                             return tokenAndChangeState(G_GT, Default);
       
  1025                     }
       
  1026                 }
       
  1027                 case '/':
       
  1028                 {
       
  1029                     if(peekAhead() == '/')
       
  1030                         return tokenAndChangeState(SLASHSLASH, Default, 2);
       
  1031                     else
       
  1032                         return tokenAndChangeState(SLASH, Default);
       
  1033                 }
       
  1034                 case '{':
       
  1035                 {
       
  1036                     pushState(Operator);
       
  1037                     return tokenAndChangeState(CURLY_LBRACE, Default);
       
  1038                 }
       
  1039                 case '}':
       
  1040                 {
       
  1041                     popState();
       
  1042 
       
  1043                     return tokenAndAdvance(CURLY_RBRACE);
       
  1044                 }
       
  1045             }
       
  1046 
       
  1047             /* Ok. We're in state Default or Operator, and it wasn't a simple
       
  1048              * character. */
       
  1049 
       
  1050             const Token id(tokenizeNCName());
       
  1051 
       
  1052             if(id.type != NCNAME)
       
  1053                 return id;
       
  1054 
       
  1055             const TokenMap *const keyword = lookupKeyword(id.value);
       
  1056 
       
  1057             if(state() == Operator)
       
  1058             {
       
  1059                 if(keyword)
       
  1060                 {
       
  1061                     if(keyword->token == DEFAULT || keyword->token == ASCENDING || keyword->token == DESCENDING)
       
  1062                         setState(Operator);
       
  1063                     else if(keyword->token == RETURN)
       
  1064                         setState(Default);
       
  1065                     else if(isPhraseKeyword(keyword->token))
       
  1066                     {
       
  1067                         const TokenType ws = consumeWhitespace();
       
  1068                         if(ws == ERROR)
       
  1069                             return error();
       
  1070 
       
  1071                         const Token id2(tokenizeNCName());
       
  1072                         const TokenMap *const keyword2 = lookupKeyword(id2.value);
       
  1073 
       
  1074                         if(keyword2)
       
  1075                         {
       
  1076                             if(keyword->token == TREAT && keyword2->token == AS)
       
  1077                                 setState(ItemType);
       
  1078                             else if (keyword->token == CAST || (keyword->token == CASTABLE && keyword2->token == AS) || keyword2->token == BY)
       
  1079                                 setState(Default);
       
  1080 
       
  1081                             m_tokenStack.push(Token(keyword2->token));
       
  1082                         }
       
  1083                         else
       
  1084                             m_tokenStack.push(id2);
       
  1085 
       
  1086                         return Token(keyword->token);
       
  1087                     }
       
  1088                     else
       
  1089                     {
       
  1090                         /* Such that we tokenize the second token in "empty greatest". */
       
  1091                         if(keyword->token != EMPTY)
       
  1092                             setState(Default);
       
  1093                     }
       
  1094 
       
  1095                     if(keyword->token == AS || keyword->token == CASE)
       
  1096                         setState(ItemType);
       
  1097 
       
  1098                     return Token(keyword->token);
       
  1099                 }
       
  1100                 else
       
  1101                     return id;
       
  1102             }
       
  1103 
       
  1104             Q_ASSERT(state() == Default || state() == Axis || state() == AfterAxisSeparator);
       
  1105 
       
  1106             /*
       
  1107              * This is hard. Consider this:
       
  1108              *
       
  1109              * Valid:           child       ::nameTest
       
  1110              * Valid:           child::     nameTest
       
  1111              * Syntax Error:    child       :localName
       
  1112              * Syntax Error:    child:      localName
       
  1113              *
       
  1114              * Consider "child ::name". Right now, we're here:
       
  1115              *                ^
       
  1116              * We don't know whether "child" is a prefix and hence the whitespace is invalid,
       
  1117              * or whether it's an axis and hence skippable. */
       
  1118             {
       
  1119                 const int wsLength = peekForColonColon();
       
  1120                 /* We cannot call handleWhitespace() because it returns on
       
  1121                  * END_OF_FILE, and we have already parsed the keyword, and we need to
       
  1122                  * deal with that.
       
  1123                  *
       
  1124                  * If we have a colon colon, which means the whitespace is
       
  1125                  * allowed, we skip it. */
       
  1126                 if(wsLength != -1)
       
  1127                     m_pos += wsLength;
       
  1128             }
       
  1129 
       
  1130             /* Handle name tests. */
       
  1131             if(peekCurrent() == ':')
       
  1132             {
       
  1133                 switch(peekAhead())
       
  1134                 {
       
  1135                     case '=':
       
  1136                         return id;
       
  1137                     case '*':
       
  1138                     {
       
  1139                         m_pos += 2;
       
  1140                         return tokenAndChangeState(ANY_LOCAL_NAME, id.value, Operator);
       
  1141                     }
       
  1142                     case ':':
       
  1143                     {
       
  1144                         /* We have an axis. */
       
  1145                         setState(Axis);
       
  1146                         return keyword ? Token(keyword->token) : id;
       
  1147                     }
       
  1148                     default:
       
  1149                     {
       
  1150                         /* It's a QName. */
       
  1151                         ++m_pos; /* Consume the colon. */
       
  1152 
       
  1153                         const Token id2(tokenizeNCName());
       
  1154 
       
  1155                         if(id2.type != NCNAME)
       
  1156                         {
       
  1157                             --m_pos;
       
  1158                             return id;
       
  1159                         }
       
  1160 
       
  1161                         setState(Operator);
       
  1162                         const int qNameLen = id.value.length() + id2.value.length() + 1;
       
  1163                         return Token(QNAME, m_data.mid(m_pos - qNameLen, qNameLen));
       
  1164                     }
       
  1165                 }
       
  1166             }
       
  1167 
       
  1168             if(!keyword || isOperatorKeyword(keyword->token))
       
  1169             {
       
  1170                 setState(Operator);
       
  1171                 return id;
       
  1172             }
       
  1173 
       
  1174             const TokenType ws = consumeWhitespace();
       
  1175             if(ws == ERROR) // TODO this should test for success. Write test.
       
  1176                 return Token(ERROR);
       
  1177 
       
  1178             if(atEnd())
       
  1179             {
       
  1180                 setState(Operator);
       
  1181                 return id;
       
  1182             }
       
  1183 
       
  1184             /* Let the if-body apply for constructors, and node type tests. */
       
  1185             if(isTypeToken(keyword->token) ||
       
  1186                keyword->token == TYPESWITCH ||
       
  1187                keyword->token == ORDERED ||
       
  1188                keyword->token == UNORDERED ||
       
  1189                keyword->token == IF)
       
  1190             {
       
  1191                 switch(peekCurrent())
       
  1192                 {
       
  1193                     case '(':
       
  1194                     {
       
  1195                         // TODO See if we can remove DOCUMENT from isTypeToken.
       
  1196                         if(isTypeToken(keyword->token) && keyword->token != DOCUMENT)
       
  1197                         {
       
  1198                             m_tokenStack.push(Token(LPAREN));
       
  1199                             ++m_pos; /* Consume '('. */
       
  1200                             pushState(Operator);
       
  1201 
       
  1202                             if(keyword->token == PROCESSING_INSTRUCTION)
       
  1203                                 setState(KindTestForPI);
       
  1204                             else
       
  1205                                 setState(KindTest);
       
  1206 
       
  1207                             return Token(keyword->token);
       
  1208                         }
       
  1209                         else if(keyword->token == TYPESWITCH || keyword->token == IF)
       
  1210                             return Token(keyword->token);
       
  1211                         else /* It's a function call. */
       
  1212                             return id;
       
  1213                     }
       
  1214                     case '{':
       
  1215                     {
       
  1216                         m_tokenStack.push(Token(CURLY_LBRACE));
       
  1217                         ++m_pos; /* Consume '{'. */
       
  1218                         pushState(Operator);
       
  1219                         /* Stay in state Default. */
       
  1220                         return Token(keyword->token);
       
  1221                     }
       
  1222                     default:
       
  1223                     {
       
  1224                         /* We have read in a token which is for instance
       
  1225                          * "return", and now it can be an element
       
  1226                          * test("element") a node kind test("element()"), or a
       
  1227                          * computed element constructor("element name {...").
       
  1228                          * We need to do a two-token lookahead here, because
       
  1229                          * "element return" can be an element test followed by
       
  1230                          * the return keyword, but it can also be an element
       
  1231                          * constructor("element return {"). */
       
  1232                         if(isNCNameStart(current()))
       
  1233                         {
       
  1234                             const int currentPos = m_pos;
       
  1235                             const Token token2 = tokenizeNCNameOrQName();
       
  1236 
       
  1237                             if(token2.hasError())
       
  1238                                 return token2;
       
  1239 
       
  1240                             handleWhitespace();
       
  1241 
       
  1242                             if(peekCurrent() == '{')
       
  1243                             {
       
  1244                                 /* An element constructor. */
       
  1245                                 m_tokenStack.push(token2);
       
  1246                                 return Token(keyword->token);
       
  1247                             }
       
  1248 
       
  1249                             /* We jump back in the stream, we need to tokenize token2 according
       
  1250                              * to the state. */
       
  1251                             m_pos = currentPos;
       
  1252                             setState(Operator);
       
  1253                             return Token(NCNAME, QLatin1String(keyword->name));
       
  1254                         }
       
  1255                     }
       
  1256                 }
       
  1257             }
       
  1258 
       
  1259             if(peekCurrent() == '$')
       
  1260             {
       
  1261                 setState(VarName);
       
  1262                 return Token(keyword->token);
       
  1263             }
       
  1264 
       
  1265             /* It's not a node type, it's not the typeswitch expression, but it is a function callsite. */
       
  1266             if(peekCurrent() == '(')
       
  1267                 return id;
       
  1268             else if(peekCurrent() == '{' && keyword->token == VALIDATE)
       
  1269                 return Token(keyword->token);
       
  1270 
       
  1271             if(!isNCNameStart(current()))
       
  1272             {
       
  1273                 setState(Operator);
       
  1274                 return id;
       
  1275             }
       
  1276 
       
  1277             const Token id2(tokenizeNCName());
       
  1278             const TokenMap *const keyword2 = lookupKeyword(id2.value);
       
  1279 
       
  1280             if(!keyword2)
       
  1281             {
       
  1282                 /* It's a syntax error. All cases of two subsequent ncnames are keywords(e.g, declarations). */
       
  1283                 setState(Operator);
       
  1284                 return id;
       
  1285             }
       
  1286 
       
  1287             switch(keyword->token)
       
  1288             {
       
  1289                 case DECLARE:
       
  1290                 {
       
  1291                     switch(keyword2->token)
       
  1292                     {
       
  1293                         case VARIABLE:
       
  1294                         /* Fallthrough. */
       
  1295                         case FUNCTION:
       
  1296                         {
       
  1297                             m_tokenStack.push(Token(keyword2->token));
       
  1298                             setState(Default);
       
  1299                             return Token(keyword->token);
       
  1300                         }
       
  1301                         case OPTION:
       
  1302                         {
       
  1303                             m_tokenStack.push(Token(keyword2->token));
       
  1304                             setState(Default);
       
  1305                             return Token(keyword->token);
       
  1306                         }
       
  1307                         case COPY_NAMESPACES:
       
  1308                         /* Fallthrough. */
       
  1309                         case ORDERING:
       
  1310                         {
       
  1311                             m_tokenStack.push(Token(keyword2->token));
       
  1312                             setState(NamespaceKeyword);
       
  1313                             return Token(keyword->token);
       
  1314                         }
       
  1315                         case CONSTRUCTION:
       
  1316                         {
       
  1317                             // TODO identical to CONSTRUCTION?
       
  1318                             m_tokenStack.push(Token(keyword2->token));
       
  1319                             setState(Operator);
       
  1320                             return Token(keyword->token);
       
  1321                         }
       
  1322                         case NAMESPACE:
       
  1323                         /* Fallthrough. */
       
  1324                         case BASEURI:
       
  1325                         {
       
  1326                             m_tokenStack.push(Token(keyword2->token));
       
  1327                             setState(NamespaceDecl);
       
  1328                             return Token(keyword->token);
       
  1329                         }
       
  1330                         case BOUNDARY_SPACE:
       
  1331                         {
       
  1332                             m_tokenStack.push(Token(keyword2->token));
       
  1333                             setState(XMLSpaceDecl);
       
  1334                             return Token(keyword->token);
       
  1335                         }
       
  1336                         case DEFAULT:
       
  1337                         {
       
  1338                             m_tokenStack.push(Token(keyword2->token));
       
  1339 
       
  1340                             const TokenType ws2 = consumeWhitespace();
       
  1341                             if(ws2 != SUCCESS)
       
  1342                             {
       
  1343                                 m_tokenStack.prepend(Token(ws2));
       
  1344                                 return Token(keyword->token);
       
  1345                             }
       
  1346 
       
  1347                             const Token id3(tokenizeNCName());
       
  1348 
       
  1349                             if(id3.type != NCNAME)
       
  1350                             {
       
  1351                                 m_tokenStack.prepend(id3);
       
  1352                                 return Token(keyword->token);
       
  1353                             }
       
  1354 
       
  1355                             const TokenMap *const keyword3 = lookupKeyword(id3.value);
       
  1356                             if(!keyword3)
       
  1357                             {
       
  1358                                 m_tokenStack.prepend(id3);
       
  1359                                 return Token(keyword->token);
       
  1360                             }
       
  1361                             else
       
  1362                             {
       
  1363                                 m_tokenStack.prepend(Token(keyword3->token));
       
  1364 
       
  1365                                 if(keyword3->token == ORDER)
       
  1366                                     setState(Operator);
       
  1367                                 else
       
  1368                                     setState(NamespaceDecl);
       
  1369                             }
       
  1370 
       
  1371                             return Token(keyword->token);
       
  1372                         }
       
  1373                         default:
       
  1374                         {
       
  1375                             m_tokenStack.push(Token(keyword2->token));
       
  1376                             setState(Default);
       
  1377                             return id;
       
  1378                         }
       
  1379                     }
       
  1380                 }
       
  1381                 case XQUERY:
       
  1382                 {
       
  1383                     m_tokenStack.push(Token(keyword2->token));
       
  1384 
       
  1385                     if(keyword2->token == VERSION)
       
  1386                     {
       
  1387                         setState(NamespaceDecl);
       
  1388                         return Token(keyword->token);
       
  1389                     }
       
  1390                     else
       
  1391                     {
       
  1392                         setState(Operator);
       
  1393                         return id;
       
  1394                     }
       
  1395                 }
       
  1396                 case IMPORT:
       
  1397                 {
       
  1398                     m_tokenStack.push(Token(keyword2->token));
       
  1399 
       
  1400                     switch(keyword2->token)
       
  1401                     {
       
  1402                         case SCHEMA:
       
  1403                         /* Fallthrough. */
       
  1404                         case MODULE:
       
  1405                         {
       
  1406                             setState(NamespaceKeyword);
       
  1407                             return Token(keyword->token);
       
  1408                         }
       
  1409                         default:
       
  1410                         {
       
  1411                             setState(Operator);
       
  1412                             return id;
       
  1413                         }
       
  1414                     }
       
  1415                 }
       
  1416                 case VALIDATE:
       
  1417                 {
       
  1418                     m_tokenStack.push(Token(keyword2->token));
       
  1419 
       
  1420                     switch(keyword2->token)
       
  1421                     {
       
  1422                         case LAX:
       
  1423                         case STRICT:
       
  1424                         {
       
  1425                             pushState(Operator);
       
  1426                             return Token(keyword->token);
       
  1427                         }
       
  1428                         default:
       
  1429                         {
       
  1430                             setState(Operator);
       
  1431                             return id;
       
  1432                         }
       
  1433                     }
       
  1434                 }
       
  1435                 default:
       
  1436                 {
       
  1437                     m_tokenStack.push(Token(keyword2->token));
       
  1438                     setState(Operator);
       
  1439                     return id;
       
  1440                 }
       
  1441             }
       
  1442 
       
  1443             Q_ASSERT(false);
       
  1444 
       
  1445         }
       
  1446         case VarName:
       
  1447         {
       
  1448             if(peekCurrent() == '$')
       
  1449                 return tokenAndAdvance(DOLLAR);
       
  1450 
       
  1451             setState(Operator);
       
  1452             return tokenizeNCNameOrQName();
       
  1453             Q_ASSERT(false);
       
  1454         }
       
  1455         case ItemType:
       
  1456         {
       
  1457             switch(peekCurrent())
       
  1458             {
       
  1459                 case '(':
       
  1460                     return tokenAndChangeState(LPAREN, KindTest);
       
  1461                 case '$':
       
  1462                     return tokenAndChangeState(DOLLAR, VarName);
       
  1463             }
       
  1464 
       
  1465             const Token name(tokenizeNCNameOrQName());
       
  1466 
       
  1467             if(name.hasError())
       
  1468                 return error();
       
  1469 
       
  1470             else if(name.type == QNAME)
       
  1471             {
       
  1472                 setState(OccurrenceIndicator);
       
  1473                 return name;
       
  1474             }
       
  1475             else
       
  1476             {
       
  1477                 const TokenMap *const keyword = lookupKeyword(name.value);
       
  1478 
       
  1479                 if(keyword)
       
  1480                 {
       
  1481                     pushState(OccurrenceIndicator);
       
  1482                     return Token(keyword->token);
       
  1483                 }
       
  1484                 else
       
  1485                 {
       
  1486                     setState(Default);
       
  1487                     return name;
       
  1488                 }
       
  1489             }
       
  1490             Q_ASSERT(false);
       
  1491         }
       
  1492         case KindTest:
       
  1493         {
       
  1494             switch(peekCurrent())
       
  1495             {
       
  1496                 case ')':
       
  1497                 {
       
  1498                     popState();
       
  1499                     return tokenAndAdvance(RPAREN);
       
  1500                 }
       
  1501                 case '(':
       
  1502                     return tokenAndAdvance(LPAREN);
       
  1503                 case ',':
       
  1504                     return tokenAndAdvance(COMMA);
       
  1505                 case '*':
       
  1506                     return tokenAndAdvance(STAR);
       
  1507                 case '?':
       
  1508                     return tokenAndAdvance(QUESTION);
       
  1509                 case '\'':
       
  1510                 /* Fallthrough. */
       
  1511                 case '"':
       
  1512                     return tokenizeStringLiteral();
       
  1513             }
       
  1514 
       
  1515             const Token nc(tokenizeNCNameOrQName());
       
  1516             if(nc.hasError())
       
  1517                 return nc;
       
  1518 
       
  1519             const TokenType ws = consumeWhitespace();
       
  1520             if(ws == ERROR)
       
  1521                 return error();
       
  1522 
       
  1523             if(peekCurrent() == '(')
       
  1524             {
       
  1525                 const TokenMap *const keyword = lookupKeyword(nc.value);
       
  1526                 if(keyword)
       
  1527                 {
       
  1528                     pushState(KindTest);
       
  1529                     return Token(keyword->token);
       
  1530                 }
       
  1531                 else
       
  1532                     return nc;
       
  1533             }
       
  1534             else
       
  1535                 return nc;
       
  1536             Q_ASSERT(false);
       
  1537         }
       
  1538         case KindTestForPI:
       
  1539         {
       
  1540             switch(peekCurrent())
       
  1541             {
       
  1542                 case ')':
       
  1543                 {
       
  1544                     popState();
       
  1545                     return tokenAndAdvance(RPAREN);
       
  1546                 }
       
  1547                 case '\'':
       
  1548                 /* Fallthrough. */
       
  1549                 case '"':
       
  1550                     return tokenizeStringLiteral();
       
  1551                 default:
       
  1552                     return tokenizeNCName();
       
  1553             }
       
  1554             Q_ASSERT(false);
       
  1555         }
       
  1556         case OccurrenceIndicator:
       
  1557         {
       
  1558             switch(peekCurrent())
       
  1559             {
       
  1560                 case '?':
       
  1561                     return tokenAndChangeState(QUESTION, Operator);
       
  1562                 case '*':
       
  1563                     return tokenAndChangeState(STAR, Operator);
       
  1564                 case '+':
       
  1565                     return tokenAndChangeState(PLUS, Operator);
       
  1566                 default:
       
  1567                 {
       
  1568                     setState(Operator);
       
  1569                     return nextToken();
       
  1570                 }
       
  1571             }
       
  1572             Q_ASSERT(false);
       
  1573         }
       
  1574         case XQueryVersion:
       
  1575         {
       
  1576             switch(peekCurrent())
       
  1577             {
       
  1578                 case '\'':
       
  1579                 /* Fallthrough. */
       
  1580                 case '"':
       
  1581                     return tokenizeStringLiteral();
       
  1582                 case ';':
       
  1583                     return tokenAndChangeState(SEMI_COLON, Default);
       
  1584             }
       
  1585 
       
  1586             const Token id(tokenizeNCName());
       
  1587 
       
  1588             if(id.type != NCNAME)
       
  1589                 return id;
       
  1590 
       
  1591             const TokenMap *const keyword = lookupKeyword(id.value);
       
  1592             if(keyword)
       
  1593                 return tokenAndChangeState(keyword->token, Default);
       
  1594             else
       
  1595                 return id;
       
  1596             Q_ASSERT(false);
       
  1597         }
       
  1598         case StartTag:
       
  1599         {
       
  1600             if(peekAhead(-1) == '<')
       
  1601             {
       
  1602                 if(current().isSpace())
       
  1603                     return Token(ERROR);
       
  1604             }
       
  1605             else
       
  1606             {
       
  1607                 if(consumeRawWhitespace())
       
  1608                     return Token(END_OF_FILE);
       
  1609             }
       
  1610 
       
  1611             switch(peekCurrent())
       
  1612             {
       
  1613                 case '/':
       
  1614                 {
       
  1615                     if(peekAhead() == '>')
       
  1616                     {
       
  1617                         m_pos += 2;
       
  1618 
       
  1619                         if(m_scanOnly)
       
  1620                             return Token(POSITION_SET);
       
  1621                         else
       
  1622                         {
       
  1623                             popState();
       
  1624                             return Token(QUICK_TAG_END);
       
  1625                         }
       
  1626                     }
       
  1627                     else
       
  1628                         return error();
       
  1629                 }
       
  1630                 case '>':
       
  1631                 {
       
  1632                     if(m_scanOnly)
       
  1633                         return tokenAndChangeState(POSITION_SET, StartTag);
       
  1634                     else
       
  1635                         return tokenAndChangeState(G_GT, ElementContent);
       
  1636                 }
       
  1637                 case '=':
       
  1638                     return tokenAndAdvance(G_EQ);
       
  1639                 case '\'':
       
  1640                     return tokenAndChangeState(APOS, AposAttributeContent);
       
  1641                 case '"':
       
  1642                     return tokenAndChangeState(QUOTE, QuotAttributeContent);
       
  1643                 default:
       
  1644                     return tokenizeNCNameOrQName();
       
  1645             }
       
  1646             Q_ASSERT(false);
       
  1647         }
       
  1648         case AposAttributeContent:
       
  1649         /* Fallthrough. */
       
  1650         case QuotAttributeContent:
       
  1651         {
       
  1652             const QChar sep(state() == AposAttributeContent ? QLatin1Char('\'') : QLatin1Char('"'));
       
  1653             QString result;
       
  1654             result.reserve(20);
       
  1655 
       
  1656             if(m_scanOnly)
       
  1657             {
       
  1658                 int stack = 0;
       
  1659                 return attributeAsRaw(sep, stack, m_pos, true, result);
       
  1660             }
       
  1661 
       
  1662             Q_ASSERT(!m_scanOnly);
       
  1663             while(true)
       
  1664             {
       
  1665                 if(atEnd())
       
  1666                 {
       
  1667                     /* In the case that the XSL-T tokenizer invokes us with
       
  1668                      * default state QuotAttributeContent, we need to be able
       
  1669                      * to return a single string, in case that is all we have
       
  1670                      * accumulated. */
       
  1671                     if(result.isEmpty())
       
  1672                         return Token(END_OF_FILE);
       
  1673                     else
       
  1674                         return Token(STRING_LITERAL, result);
       
  1675                 }
       
  1676 
       
  1677                 const QChar curr(current());
       
  1678 
       
  1679                 if(curr == sep)
       
  1680                 {
       
  1681                     if(m_pos + 1 == m_length)
       
  1682                         return Token(END_OF_FILE);
       
  1683 
       
  1684                     if(m_data.at(m_pos + 1) == sep)
       
  1685                     {
       
  1686                         /* The quoting mechanism was used. */
       
  1687                         m_pos += 2;
       
  1688                         result.append(sep);
       
  1689                         continue;
       
  1690                     }
       
  1691 
       
  1692                     const QChar next(m_data.at(m_pos + 1));
       
  1693                     if(!next.isSpace() && next != QLatin1Char('/') && next != QLatin1Char('>'))
       
  1694                         return Token(ERROR); // i18n Space must separate attributes
       
  1695                     else if(result.isEmpty())
       
  1696                     {
       
  1697                         return tokenAndChangeState(state() == AposAttributeContent ? APOS : QUOTE,
       
  1698                                                    StartTag, 1);
       
  1699                     }
       
  1700                     else
       
  1701                     {
       
  1702                         /* Don't consume the sep, but leave it so we next time return a token for it. */
       
  1703                         return Token(STRING_LITERAL, result);
       
  1704                     }
       
  1705 
       
  1706                     ++m_pos;
       
  1707                     continue;
       
  1708                 }
       
  1709                 else if(curr == QLatin1Char('{'))
       
  1710                 {
       
  1711                     if(m_pos + 1 == m_length)
       
  1712                         return Token(END_OF_FILE);
       
  1713                     else if(peekAhead() == '{')
       
  1714                     {
       
  1715                         ++m_pos;
       
  1716                         result.append(QLatin1Char('{'));
       
  1717                     }
       
  1718                     else
       
  1719                     {
       
  1720                         if(result.isEmpty())
       
  1721                         {
       
  1722                             /* The Attribute Value Template appeared directly in the attribute. */
       
  1723                             pushState();
       
  1724                             return tokenAndChangeState(CURLY_LBRACE, Default);
       
  1725                         }
       
  1726                         else
       
  1727                         {
       
  1728                             /* We don't advance, keep '{' as next token. */
       
  1729                             return Token(STRING_LITERAL, result);
       
  1730                         }
       
  1731                     }
       
  1732                 }
       
  1733                 else if(curr == QLatin1Char('}'))
       
  1734                 {
       
  1735                     if(m_pos + 1 == m_length)
       
  1736                         return Token(END_OF_FILE);
       
  1737                     else if(peekAhead() == '}')
       
  1738                     {
       
  1739                         ++m_pos;
       
  1740                         result.append(QLatin1Char('}'));
       
  1741                     }
       
  1742                     else
       
  1743                         return Token(ERROR);
       
  1744                 }
       
  1745                 else if(curr == QLatin1Char('&'))
       
  1746                 {
       
  1747                     const QString ret(tokenizeCharacterReference());
       
  1748                     if(ret.isNull())
       
  1749                         return Token(ERROR);
       
  1750                     else
       
  1751                         result.append(ret);
       
  1752                 }
       
  1753                 else if(curr == QLatin1Char('<'))
       
  1754                     return Token(STRING_LITERAL, result);
       
  1755                 else
       
  1756                 {
       
  1757                     /* See Extensible Markup Language (XML) 1.0 (Fourth Edition),
       
  1758                      * 3.3.3 Attribute-Value Normalization.
       
  1759                      *
       
  1760                      * However, it is complicated a bit by that AVN is defined on top of
       
  1761                      * EOL normalization and we do those two in one go here. */
       
  1762                     switch(curr.unicode())
       
  1763                     {
       
  1764                         case 0xD:
       
  1765                         {
       
  1766                             if(peekAhead() == '\n')
       
  1767                             {
       
  1768                                 result.append(QLatin1Char(' '));
       
  1769                                 ++m_pos;
       
  1770                                 break;
       
  1771                             }
       
  1772                         }
       
  1773                         case 0xA:
       
  1774                         /* Fallthrough. */
       
  1775                         case 0x9:
       
  1776                         {
       
  1777                             result.append(QLatin1Char(' '));
       
  1778                             break;
       
  1779                         }
       
  1780                         default:
       
  1781                             result.append(curr);
       
  1782                     }
       
  1783                 }
       
  1784 
       
  1785                 ++m_pos;
       
  1786             }
       
  1787             Q_ASSERT(false);
       
  1788         }
       
  1789         case ElementContent:
       
  1790         {
       
  1791             QString result;
       
  1792             result.reserve(20);
       
  1793 
       
  1794             /* Whether the text node, result, may be whitespace only. Character references
       
  1795              * and CDATA sections disables that. */
       
  1796             bool mayBeWS = true;
       
  1797 
       
  1798             CharacterSkips skipEOLNormalization;
       
  1799 
       
  1800             while(true)
       
  1801             {
       
  1802                 if(atEnd())
       
  1803                     return Token(END_OF_FILE);
       
  1804 
       
  1805                 switch(peekCurrent())
       
  1806                 {
       
  1807                     case '<':
       
  1808                     {
       
  1809                         if(!result.isEmpty() && peekAhead(2) != '[')
       
  1810                         {
       
  1811                             /* We encountered the end, and it was not a CDATA section. */
       
  1812                             /* We don't advance. Next time we'll handle the <... stuff. */
       
  1813                             return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
       
  1814                         }
       
  1815 
       
  1816                         ++m_pos;
       
  1817                         if(atEnd())
       
  1818                             return Token(END_OF_FILE);
       
  1819 
       
  1820                         const QChar ahead(current());
       
  1821                         if(ahead.isSpace())
       
  1822                             return error();
       
  1823                         else if(ahead == QLatin1Char('/'))
       
  1824                         {
       
  1825                             if(m_pos + 1 == m_length)
       
  1826                                 return Token(END_OF_FILE);
       
  1827                             else if(m_data.at(m_pos + 1).isSpace())
       
  1828                                 return error();
       
  1829                             else
       
  1830                                 return tokenAndChangeState(BEGIN_END_TAG, EndTag);
       
  1831                         }
       
  1832                         else if(isNCNameStart(ahead))
       
  1833                         {
       
  1834                             pushState();
       
  1835                             return tokenAndChangeState(G_LT, StartTag, 0);
       
  1836                         }
       
  1837                         else if(aheadEquals("!--", 3, 0))
       
  1838                         {
       
  1839                             pushState();
       
  1840                             m_pos += 3;
       
  1841                             return tokenAndChangeState(COMMENT_START, XMLComment, 0);
       
  1842                         }
       
  1843                         else if(aheadEquals("![CDATA[", 8, 0))
       
  1844                         {
       
  1845                             mayBeWS = false;
       
  1846                             m_pos += 8;
       
  1847                             const int start = m_pos;
       
  1848                             const int len = scanUntil("]]>");
       
  1849 
       
  1850                             if(len == -1)
       
  1851                                 return Token(END_OF_FILE);
       
  1852 
       
  1853                             m_pos += 2; /* Consume "]]>". Note that m_pos is on '!'. */
       
  1854                             result.append(m_data.mid(start, len));
       
  1855                             break;
       
  1856                         }
       
  1857                         else if(ahead == QLatin1Char('?'))
       
  1858                         {
       
  1859                             pushState();
       
  1860                             return tokenAndChangeState(PI_START, ProcessingInstructionName);
       
  1861                         }
       
  1862                         else
       
  1863                             return Token(G_LT);
       
  1864                     }
       
  1865                     case '&':
       
  1866                     {
       
  1867                         const QString ret(tokenizeCharacterReference());
       
  1868                         if(ret.isNull())
       
  1869                             return Token(ERROR);
       
  1870                         else
       
  1871                         {
       
  1872                             skipEOLNormalization.insert(result.count());
       
  1873                             result.append(ret);
       
  1874                             mayBeWS = false;
       
  1875                             break;
       
  1876                         }
       
  1877                     }
       
  1878                     case '{':
       
  1879                     {
       
  1880                         // TODO remove this check, also below.
       
  1881                         if(m_pos + 1 == m_length)
       
  1882                             return Token(END_OF_FILE);
       
  1883                         else if(peekAhead() == '{')
       
  1884                         {
       
  1885                             ++m_pos;
       
  1886                             result.append(QLatin1Char('{'));
       
  1887                         }
       
  1888                         else
       
  1889                         {
       
  1890                             if(result.isEmpty())
       
  1891                             {
       
  1892                                 pushState();
       
  1893                                 return tokenAndChangeState(CURLY_LBRACE, Default);
       
  1894                             }
       
  1895                             else
       
  1896                             {
       
  1897                                 /* We don't advance here. */
       
  1898                                 return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
       
  1899                             }
       
  1900                         }
       
  1901                         break;
       
  1902                     }
       
  1903                     case '}':
       
  1904                     {
       
  1905                         if(m_pos + 1 == m_length)
       
  1906                             return Token(END_OF_FILE);
       
  1907                         else if(peekAhead() == '}')
       
  1908                         {
       
  1909                             ++m_pos;
       
  1910                             result.append(QLatin1Char('}'));
       
  1911                         }
       
  1912                         else
       
  1913                         {
       
  1914                             /* This is a parse error, and the grammar won't be able
       
  1915                              * to reduce this CURLY_RBRACE. */
       
  1916                             return tokenAndChangeState(CURLY_RBRACE, Default);
       
  1917                         }
       
  1918                         break;
       
  1919                     }
       
  1920                     case '\n':
       
  1921                     {
       
  1922                         /* We want to translate \r\n into \n. */
       
  1923                         if(peekAhead(-1) == '\r')
       
  1924                             break;
       
  1925                         /* else, fallthrough. */
       
  1926                     }
       
  1927                     case '\r':
       
  1928                     {
       
  1929                         result.append(QLatin1Char('\n'));
       
  1930                         break;
       
  1931                     }
       
  1932                     default:
       
  1933                     {
       
  1934                         result.append(current());
       
  1935                         break;
       
  1936                     }
       
  1937                 }
       
  1938                 ++m_pos;
       
  1939             }
       
  1940             Q_ASSERT(false);
       
  1941         }
       
  1942         case ProcessingInstructionName:
       
  1943         {
       
  1944             const int start = m_pos;
       
  1945 
       
  1946             while(true)
       
  1947             {
       
  1948                 ++m_pos;
       
  1949                 if(m_pos >= m_length)
       
  1950                     return Token(END_OF_FILE);
       
  1951 
       
  1952                 const QChar next(current());
       
  1953                 if(next.isSpace() || next == QLatin1Char('?'))
       
  1954                 {
       
  1955                     return tokenAndChangeState(PI_TARGET, m_data.mid(start, m_pos - start),
       
  1956                                                ProcessingInstructionContent);
       
  1957                 }
       
  1958             }
       
  1959             Q_ASSERT(false);
       
  1960         }
       
  1961         case ProcessingInstructionContent:
       
  1962         {
       
  1963             /* Consume whitespace between the name and the content. */
       
  1964             if(consumeRawWhitespace())
       
  1965                 return Token(END_OF_FILE);
       
  1966 
       
  1967             const int start = m_pos;
       
  1968             const int len = scanUntil("?>");
       
  1969 
       
  1970             if(len == -1)
       
  1971                 return Token(END_OF_FILE);
       
  1972             else
       
  1973             {
       
  1974                 m_pos += 2; /* Consume "?>" */
       
  1975                 popState();
       
  1976                 return Token(PI_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
       
  1977             }
       
  1978             Q_ASSERT(false);
       
  1979         }
       
  1980         case EndTag:
       
  1981         {
       
  1982             if(consumeRawWhitespace())
       
  1983                 return END_OF_FILE;
       
  1984 
       
  1985             if(peekCurrent() == '>')
       
  1986             {
       
  1987                 popState();
       
  1988                 return tokenAndAdvance(G_GT);
       
  1989             }
       
  1990             else
       
  1991                 return tokenizeNCNameOrQName();
       
  1992             Q_ASSERT(false);
       
  1993         }
       
  1994         case XMLComment:
       
  1995         {
       
  1996             const int start = m_pos;
       
  1997             const int len = scanUntil("--");
       
  1998 
       
  1999             if(len == -1)
       
  2000                 return END_OF_FILE;
       
  2001             else
       
  2002             {
       
  2003                 m_pos += 2; /* Consume "--". */
       
  2004                 popState();
       
  2005 
       
  2006                 if(peekCurrent() == '>')
       
  2007                 {
       
  2008                     ++m_pos;
       
  2009                     return Token(COMMENT_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
       
  2010                 }
       
  2011                 else
       
  2012                     return error();
       
  2013             }
       
  2014             Q_ASSERT(false);
       
  2015         }
       
  2016         case Pragma:
       
  2017         {
       
  2018             /* Consume whitespace. */
       
  2019             if(consumeRawWhitespace())
       
  2020                 return Token(END_OF_FILE);
       
  2021 
       
  2022             setState(PragmaContent);
       
  2023             return tokenizeNCNameOrQName();
       
  2024         }
       
  2025         case PragmaContent:
       
  2026         {
       
  2027             QString result;
       
  2028             result.reserve(20);
       
  2029 
       
  2030             const bool hasWS = m_pos < m_length && current().isSpace();
       
  2031 
       
  2032             /* Consume all whitespace up to the pragma content(if any). */
       
  2033             if(consumeRawWhitespace())
       
  2034                 return Token(END_OF_FILE);
       
  2035 
       
  2036             if(peekCurrent() == '#' && peekAhead() == ')')
       
  2037             {
       
  2038                 /* We reached the end, and there's no pragma content. */
       
  2039                 return tokenAndChangeState(PRAGMA_END, Default, 2);
       
  2040             }
       
  2041             else if(!hasWS)
       
  2042             {
       
  2043                 /* A separating space is required if there's pragma content. */
       
  2044                 return error(); /* i18n */
       
  2045             }
       
  2046 
       
  2047             const int start = m_pos;
       
  2048             const int len = scanUntil("#)");
       
  2049             if(len == -1)
       
  2050                 return Token(END_OF_FILE);
       
  2051 
       
  2052             return Token(STRING_LITERAL, m_data.mid(start, len));
       
  2053             Q_ASSERT(false);
       
  2054         }
       
  2055     }
       
  2056 
       
  2057     Q_ASSERT(false);
       
  2058     return error();
       
  2059 }
       
  2060 
       
  2061 Tokenizer::Token XQueryTokenizer::attributeAsRaw(const QChar sep,
       
  2062                                                  int &sepStack,
       
  2063                                                  const int startPos,
       
  2064                                                  const bool aInLiteral,
       
  2065                                                  QString &result)
       
  2066 {
       
  2067     bool inLiteral = aInLiteral;
       
  2068     const char otherSep = (sep == QLatin1Char('"') ? '\'' : '"');
       
  2069 
       
  2070     while(true)
       
  2071     {
       
  2072         if(atEnd())
       
  2073             return END_OF_FILE;
       
  2074 
       
  2075         if(peekCurrent() == sep.unicode())
       
  2076         {
       
  2077             if(inLiteral)
       
  2078                 inLiteral = false;
       
  2079             else
       
  2080                 inLiteral = true;
       
  2081 
       
  2082             if(peekAhead() == sep.unicode())
       
  2083             {
       
  2084                 /* The quoting mechanism was used. */
       
  2085                 result.append(current());
       
  2086                 m_pos += 2;
       
  2087                 continue;
       
  2088             }
       
  2089             else
       
  2090             {
       
  2091                 /* Don't consume the separator, such that we
       
  2092                  * return a token for it next time. */
       
  2093                 if(m_pos == startPos)
       
  2094                 {
       
  2095                     ++m_pos;
       
  2096                     setState(StartTag);
       
  2097                     return Token(sep == QLatin1Char('"') ? QUOTE : APOS);
       
  2098                 }
       
  2099 
       
  2100 
       
  2101                 if(sepStack == 0)
       
  2102                 {
       
  2103                     return Token(STRING_LITERAL, result);
       
  2104                 }
       
  2105                 else
       
  2106                 {
       
  2107                     result.append(current());
       
  2108                     ++m_pos;
       
  2109                     continue;
       
  2110                 }
       
  2111             }
       
  2112         }
       
  2113         else if(peekCurrent() == '&')
       
  2114         {
       
  2115             const QString ret(tokenizeCharacterReference());
       
  2116             if(ret.isNull())
       
  2117                 return Token(ERROR);
       
  2118             else
       
  2119             {
       
  2120                 result.append(ret);
       
  2121                 ++m_pos;
       
  2122                 continue;
       
  2123             }
       
  2124         }
       
  2125         else if(peekCurrent() == otherSep)
       
  2126         {
       
  2127             result.append(current());
       
  2128             ++m_pos;
       
  2129 
       
  2130             if(peekCurrent() == otherSep)
       
  2131                 ++m_pos;
       
  2132 
       
  2133             if(inLiteral)
       
  2134                 inLiteral = false;
       
  2135             else
       
  2136                 inLiteral = true;
       
  2137 
       
  2138             continue;
       
  2139         }
       
  2140         else if(peekCurrent() == '{')
       
  2141         {
       
  2142             result.append(current());
       
  2143 
       
  2144             if(peekAhead() == '{')
       
  2145             {
       
  2146                 m_pos += 2;
       
  2147                 continue;
       
  2148             }
       
  2149             else
       
  2150             {
       
  2151                 ++m_pos;
       
  2152                 ++sepStack;
       
  2153                 const Token t(attributeAsRaw(sep, sepStack, startPos, false, result));
       
  2154                 if(t.type != SUCCESS)
       
  2155                     return t;
       
  2156             }
       
  2157 
       
  2158         }
       
  2159         else if(peekCurrent() == '}')
       
  2160         {
       
  2161             if(inLiteral && peekAhead() == '}')
       
  2162             {
       
  2163                 result.append(current());
       
  2164                 m_pos += 2;
       
  2165                 continue;
       
  2166             }
       
  2167             else
       
  2168             {
       
  2169                 ++m_pos;
       
  2170                 --sepStack;
       
  2171                 return Token(SUCCESS); /* The return value is arbitrary. */
       
  2172             }
       
  2173         }
       
  2174         else
       
  2175         {
       
  2176             result.append(current());
       
  2177             ++m_pos;
       
  2178         }
       
  2179     }
       
  2180 }
       
  2181 
       
  2182 Tokenizer::Token XQueryTokenizer::nextToken(YYLTYPE *const sourceLocator)
       
  2183 {
       
  2184     sourceLocator->first_line = m_line;
       
  2185     sourceLocator->first_column = m_pos - m_columnOffset + 1; /* Plus 1, since m_pos is 0-based. */
       
  2186 
       
  2187     if(m_tokenStack.isEmpty())
       
  2188         return nextToken();
       
  2189     else
       
  2190     {
       
  2191         const Token retval(m_tokenStack.pop());
       
  2192 
       
  2193         switch(retval.type)
       
  2194         {
       
  2195             case MODULE:
       
  2196             /* Fallthrough.*/
       
  2197             case SCHEMA:
       
  2198             /* Fallthrough.*/
       
  2199             case COPY_NAMESPACES:
       
  2200             {
       
  2201                 setState(NamespaceKeyword);
       
  2202                 break;
       
  2203             }
       
  2204             case VERSION:
       
  2205             {
       
  2206                 setState(XQueryVersion);
       
  2207                 break;
       
  2208             }
       
  2209             case AS:
       
  2210             /* Fallthrough. */
       
  2211             case OF:
       
  2212             {
       
  2213                 setState(ItemType);
       
  2214                 break;
       
  2215             }
       
  2216             default:
       
  2217             {
       
  2218                 if(isOperatorKeyword(retval.type))
       
  2219                     setState(Default);
       
  2220 
       
  2221                 break;
       
  2222             }
       
  2223         };
       
  2224 
       
  2225         return retval;
       
  2226     }
       
  2227 }
       
  2228 
       
  2229 int XQueryTokenizer::commenceScanOnly()
       
  2230 {
       
  2231     m_scanOnly = true;
       
  2232     return m_pos;
       
  2233 }
       
  2234 
       
  2235 void XQueryTokenizer::resumeTokenizationFrom(const int pos)
       
  2236 {
       
  2237     m_scanOnly = false;
       
  2238     m_pos = pos;
       
  2239 }
       
  2240 
       
  2241 void XQueryTokenizer::setParserContext(const ParserContext::Ptr &)
       
  2242 {
       
  2243 }
       
  2244 
       
  2245 #undef handleWhitespace
       
  2246 
       
  2247 } // namespace QPatternist
       
  2248 
       
  2249 QT_END_NAMESPACE