tools/porting/src/tokenizer.cpp
changeset 0 1918ee327afb
child 4 3b1da2848fc7
equal deleted inserted replaced
-1:000000000000 0:1918ee327afb
       
     1 /****************************************************************************
       
     2 **
       
     3 ** Copyright (C) 2001-2004 Roberto Raggi
       
     4 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
       
     5 ** All rights reserved.
       
     6 ** Contact: Nokia Corporation (qt-info@nokia.com)
       
     7 **
       
     8 ** This file is part of the qt3to4 porting application of the Qt Toolkit.
       
     9 **
       
    10 ** $QT_BEGIN_LICENSE:LGPL$
       
    11 ** No Commercial Usage
       
    12 ** This file contains pre-release code and may not be distributed.
       
    13 ** You may use this file in accordance with the terms and conditions
       
    14 ** contained in the Technology Preview License Agreement accompanying
       
    15 ** this package.
       
    16 **
       
    17 ** GNU Lesser General Public License Usage
       
    18 ** Alternatively, this file may be used under the terms of the GNU Lesser
       
    19 ** General Public License version 2.1 as published by the Free Software
       
    20 ** Foundation and appearing in the file LICENSE.LGPL included in the
       
    21 ** packaging of this file.  Please review the following information to
       
    22 ** ensure the GNU Lesser General Public License version 2.1 requirements
       
    23 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
       
    24 **
       
    25 ** In addition, as a special exception, Nokia gives you certain additional
       
    26 ** rights.  These rights are described in the Nokia Qt LGPL Exception
       
    27 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
       
    28 **
       
    29 ** If you have questions regarding the use of this file, please contact
       
    30 ** Nokia at qt-info@nokia.com.
       
    31 **
       
    32 **
       
    33 **
       
    34 **
       
    35 **
       
    36 **
       
    37 **
       
    38 **
       
    39 ** $QT_END_LICENSE$
       
    40 **
       
    41 ****************************************************************************/
       
    42 
       
    43 #include "tokenizer.h"
       
    44 #include "tokens.h"
       
    45 #include <QDateTime>
       
    46 #include <QHash>
       
    47 #include <ctype.h>
       
    48 
       
    49 QT_BEGIN_NAMESPACE
       
    50 
       
    51 using TokenEngine::Token;
       
    52 
       
// Not referenced anywhere in the visible part of this file.
static QHash<QByteArray, bool> preprocessed;
// Set to true by setupScanTable(); the constructor checks it so the tables
// are only built by the first Tokenizer that gets constructed.
bool Tokenizer::s_initialized = false;
// Scanner dispatch table: slots 0..127 are indexed by the byte value, slot
// 128 is used by nextToken() for every byte >= 128.
Tokenizer::scan_fun_ptr Tokenizer::s_scan_table[128 + 1];
// Per-byte attribute flags (A_Alpha, A_Digit, A_Whitespace), filled in by
// setupScanTable() for the values 0..127.
int Tokenizer::s_attr_table[256];
       
    57 
       
    58 Tokenizer::Tokenizer()
       
    59     : m_buffer(0), m_ptr(0)
       
    60 {
       
    61     if (!s_initialized)
       
    62         setupScanTable();
       
    63 }
       
    64 
       
// Nothing to release explicitly; the destructor body is intentionally empty.
Tokenizer::~Tokenizer()
{
}
       
    68 
       
    69 enum
       
    70 {
       
    71     A_Alpha = 0x01,
       
    72     A_Digit = 0x02,
       
    73     A_Alphanum = A_Alpha | A_Digit,
       
    74     A_Whitespace = 0x04
       
    75 };
       
    76 
       
    77 void Tokenizer::setupScanTable()
       
    78 {
       
    79     s_initialized = true;
       
    80 
       
    81     memset(s_attr_table, 0, 256);
       
    82 
       
    83     for (int i=0; i<128; ++i) {
       
    84         switch (i) {
       
    85         case ':':
       
    86         case '*':
       
    87         case '%':
       
    88         case '^':
       
    89         case '=':
       
    90         case '!':
       
    91         case '&':
       
    92         case '|':
       
    93         case '+':
       
    94         case '<':
       
    95         case '>':
       
    96         case '-':
       
    97         case '.':
       
    98             s_scan_table[i] = &Tokenizer::scanOperator;
       
    99             break;
       
   100 
       
   101         case '\r':
       
   102         case '\n':
       
   103             s_scan_table[i] = &Tokenizer::scanNewline;
       
   104             break;
       
   105 
       
   106         case '#':
       
   107             s_scan_table[i] = &Tokenizer::scanPreprocessor;
       
   108             break;
       
   109 
       
   110         case '/':
       
   111             s_scan_table[i] = &Tokenizer::scanComment;
       
   112             break;
       
   113 
       
   114         case '\'':
       
   115             s_scan_table[i] = &Tokenizer::scanCharLiteral;
       
   116             break;
       
   117 
       
   118         case '"':
       
   119             s_scan_table[i] = &Tokenizer::scanStringLiteral;
       
   120             break;
       
   121 
       
   122         default:
       
   123             if (isspace(i)) {
       
   124                 s_scan_table[i] = &Tokenizer::scanWhiteSpaces;
       
   125                 s_attr_table[i] |= A_Whitespace;
       
   126             } else if (isalpha(i) || i == '_') {
       
   127                 s_scan_table[i] = &Tokenizer::scanIdentifier;
       
   128                 s_attr_table[i] |= A_Alpha;
       
   129             } else if (isdigit(i)) {
       
   130                 s_scan_table[i] = &Tokenizer::scanNumberLiteral;
       
   131                 s_attr_table[i] |= A_Digit;
       
   132             } else
       
   133                 s_scan_table[i] = &Tokenizer::scanChar;
       
   134         }
       
   135     }
       
   136 
       
   137     s_scan_table[128] = &Tokenizer::scanUnicodeChar;
       
   138 }
       
   139 
       
   140 QVector<TokenEngine::Token> Tokenizer::tokenize(QByteArray text)
       
   141 {
       
   142     m_tokens.clear();
       
   143 
       
   144     m_buffer = text;
       
   145     m_ptr = 0;
       
   146 
       
   147     // tokenize
       
   148     for (;;) {
       
   149         Token tk;
       
   150         bool endOfFile = nextToken(tk);
       
   151         if (endOfFile) {
       
   152             break;
       
   153         }
       
   154         m_tokens.append(tk);
       
   155     }
       
   156 
       
   157     return m_tokens;
       
   158 }
       
   159 
       
   160 bool Tokenizer::nextToken(Token &tok)
       
   161 {
       
   162     int start = m_ptr;
       
   163     unsigned char ch = (unsigned char)m_buffer[m_ptr];
       
   164 
       
   165     int kind = 0;
       
   166     (this->*s_scan_table[ch < 128 ? ch : 128])(&kind);
       
   167 
       
   168     tok.start = start;
       
   169     tok.length = m_ptr - start;
       
   170 
       
   171     return (kind == 0);
       
   172 }
       
   173 
       
   174 void Tokenizer::scanChar(int *kind)
       
   175 {
       
   176     *kind = m_buffer[m_ptr++];
       
   177 }
       
   178 
       
   179 void Tokenizer::scanWhiteSpaces(int *kind)
       
   180 {
       
   181     *kind = Token_whitespaces;
       
   182     while (unsigned char ch = m_buffer[m_ptr]) {
       
   183         if (s_attr_table[ch] & A_Whitespace)
       
   184             ++m_ptr;
       
   185         else
       
   186             break;
       
   187     }
       
   188 }
       
   189 
       
   190 void Tokenizer::scanNewline(int *kind)
       
   191 {
       
   192     Q_UNUSED(kind);
       
   193     const unsigned char ch = m_buffer[m_ptr++];
       
   194     // Check for \n.
       
   195     if (ch == '\n') {
       
   196         *kind = '\n';
       
   197         return;
       
   198     }
       
   199 
       
   200     // Check for \r\n.
       
   201     if (ch == '\r' && m_buffer[m_ptr] == '\n') {
       
   202         *kind = '\n';
       
   203         ++ m_ptr;
       
   204         return;
       
   205     }
       
   206 
       
   207     *kind = ch;
       
   208 }
       
   209 
       
   210 void Tokenizer::scanUnicodeChar(int *kind)
       
   211 {
       
   212     *kind = m_buffer[m_ptr++];
       
   213 }
       
   214 
       
   215 void Tokenizer::scanCharLiteral(int *kind)
       
   216 {
       
   217     ++m_ptr;
       
   218     for (;;) {
       
   219         unsigned char ch = m_buffer[m_ptr];
       
   220         switch (ch) {
       
   221         case '\0':
       
   222         case '\n':
       
   223             // ### error
       
   224             *kind = Token_char_literal;
       
   225             return;
       
   226         case '\\':
       
   227             if (m_buffer[m_ptr+1] == '\'' || m_buffer[m_ptr+1] == '\\')
       
   228                 m_ptr += 2;
       
   229             else
       
   230                 ++m_ptr;
       
   231             break;
       
   232         case '\'':
       
   233             ++m_ptr;
       
   234             *kind = Token_char_literal;
       
   235             return;
       
   236         default:
       
   237             ++m_ptr;
       
   238             break;
       
   239         }
       
   240     }
       
   241 
       
   242     // ### error
       
   243     *kind = Token_char_literal;
       
   244 }
       
   245 
       
   246 void Tokenizer::scanStringLiteral(int *kind)
       
   247 {
       
   248     ++m_ptr;
       
   249     while (m_buffer[m_ptr]) {
       
   250         switch (m_buffer[m_ptr]) {
       
   251         case '\n':
       
   252             // ### error
       
   253             *kind = Token_string_literal;
       
   254             return;
       
   255         case '\\':
       
   256             if (m_buffer[m_ptr+1] == '"' || m_buffer[m_ptr+1] == '\\')
       
   257                 m_ptr += 2;
       
   258             else
       
   259                 ++m_ptr;
       
   260             break;
       
   261         case '"':
       
   262             ++m_ptr;
       
   263             *kind = Token_string_literal;
       
   264             return;
       
   265         default:
       
   266             ++m_ptr;
       
   267             break;
       
   268         }
       
   269     }
       
   270 
       
   271     // ### error
       
   272     *kind = Token_string_literal;
       
   273 }
       
   274 
       
   275 void Tokenizer::scanIdentifier(int *kind)
       
   276 {
       
   277     unsigned char ch;
       
   278     for (;;) {
       
   279         ch = m_buffer[m_ptr];
       
   280         if (s_attr_table[ch] & A_Alphanum)
       
   281             ++m_ptr;
       
   282         else
       
   283             break;
       
   284     }
       
   285     *kind = Token_identifier;
       
   286 }
       
   287 
       
   288 void Tokenizer::scanNumberLiteral(int *kind)
       
   289 {
       
   290     unsigned char ch;
       
   291     for (;;) {
       
   292         ch = m_buffer[m_ptr];
       
   293         if (s_attr_table[ch] & A_Alphanum || ch == '.')
       
   294             ++m_ptr;
       
   295         else
       
   296             break;
       
   297     }
       
   298 
       
   299     // ### finish to implement me!!
       
   300     *kind = Token_number_literal;
       
   301 }
       
   302 
       
   303 void Tokenizer::scanComment(int *kind)
       
   304 {
       
   305     if (!(m_buffer[m_ptr+1] == '/' || m_buffer[m_ptr+1] == '*')) {
       
   306         scanOperator(kind);
       
   307 		return;
       
   308 	}
       
   309 
       
   310     ++m_ptr; // skip '/'
       
   311 
       
   312     bool multiLineComment = m_buffer[m_ptr++] == '*';
       
   313 
       
   314     while (m_buffer[m_ptr]) {
       
   315         switch (m_buffer[m_ptr]) {
       
   316         case '\r':
       
   317         case '\n':
       
   318             if (!multiLineComment) {
       
   319                 *kind = Token_comment;
       
   320                 return;
       
   321             }
       
   322 
       
   323             (void) scanNewline(kind);
       
   324             break;
       
   325 
       
   326         case '*':
       
   327             if (multiLineComment && m_buffer[m_ptr+1] == '/') {
       
   328                 m_ptr += 2;
       
   329                 *kind = Token_comment;
       
   330                 return;
       
   331             }
       
   332             ++m_ptr;
       
   333             break;
       
   334 
       
   335         default:
       
   336             ++m_ptr;
       
   337         }
       
   338     }
       
   339 
       
   340     // ### error
       
   341     *kind = Token_comment;
       
   342 }
       
   343 
       
   344 
       
   345 void Tokenizer::scanPreprocessor(int *kind)
       
   346 {
       
   347     ++m_ptr;
       
   348     *kind = Token_preproc;
       
   349 }
       
   350 
       
   351 
       
   352 void Tokenizer::scanOperator(int *kind)
       
   353 {
       
   354     switch (m_buffer[m_ptr]) {
       
   355     case ':':
       
   356         if (m_buffer[m_ptr+1] == ':') {
       
   357             m_ptr += 2;
       
   358             *kind = Token_scope;
       
   359             return;
       
   360         }
       
   361         break;
       
   362 
       
   363     case '*':
       
   364     case '/':
       
   365     case '%':
       
   366     case '^':
       
   367         if (m_buffer[m_ptr+1] == '=') {
       
   368             m_ptr += 2;
       
   369             *kind = Token_assign;
       
   370             return;
       
   371         }
       
   372         break;
       
   373 
       
   374     case '=':
       
   375     case '!':
       
   376         if (m_buffer[m_ptr+1] == '=') {
       
   377             m_ptr += 2;
       
   378             *kind = Token_eq;
       
   379             return;
       
   380         }
       
   381         break;
       
   382 
       
   383     case '&':
       
   384         if (m_buffer[m_ptr+1] == '&') {
       
   385             m_ptr += 2;
       
   386             *kind = Token_and;
       
   387             return;
       
   388         } else if (m_buffer[m_ptr+1] == '=') {
       
   389             m_ptr += 2;
       
   390             *kind = Token_assign;
       
   391             return;
       
   392         }
       
   393         break;
       
   394 
       
   395     case '|':
       
   396         if (m_buffer[m_ptr+1] == '|' ) {
       
   397             m_ptr += 2;
       
   398             *kind = Token_or;
       
   399             return;
       
   400         } else if (m_buffer[m_ptr+1] == '=') {
       
   401             m_ptr += 2;
       
   402             *kind = Token_assign;
       
   403             return;
       
   404         }
       
   405         break;
       
   406 
       
   407     case '+':
       
   408         if (m_buffer[m_ptr+1] == '+' ) {
       
   409             m_ptr += 2;
       
   410             *kind = Token_incr;
       
   411             return;
       
   412         } else if (m_buffer[m_ptr+1] == '=') {
       
   413             m_ptr += 2;
       
   414             *kind = Token_assign;
       
   415             return;
       
   416         }
       
   417         break;
       
   418 
       
   419     case '<':
       
   420         if (m_buffer[m_ptr+1] == '<') {
       
   421             if (m_buffer[m_ptr+2] == '=') {
       
   422                 m_ptr += 3;
       
   423                 *kind = Token_assign;
       
   424                 return;
       
   425             }
       
   426             m_ptr += 2;
       
   427             *kind = Token_shift;
       
   428             return;
       
   429         } else if (m_buffer[m_ptr+1] == '=') {
       
   430             m_ptr += 2;
       
   431             *kind = Token_leq;
       
   432             return;
       
   433         }
       
   434         break;
       
   435 
       
   436     case '>':
       
   437         if (m_buffer[m_ptr+1] == '>') {
       
   438             if (m_buffer[m_ptr+2] == '=') {
       
   439                 m_ptr += 3;
       
   440                 *kind = Token_assign;
       
   441                 return;
       
   442             }
       
   443             m_ptr += 2;
       
   444             *kind = Token_shift;
       
   445             return;
       
   446         } else if (m_buffer[m_ptr+1] == '=') {
       
   447             m_ptr += 2;
       
   448             *kind = Token_geq;
       
   449             return;
       
   450         }
       
   451         break;
       
   452 
       
   453     case '-':
       
   454         if (m_buffer[m_ptr+1] == '>') {
       
   455             if (m_buffer[m_ptr+2] == '*') {
       
   456                 m_ptr += 3;
       
   457                 *kind = Token_ptrmem;
       
   458                 return;
       
   459             }
       
   460             m_ptr += 2;
       
   461             *kind = Token_arrow;
       
   462             return;
       
   463         } else if (m_buffer[m_ptr+1] == '-') {
       
   464             m_ptr += 2;
       
   465             *kind = Token_decr;
       
   466             return;
       
   467         } else if (m_buffer[m_ptr+1] == '=') {
       
   468             m_ptr += 2;
       
   469             *kind = Token_assign;
       
   470             return;
       
   471         }
       
   472         break;
       
   473 
       
   474     case '.':
       
   475         if (m_buffer[m_ptr+1] == '.' && m_buffer[m_ptr+2] == '.') {
       
   476             m_ptr += 3;
       
   477             *kind = Token_ellipsis;
       
   478             return;
       
   479         } else if (m_buffer[m_ptr+1] == '*') {
       
   480             m_ptr += 2;
       
   481             *kind = Token_ptrmem;
       
   482             return;
       
   483         }
       
   484         break;
       
   485 
       
   486     }
       
   487 
       
   488     *kind = m_buffer[m_ptr++];
       
   489 }
       
   490 
       
   491 QT_END_NAMESPACE