|
/****************************************************************************
**
** Copyright (C) 2001-2004 Roberto Raggi
** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
** All rights reserved.
** Contact: Nokia Corporation (qt-info@nokia.com)
**
** This file is part of the qt3to4 porting application of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** No Commercial Usage
** This file contains pre-release code and may not be distributed.
** You may use this file in accordance with the terms and conditions
** contained in the Technology Preview License Agreement accompanying
** this package.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 2.1 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL included in the
** packaging of this file. Please review the following information to
** ensure the GNU Lesser General Public License version 2.1 requirements
** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
**
** In addition, as a special exception, Nokia gives you certain additional
** rights. These rights are described in the Nokia Qt LGPL Exception
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
**
** If you have questions regarding the use of this file, please contact
** Nokia at qt-info@nokia.com.
**
**
**
**
**
**
**
**
** $QT_END_LICENSE$
**
****************************************************************************/
|
#include "tokenizer.h"
#include "tokens.h"
#include <QDateTime>
#include <QHash>
#include <ctype.h>

QT_BEGIN_NAMESPACE

using TokenEngine::Token;

static QHash<QByteArray, bool> preprocessed;
bool Tokenizer::s_initialized = false;
Tokenizer::scan_fun_ptr Tokenizer::s_scan_table[128 + 1];
int Tokenizer::s_attr_table[256];
|
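// The scan and attribute tables are shared by all Tokenizer instances and are
// filled in lazily by the first instance to be constructed.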
Tokenizer::Tokenizer()
    : m_buffer(0), m_ptr(0)
{
    if (!s_initialized)
        setupScanTable();
}

Tokenizer::~Tokenizer()
{
}
|
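// Character-class flags stored in s_attr_table for each ASCII character.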
enum
{
    A_Alpha = 0x01,
    A_Digit = 0x02,
    A_Alphanum = A_Alpha | A_Digit,
    A_Whitespace = 0x04
};
|
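// Builds the two static dispatch tables: s_scan_table maps the first byte of a
// token to the member function that scans it (slot 128 catches all non-ASCII
// bytes), while s_attr_table records the A_* classification flags consulted by
// the individual scanners.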
void Tokenizer::setupScanTable()
{
    s_initialized = true;

    memset(s_attr_table, 0, sizeof(s_attr_table));
|
    for (int i=0; i<128; ++i) {
        switch (i) {
        case ':':
        case '*':
        case '%':
        case '^':
        case '=':
        case '!':
        case '&':
        case '|':
        case '+':
        case '<':
        case '>':
        case '-':
        case '.':
            s_scan_table[i] = &Tokenizer::scanOperator;
            break;

        case '\r':
        case '\n':
            s_scan_table[i] = &Tokenizer::scanNewline;
            break;

        case '#':
            s_scan_table[i] = &Tokenizer::scanPreprocessor;
            break;

        case '/':
            s_scan_table[i] = &Tokenizer::scanComment;
            break;

        case '\'':
            s_scan_table[i] = &Tokenizer::scanCharLiteral;
            break;

        case '"':
            s_scan_table[i] = &Tokenizer::scanStringLiteral;
            break;

        default:
            if (isspace(i)) {
                s_scan_table[i] = &Tokenizer::scanWhiteSpaces;
                s_attr_table[i] |= A_Whitespace;
            } else if (isalpha(i) || i == '_') {
                s_scan_table[i] = &Tokenizer::scanIdentifier;
                s_attr_table[i] |= A_Alpha;
            } else if (isdigit(i)) {
                s_scan_table[i] = &Tokenizer::scanNumberLiteral;
                s_attr_table[i] |= A_Digit;
            } else
                s_scan_table[i] = &Tokenizer::scanChar;
        }
    }

    s_scan_table[128] = &Tokenizer::scanUnicodeChar;
}
|
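// Splits the whole buffer into a flat list of tokens; every byte of the input,
// including whitespace and comments, ends up inside exactly one token.
//
// A minimal usage sketch (sourceText and handleToken() are placeholders, not
// part of this file; TokenEngine::Token is assumed to expose only the start
// and length members set in nextToken() below):
//
//     Tokenizer tokenizer;
//     const QByteArray sourceText = "int main() { return 0; }";
//     QVector<TokenEngine::Token> tokens = tokenizer.tokenize(sourceText);
//     for (int i = 0; i < tokens.count(); ++i)
//         handleToken(sourceText.mid(tokens.at(i).start, tokens.at(i).length));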
QVector<TokenEngine::Token> Tokenizer::tokenize(QByteArray text)
{
    m_tokens.clear();

    m_buffer = text;
    m_ptr = 0;

    // tokenize
    for (;;) {
        Token tk;
        bool endOfFile = nextToken(tk);
        if (endOfFile) {
            break;
        }
        m_tokens.append(tk);
    }

    return m_tokens;
}
|
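// Scans the single token starting at m_ptr by dispatching on its first byte,
// stores the token's start offset and length in tok, and returns true once the
// terminating '\0' of the buffer has been consumed.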
bool Tokenizer::nextToken(Token &tok)
{
    int start = m_ptr;
    unsigned char ch = (unsigned char)m_buffer[m_ptr];

    int kind = 0;
    (this->*s_scan_table[ch < 128 ? ch : 128])(&kind);

    tok.start = start;
    tok.length = m_ptr - start;

    return (kind == 0);
}
|
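// Fallback scanner: consumes one character and uses its value as the token kind.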
void Tokenizer::scanChar(int *kind)
{
    *kind = m_buffer[m_ptr++];
}
|
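// Consumes a run of whitespace characters as a single Token_whitespaces token.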
void Tokenizer::scanWhiteSpaces(int *kind)
{
    *kind = Token_whitespaces;
    while (unsigned char ch = m_buffer[m_ptr]) {
        if (s_attr_table[ch] & A_Whitespace)
            ++m_ptr;
        else
            break;
    }
}
|
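// Consumes a line break, folding an "\r\n" pair into a single '\n' token; a
// lone '\r' keeps its own character value as the kind.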
void Tokenizer::scanNewline(int *kind)
{
    const unsigned char ch = m_buffer[m_ptr++];
    // Check for \n.
    if (ch == '\n') {
        *kind = '\n';
        return;
    }

    // Check for \r\n.
    if (ch == '\r' && m_buffer[m_ptr] == '\n') {
        *kind = '\n';
        ++m_ptr;
        return;
    }

    *kind = ch;
}
|
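// Bytes outside the ASCII range are consumed one at a time, just like scanChar().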
void Tokenizer::scanUnicodeChar(int *kind)
{
    *kind = m_buffer[m_ptr++];
}
|
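// Consumes a character literal, honouring the \' and \\ escapes; an unterminated
// literal (end of line or end of buffer) is still reported as Token_char_literal.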
void Tokenizer::scanCharLiteral(int *kind)
{
    ++m_ptr;
    for (;;) {
        unsigned char ch = m_buffer[m_ptr];
        switch (ch) {
        case '\0':
        case '\n':
            // ### error
            *kind = Token_char_literal;
            return;
        case '\\':
            if (m_buffer[m_ptr+1] == '\'' || m_buffer[m_ptr+1] == '\\')
                m_ptr += 2;
            else
                ++m_ptr;
            break;
        case '\'':
            ++m_ptr;
            *kind = Token_char_literal;
            return;
        default:
            ++m_ptr;
            break;
        }
    }

    // ### error
    *kind = Token_char_literal;
}
|
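// Consumes a string literal, honouring the \" and \\ escapes; an unterminated
// literal is still reported as Token_string_literal.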
void Tokenizer::scanStringLiteral(int *kind)
{
    ++m_ptr;
    while (m_buffer[m_ptr]) {
        switch (m_buffer[m_ptr]) {
        case '\n':
            // ### error
            *kind = Token_string_literal;
            return;
        case '\\':
            if (m_buffer[m_ptr+1] == '"' || m_buffer[m_ptr+1] == '\\')
                m_ptr += 2;
            else
                ++m_ptr;
            break;
        case '"':
            ++m_ptr;
            *kind = Token_string_literal;
            return;
        default:
            ++m_ptr;
            break;
        }
    }

    // ### error
    *kind = Token_string_literal;
}
|
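// Consumes an identifier; s_attr_table marks letters, digits and '_' as
// alphanumeric, and keywords are not distinguished at this stage.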
void Tokenizer::scanIdentifier(int *kind)
{
    unsigned char ch;
    for (;;) {
        ch = m_buffer[m_ptr];
        if (s_attr_table[ch] & A_Alphanum)
            ++m_ptr;
        else
            break;
    }
    *kind = Token_identifier;
}
|
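// Consumes a number literal greedily: any alphanumeric character or '.' is
// accepted, which also covers hex digits and type suffixes, but exponents
// written with a sign (e.g. 1e+5) are cut short at the '+'.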
void Tokenizer::scanNumberLiteral(int *kind)
{
    unsigned char ch;
    for (;;) {
        ch = m_buffer[m_ptr];
        if ((s_attr_table[ch] & A_Alphanum) || ch == '.')
            ++m_ptr;
        else
            break;
    }

    // ### finish to implement me!!
    *kind = Token_number_literal;
}
|
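// Distinguishes "//" and "/*" comments from operators that start with '/';
// single-line comments end at the newline, multi-line comments at "*/".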
void Tokenizer::scanComment(int *kind)
{
    if (!(m_buffer[m_ptr+1] == '/' || m_buffer[m_ptr+1] == '*')) {
        scanOperator(kind);
        return;
    }

    ++m_ptr; // skip '/'

    bool multiLineComment = m_buffer[m_ptr++] == '*';

    while (m_buffer[m_ptr]) {
        switch (m_buffer[m_ptr]) {
        case '\r':
        case '\n':
            if (!multiLineComment) {
                *kind = Token_comment;
                return;
            }

            scanNewline(kind);
            break;

        case '*':
            if (multiLineComment && m_buffer[m_ptr+1] == '/') {
                m_ptr += 2;
                *kind = Token_comment;
                return;
            }
            ++m_ptr;
            break;

        default:
            ++m_ptr;
        }
    }

    // ### error
    *kind = Token_comment;
}
|
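// Consumes only the '#' itself; the rest of the directive is left to the
// regular scanners.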
void Tokenizer::scanPreprocessor(int *kind)
{
    ++m_ptr;
    *kind = Token_preproc;
}
|
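// Folds multi-character operators such as "::", "->", "<<=" and "..." into
// single tokens; anything not recognized here falls through to the
// single-character token at the end.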
void Tokenizer::scanOperator(int *kind)
{
    switch (m_buffer[m_ptr]) {
    case ':':
        if (m_buffer[m_ptr+1] == ':') {
            m_ptr += 2;
            *kind = Token_scope;
            return;
        }
        break;

    case '*':
    case '/':
    case '%':
    case '^':
        if (m_buffer[m_ptr+1] == '=') {
            m_ptr += 2;
            *kind = Token_assign;
            return;
        }
        break;

    case '=':
    case '!':
        if (m_buffer[m_ptr+1] == '=') {
            m_ptr += 2;
            *kind = Token_eq;
            return;
        }
        break;

    case '&':
        if (m_buffer[m_ptr+1] == '&') {
            m_ptr += 2;
            *kind = Token_and;
            return;
        } else if (m_buffer[m_ptr+1] == '=') {
            m_ptr += 2;
            *kind = Token_assign;
            return;
        }
        break;

    case '|':
        if (m_buffer[m_ptr+1] == '|') {
            m_ptr += 2;
            *kind = Token_or;
            return;
        } else if (m_buffer[m_ptr+1] == '=') {
            m_ptr += 2;
            *kind = Token_assign;
            return;
        }
        break;

    case '+':
        if (m_buffer[m_ptr+1] == '+') {
            m_ptr += 2;
            *kind = Token_incr;
            return;
        } else if (m_buffer[m_ptr+1] == '=') {
            m_ptr += 2;
            *kind = Token_assign;
            return;
        }
        break;

    case '<':
        if (m_buffer[m_ptr+1] == '<') {
            if (m_buffer[m_ptr+2] == '=') {
                m_ptr += 3;
                *kind = Token_assign;
                return;
            }
            m_ptr += 2;
            *kind = Token_shift;
            return;
        } else if (m_buffer[m_ptr+1] == '=') {
            m_ptr += 2;
            *kind = Token_leq;
            return;
        }
        break;

    case '>':
        if (m_buffer[m_ptr+1] == '>') {
            if (m_buffer[m_ptr+2] == '=') {
                m_ptr += 3;
                *kind = Token_assign;
                return;
            }
            m_ptr += 2;
            *kind = Token_shift;
            return;
        } else if (m_buffer[m_ptr+1] == '=') {
            m_ptr += 2;
            *kind = Token_geq;
            return;
        }
        break;

    case '-':
        if (m_buffer[m_ptr+1] == '>') {
            if (m_buffer[m_ptr+2] == '*') {
                m_ptr += 3;
                *kind = Token_ptrmem;
                return;
            }
            m_ptr += 2;
            *kind = Token_arrow;
            return;
        } else if (m_buffer[m_ptr+1] == '-') {
            m_ptr += 2;
            *kind = Token_decr;
            return;
        } else if (m_buffer[m_ptr+1] == '=') {
            m_ptr += 2;
            *kind = Token_assign;
            return;
        }
        break;

    case '.':
        if (m_buffer[m_ptr+1] == '.' && m_buffer[m_ptr+2] == '.') {
            m_ptr += 3;
            *kind = Token_ellipsis;
            return;
        } else if (m_buffer[m_ptr+1] == '*') {
            m_ptr += 2;
            *kind = Token_ptrmem;
            return;
        }
        break;
    }

    *kind = m_buffer[m_ptr++];
}
|
QT_END_NAMESPACE