
/* Tokenizer implementation */

#include "Python.h"
#include "pgenheaders.h"

#include <ctype.h>
#include <assert.h>

#include "tokenizer.h"
#include "errcode.h"

#ifndef PGEN
#include "unicodeobject.h"
#include "stringobject.h"
#include "fileobject.h"
#include "codecs.h"
#include "abstract.h"
#include "pydebug.h"
#endif /* PGEN */

extern char *PyOS_Readline(FILE *, FILE *, char *);
/* Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);

/* Token names */

char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "BACKQUOTE",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};


/* Create and initialize a new tok_state structure */

static struct tok_state *
tok_new(void)
{
    struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
                                        sizeof(struct tok_state));
    if (tok == NULL)
        return NULL;
    tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
    tok->done = E_OK;
    tok->fp = NULL;
    tok->tabsize = TABSIZE;
    tok->indent = 0;
    tok->indstack[0] = 0;
    tok->atbol = 1;
    tok->pendin = 0;
    tok->prompt = tok->nextprompt = NULL;
    tok->lineno = 0;
    tok->level = 0;
    tok->filename = NULL;
    tok->altwarning = 0;
    tok->alterror = 0;
    tok->alttabsize = 1;
    tok->altindstack[0] = 0;
    tok->decoding_state = 0;
    tok->decoding_erred = 0;
    tok->read_coding_spec = 0;
    tok->encoding = NULL;
    tok->cont_line = 0;
#ifndef PGEN
    tok->decoding_readline = NULL;
    tok->decoding_buffer = NULL;
#endif
    return tok;
}

#ifdef PGEN

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    return fgets(s, size, tok->fp);
}

static int
decoding_feof(struct tok_state *tok)
{
    return feof(tok->fp);
}

static const char *
decode_str(const char *str, struct tok_state *tok)
{
    return str;
}

#else /* PGEN */

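/* Mark the tokenizer as having hit a decoding error; for file-based
   input the line buffer is freed (see PyTokenizer_Free), so callers
   get NULL back as if EOF had been reached. */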
static char *
error_ret(struct tok_state *tok) /* XXX */
{
    tok->decoding_erred = 1;
    if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
        PyMem_FREE(tok->buf);
    tok->buf = NULL;
    return NULL;                /* as if it were EOF */
}

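/* Copy the first LEN bytes of S into a new malloc'ed, NUL-terminated
   string.  Return NULL if the allocation fails. */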
static char *
new_string(const char *s, Py_ssize_t len)
{
    char* result = (char *)PyMem_MALLOC(len + 1);
    if (result != NULL) {
        memcpy(result, s, len);
        result[len] = '\0';
    }
    return result;
}

static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0') break;
        else if (c == '_') buf[i] = '-';
        else buf[i] = tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0) return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1";
    else return s;
}

/* Return the coding spec in S, or NULL if none is found. */

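/* For reference, a typical declaration that this recognizes (PEP 263) is

       # -*- coding: iso-8859-1 -*-

   Only the "coding[:=] <name>" part matters; the Emacs-style "-*-"
   decoration around it is ignored. */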
static char *
get_coding_spec(const char *s, Py_ssize_t size)
{
    Py_ssize_t i;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return NULL;
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (strncmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            if (t[0] != ':' && t[0] != '=')
                continue;
            do {
                t++;
            } while (t[0] == '\x20' || t[0] == '\t');

            begin = t;
            while (isalnum(Py_CHARMASK(t[0])) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = new_string(begin, t - begin);
                char* q = get_normal_name(r);
                if (r != q) {
                    PyMem_FREE(r);
                    r = new_string(q, strlen(q));
                }
                return r;
            }
        }
    }
    return NULL;
}

/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure. */

static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char * cs;
    int r = 1;

    if (tok->cont_line)
        /* It's a continuation line, so it can't be a coding spec. */
        return 1;
    cs = get_coding_spec(line, size);
    if (cs != NULL) {
        tok->read_coding_spec = 1;
        if (tok->encoding == NULL) {
            assert(tok->decoding_state == 1); /* raw */
            if (strcmp(cs, "utf-8") == 0 ||
                strcmp(cs, "iso-8859-1") == 0) {
                tok->encoding = cs;
            } else {
#ifdef Py_USING_UNICODE
                r = set_readline(tok, cs);
                if (r) {
                    tok->encoding = cs;
                    tok->decoding_state = -1;
                }
                else
                    PyMem_FREE(cs);
#else
                /* Without Unicode support, we cannot
                   process the coding spec. Since there
                   won't be any Unicode literals, that
                   won't matter. */
                PyMem_FREE(cs);
#endif
            }
        } else {                /* then, compare cs with BOM */
            r = (strcmp(tok->encoding, cs) == 0);
            PyMem_FREE(cs);
        }
    }
    if (!r) {
        cs = tok->encoding;
        if (!cs)
            cs = "with BOM";
        PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
    }
    return r;
}

/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure. */

static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch = get_char(tok);
    tok->decoding_state = 1;
    if (ch == EOF) {
        return 1;
    } else if (ch == 0xEF) {
        ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
        ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
#if 0
        /* Disable support for UTF-16 BOMs until a decision
           is made whether this needs to be supported. */
    } else if (ch == 0xFE) {
        ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
        if (!set_readline(tok, "utf-16-be")) return 0;
        tok->decoding_state = -1;
    } else if (ch == 0xFF) {
        ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
        if (!set_readline(tok, "utf-16-le")) return 0;
        tok->decoding_state = -1;
#endif
    } else {
        unget_char(ch, tok);
        return 1;
    }
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string("utf-8", 5);    /* resulting string is in utf-8 */
    return 1;
  NON_BOM:
    /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
    unget_char(0xFF, tok);      /* XXX this will cause a syntax error */
    return 1;
}

/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
        stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough room
        (in the s buffer) to copy entire contents of the line read
        by tok->decoding_readline. tok->decoding_buffer has the overflow.
        In this case, fp_readl is called in a loop (with an expanded buffer)
        until the buffer ends with a '\n' (or until the end of the file is
        reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject* utf8 = NULL;
    PyObject* buf = tok->decoding_buffer;
    char *str;
    Py_ssize_t utf8len;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (buf == NULL) {
        buf = PyObject_CallObject(tok->decoding_readline, NULL);
        if (buf == NULL)
            return error_ret(tok);
    } else {
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(buf))
            utf8 = buf;
    }
    if (utf8 == NULL) {
        utf8 = PyUnicode_AsUTF8String(buf);
        Py_DECREF(buf);
        if (utf8 == NULL)
            return error_ret(tok);
    }
    str = PyString_AsString(utf8);
    utf8len = PyString_GET_SIZE(utf8);
    if (utf8len > size) {
        tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(utf8);
            return error_ret(tok);
        }
        utf8len = size;
    }
    memcpy(s, str, utf8len);
    s[utf8len] = '\0';
    Py_DECREF(utf8);
    if (utf8len == 0) return NULL; /* EOF */
    return s;
#endif
}

/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from check_bom and check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure. */

static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *reader, *stream, *readline;

    /* XXX: constify filename argument. */
    stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
    if (stream == NULL)
        return 0;

    reader = PyCodec_StreamReader(enc, stream, NULL);
    Py_DECREF(stream);
    if (reader == NULL)
        return 0;

    readline = PyObject_GetAttrString(reader, "readline");
    Py_DECREF(reader);
    if (readline == NULL)
        return 0;

    tok->decoding_readline = readline;
    return 1;
}

/* Fetch the next byte from TOK. */

static int fp_getc(struct tok_state *tok) {
    return getc(tok->fp);
}

/* Unfetch the last byte back into TOK. */

static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);
}

/* Read a line of input from TOK. Determine encoding
   if necessary. */

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    char *line = NULL;
    int badchar = 0;
    for (;;) {
        if (tok->decoding_state < 0) {
            /* We already have a codec associated with
               this input. */
            line = fp_readl(s, size, tok);
            break;
        } else if (tok->decoding_state > 0) {
            /* We want a 'raw' read. */
            line = Py_UniversalNewlineFgets(s, size,
                                            tok->fp, NULL);
            break;
        } else {
            /* We have not yet determined the encoding.
               If an encoding is found, use the file-pointer
               reader functions from now on. */
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
                return error_ret(tok);
            assert(tok->decoding_state != 0);
        }
    }
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
            return error_ret(tok);
        }
    }
#ifndef PGEN
    /* The default encoding is ASCII, so make sure we don't have any
       non-ASCII bytes in it. */
    if (line && !tok->encoding) {
        unsigned char *c;
        for (c = (unsigned char *)line; *c; c++)
            if (*c > 127) {
                badchar = *c;
                break;
            }
    }
    if (badchar) {
        char buf[500];
        /* Need to add 1 to the line number, since this line
           has not been counted, yet. */
        sprintf(buf,
                "Non-ASCII character '\\x%.2x' "
                "in file %.200s on line %i, "
                "but no encoding declared; "
                "see http://www.python.org/peps/pep-0263.html for details",
                badchar, tok->filename, tok->lineno + 1);
        PyErr_SetString(PyExc_SyntaxError, buf);
        return error_ret(tok);
    }
#endif
    return line;
}

static int
decoding_feof(struct tok_state *tok)
{
    if (tok->decoding_state >= 0) {
        return feof(tok->fp);
    } else {
        PyObject* buf = tok->decoding_buffer;
        if (buf == NULL) {
            buf = PyObject_CallObject(tok->decoding_readline, NULL);
            if (buf == NULL) {
                error_ret(tok);
                return 1;
            } else {
                tok->decoding_buffer = buf;
            }
        }
        return PyObject_Length(buf) == 0;
    }
}

/* Fetch a byte from TOK, using the string buffer. */

static int
buf_getc(struct tok_state *tok) {
    return Py_CHARMASK(*tok->str++);
}

/* Unfetch a byte from TOK, using the string buffer. */

static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
}

/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding. */

static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}

/* Return a UTF-8 encoded Python string object from the
   C byte string STR, which is encoded with ENC. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    PyObject *utf8;
    PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (buf == NULL)
        return NULL;
    utf8 = PyUnicode_AsUTF8String(buf);
    Py_DECREF(buf);
    return utf8;
}
#endif

/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK. */

static const char *
decode_str(const char *str, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyString_AsString(utf8);
    }
#endif
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
            return error_ret(tok);
        if (tok->enc == NULL && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return error_ret(tok);
        }
    }
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL) {
            PyErr_Format(PyExc_SyntaxError,
                         "unknown encoding: %s", tok->enc);
            return error_ret(tok);
        }
        str = PyString_AsString(utf8);
    }
#endif
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}

#endif /* PGEN */

/* Set up tokenizer for string */

struct tok_state *
PyTokenizer_FromString(const char *str)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    str = (char *)decode_str(str, tok);
    if (str == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }

    /* XXX: constify members. */
    tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
    return tok;
}

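/* A minimal usage sketch (error handling simplified; parsetok.c is the
   real driver for this API):

       struct tok_state *tok = PyTokenizer_FromString("x = 1\n");
       if (tok != NULL) {
           char *start, *end;
           int type;
           do {
               type = PyTokenizer_Get(tok, &start, &end);
               /* the token's text is the range [start, end) */
           } while (type != ENDMARKER && type != ERRORTOKEN);
           PyTokenizer_Free(tok);
       }
*/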

/* Set up tokenizer for file */

struct tok_state *
PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }
    tok->cur = tok->inp = tok->buf;
    tok->end = tok->buf + BUFSIZ;
    tok->fp = fp;
    tok->prompt = ps1;
    tok->nextprompt = ps2;
    return tok;
}


/* Free a tok_state structure */

void
PyTokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
#ifndef PGEN
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
#endif
    if (tok->fp != NULL && tok->buf != NULL)
        PyMem_FREE(tok->buf);
    PyMem_FREE(tok);
}

#if !defined(PGEN) && defined(Py_USING_UNICODE)
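/* Re-encode an interactive line read from sys.stdin into UTF-8 using
   the file object's declared encoding, and remember that encoding in
   tok->encoding.  Return 0 on success (or when no conversion applies),
   -1 on memory error. */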
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc, *sysstdin, *decoded, *utf8;
    const char *encoding;
    char *converted;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    sysstdin = PySys_GetObject("stdin");
    if (sysstdin == NULL || !PyFile_Check(sysstdin))
        return 0;

    enc = ((PyFileObject *)sysstdin)->f_encoding;
    if (enc == NULL || !PyString_Check(enc))
        return 0;
    Py_INCREF(enc);

    encoding = PyString_AsString(enc);
    decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
    if (decoded == NULL)
        goto error_clear;

    utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
    Py_DECREF(decoded);
    if (utf8 == NULL)
        goto error_clear;

    assert(PyString_Check(utf8));
    converted = new_string(PyString_AS_STRING(utf8),
                           PyString_GET_SIZE(utf8));
    Py_DECREF(utf8);
    if (converted == NULL)
        goto error_nomem;

    PyMem_FREE(*inp);
    *inp = converted;
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(encoding, strlen(encoding));
    if (tok->encoding == NULL)
        goto error_nomem;

    Py_DECREF(enc);
    return 0;

  error_nomem:
    Py_DECREF(enc);
    tok->done = E_NOMEM;
    return -1;

  error_clear:
    /* Fallback to iso-8859-1: for backward compatibility */
    Py_DECREF(enc);
    PyErr_Clear();
    return 0;
}
#endif

/* Get next char, updating state; error code goes into tok->done */

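/* tok->cur points at the next byte to hand out and tok->inp at the end
   of what has been buffered so far; when the two meet, more input is
   pulled in (from the source string, the interactive prompt, or the
   file) before a character is returned. */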
static int
tok_nextc(register struct tok_state *tok)
{
    for (;;) {
        if (tok->cur != tok->inp) {
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK)
            return EOF;
        if (tok->fp == NULL) {
            char *end = strchr(tok->inp, '\n');
            if (end != NULL)
                end++;
            else {
                end = strchr(tok->inp, '\0');
                if (end == tok->inp) {
                    tok->done = E_EOF;
                    return EOF;
                }
            }
            if (tok->start == NULL)
                tok->buf = tok->cur;
            tok->line_start = tok->cur;
            tok->lineno++;
            tok->inp = end;
            return Py_CHARMASK(*tok->cur++);
        }
        if (tok->prompt != NULL) {
            char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
            if (tok->nextprompt != NULL)
                tok->prompt = tok->nextprompt;
            if (newtok == NULL)
                tok->done = E_INTR;
            else if (*newtok == '\0') {
                PyMem_FREE(newtok);
                tok->done = E_EOF;
            }
#if !defined(PGEN) && defined(Py_USING_UNICODE)
            else if (tok_stdin_decode(tok, &newtok) != 0)
                PyMem_FREE(newtok);
#endif
            else if (tok->start != NULL) {
                size_t start = tok->start - tok->buf;
                size_t oldlen = tok->cur - tok->buf;
                size_t newlen = oldlen + strlen(newtok);
                char *buf = tok->buf;
                buf = (char *)PyMem_REALLOC(buf, newlen+1);
                tok->lineno++;
                if (buf == NULL) {
                    PyMem_FREE(tok->buf);
                    tok->buf = NULL;
                    PyMem_FREE(newtok);
                    tok->done = E_NOMEM;
                    return EOF;
                }
                tok->buf = buf;
                tok->cur = tok->buf + oldlen;
                tok->line_start = tok->cur;
                strcpy(tok->buf + oldlen, newtok);
                PyMem_FREE(newtok);
                tok->inp = tok->buf + newlen;
                tok->end = tok->inp + 1;
                tok->start = tok->buf + start;
            }
            else {
                tok->lineno++;
                if (tok->buf != NULL)
                    PyMem_FREE(tok->buf);
                tok->buf = newtok;
                tok->line_start = tok->buf;
                tok->cur = tok->buf;
                tok->line_start = tok->buf;
                tok->inp = strchr(tok->buf, '\0');
                tok->end = tok->inp + 1;
            }
        }
        else {
            int done = 0;
            Py_ssize_t cur = 0;
            char *pt;
            if (tok->start == NULL) {
                if (tok->buf == NULL) {
                    tok->buf = (char *)
                        PyMem_MALLOC(BUFSIZ);
                    if (tok->buf == NULL) {
                        tok->done = E_NOMEM;
                        return EOF;
                    }
                    tok->end = tok->buf + BUFSIZ;
                }
                if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
                                   tok) == NULL) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else {
                    tok->done = E_OK;
                    tok->inp = strchr(tok->buf, '\0');
                    done = tok->inp[-1] == '\n';
                }
            }
            else {
                cur = tok->cur - tok->buf;
                if (decoding_feof(tok)) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else
                    tok->done = E_OK;
            }
            tok->lineno++;
            /* Read until '\n' or EOF */
            while (!done) {
                Py_ssize_t curstart = tok->start == NULL ? -1 :
                    tok->start - tok->buf;
                Py_ssize_t curvalid = tok->inp - tok->buf;
                Py_ssize_t newsize = curvalid + BUFSIZ;
                char *newbuf = tok->buf;
                newbuf = (char *)PyMem_REALLOC(newbuf,
                                               newsize);
                if (newbuf == NULL) {
                    tok->done = E_NOMEM;
                    tok->cur = tok->inp;
                    return EOF;
                }
                tok->buf = newbuf;
                tok->inp = tok->buf + curvalid;
                tok->end = tok->buf + newsize;
                tok->start = curstart < 0 ? NULL :
                    tok->buf + curstart;
                if (decoding_fgets(tok->inp,
                                   (int)(tok->end - tok->inp),
                                   tok) == NULL) {
                    /* Break out early on decoding
                       errors, as tok->buf will be NULL */
                    if (tok->decoding_erred)
                        return EOF;
                    /* Last line does not end in \n,
                       fake one */
                    strcpy(tok->inp, "\n");
                }
                tok->inp = strchr(tok->inp, '\0');
                done = tok->inp[-1] == '\n';
            }
            if (tok->buf != NULL) {
                tok->cur = tok->buf + cur;
                tok->line_start = tok->cur;
                /* replace "\r\n" with "\n" */
                /* For Mac leave the \r, giving a syntax error */
                pt = tok->inp - 2;
                if (pt >= tok->buf && *pt == '\r') {
                    *pt++ = '\n';
                    *pt = '\0';
                    tok->inp = pt;
                }
            }
        }
        if (tok->done != E_OK) {
            if (tok->prompt != NULL)
                PySys_WriteStderr("\n");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    /*NOTREACHED*/
}


/* Back-up one character */

static void
tok_backup(register struct tok_state *tok, register int c)
{
    if (c != EOF) {
        if (--tok->cur < tok->buf)
            Py_FatalError("tok_backup: begin of buffer");
        if (*tok->cur != c)
            *tok->cur = c;
    }
}


/* Return the token corresponding to a single character */

int
PyToken_OneChar(int c)
{
    switch (c) {
    case '(':   return LPAR;
    case ')':   return RPAR;
    case '[':   return LSQB;
    case ']':   return RSQB;
    case ':':   return COLON;
    case ',':   return COMMA;
    case ';':   return SEMI;
    case '+':   return PLUS;
    case '-':   return MINUS;
    case '*':   return STAR;
    case '/':   return SLASH;
    case '|':   return VBAR;
    case '&':   return AMPER;
    case '<':   return LESS;
    case '>':   return GREATER;
    case '=':   return EQUAL;
    case '.':   return DOT;
    case '%':   return PERCENT;
    case '`':   return BACKQUOTE;
    case '{':   return LBRACE;
    case '}':   return RBRACE;
    case '^':   return CIRCUMFLEX;
    case '~':   return TILDE;
    case '@':   return AT;
    default:    return OP;
    }
}


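/* Return the token corresponding to a two-character operator,
   or OP if the pair is not one */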
int
PyToken_TwoChars(int c1, int c2)
{
    switch (c1) {
    case '=':
        switch (c2) {
        case '=': return EQEQUAL;
        }
        break;
    case '!':
        switch (c2) {
        case '=': return NOTEQUAL;
        }
        break;
    case '<':
        switch (c2) {
        case '>': return NOTEQUAL;
        case '=': return LESSEQUAL;
        case '<': return LEFTSHIFT;
        }
        break;
    case '>':
        switch (c2) {
        case '=': return GREATEREQUAL;
        case '>': return RIGHTSHIFT;
        }
        break;
    case '+':
        switch (c2) {
        case '=': return PLUSEQUAL;
        }
        break;
    case '-':
        switch (c2) {
        case '=': return MINEQUAL;
        }
        break;
    case '*':
        switch (c2) {
        case '*': return DOUBLESTAR;
        case '=': return STAREQUAL;
        }
        break;
    case '/':
        switch (c2) {
        case '/': return DOUBLESLASH;
        case '=': return SLASHEQUAL;
        }
        break;
    case '|':
        switch (c2) {
        case '=': return VBAREQUAL;
        }
        break;
    case '%':
        switch (c2) {
        case '=': return PERCENTEQUAL;
        }
        break;
    case '&':
        switch (c2) {
        case '=': return AMPEREQUAL;
        }
        break;
    case '^':
        switch (c2) {
        case '=': return CIRCUMFLEXEQUAL;
        }
        break;
    }
    return OP;
}

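/* Return the token corresponding to a three-character operator,
   or OP if the triple is not one */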
int
PyToken_ThreeChars(int c1, int c2, int c3)
{
    switch (c1) {
    case '<':
        switch (c2) {
        case '<':
            switch (c3) {
            case '=':
                return LEFTSHIFTEQUAL;
            }
            break;
        }
        break;
    case '>':
        switch (c2) {
        case '>':
            switch (c3) {
            case '=':
                return RIGHTSHIFTEQUAL;
            }
            break;
        }
        break;
    case '*':
        switch (c2) {
        case '*':
            switch (c3) {
            case '=':
                return DOUBLESTAREQUAL;
            }
            break;
        }
        break;
    case '/':
        switch (c2) {
        case '/':
            switch (c3) {
            case '=':
                return DOUBLESLASHEQUAL;
            }
            break;
        }
        break;
    }
    return OP;
}

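/* Report inconsistent use of tabs and spaces in the indentation.
   Return 1 (and set tok->done) when it is treated as an error,
   0 when only a warning is printed. */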
static int
indenterror(struct tok_state *tok)
{
    if (tok->alterror) {
        tok->done = E_TABSPACE;
        tok->cur = tok->inp;
        return 1;
    }
    if (tok->altwarning) {
        PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
                          "in indentation\n", tok->filename);
        tok->altwarning = 0;
    }
    return 0;
}


/* Get next token, after space stripping etc. */

static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
    register int c;
    int blankline;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        register int col = 0;
        register int altcol = 0;
        tok->atbol = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ')
                col++, altcol++;
            else if (c == '\t') {
                col = (col/tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol/tok->alttabsize + 1)
                    * tok->alttabsize;
            }
            else if (c == '\014') /* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            else
                break;
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL)
                blankline = 0; /* Let it through */
            else
                blankline = 1; /* Ignore completely */
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        if (!blankline && tok->level == 0) {
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                       col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

  again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment, while looking for tab-setting magic */
    if (c == '#') {
        static char *tabforms[] = {
            "tab-width:",       /* Emacs */
            ":tabstop=",        /* vim, full form */
            ":ts=",             /* vim, abbreviated form */
            "set tabsize=",     /* will vi never die? */
            /* more templates can be added here to support other editors */
        };
        char cbuf[80];
        char *tp, **cp;
        tp = cbuf;
        do {
            *tp++ = c = tok_nextc(tok);
        } while (c != EOF && c != '\n' &&
                 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
        *tp = '\0';
        for (cp = tabforms;
             cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
             cp++) {
            if ((tp = strstr(cbuf, *cp))) {
                int newsize = atoi(tp + strlen(*cp));

                if (newsize >= 1 && newsize <= 40) {
                    tok->tabsize = newsize;
                    if (Py_VerboseFlag)
                        PySys_WriteStderr(
                            "Tab size set to %d\n",
                            newsize);
                }
            }
        }
        while (c != EOF && c != '\n')
            c = tok_nextc(tok);
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    if (isalpha(c) || c == '_') {
        /* Process b"", r"", u"" and their combinations (br"", ur"") */
        switch (c) {
        case 'b':
        case 'B':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'r':
        case 'R':
            c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'u':
        case 'U':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        }
        while (isalnum(c) || c == '_') {
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        if (blankline || tok->level > 0)
            goto nextline;
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        }
        else {
            tok_backup(tok, c);
            *p_start = tok->start;
            *p_end = tok->cur;
            return DOT;
        }
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == '.')
                goto fraction;
#ifndef WITHOUT_COMPLEX
            if (c == 'j' || c == 'J')
                goto imaginary;
#endif
            if (c == 'x' || c == 'X') {

                /* Hex */
                c = tok_nextc(tok);
                if (!isxdigit(c)) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (isxdigit(c));
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                c = tok_nextc(tok);
                if (c < '0' || c >= '8') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while ('0' <= c && c < '8');
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                c = tok_nextc(tok);
                if (c != '0' && c != '1') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (c == '0' || c == '1');
            }
            else {
                int found_decimal = 0;
                /* Octal; c is first char of it */
                /* There's no 'isoctdigit' macro, sigh */
                while ('0' <= c && c < '8') {
                    c = tok_nextc(tok);
                }
                if (isdigit(c)) {
                    found_decimal = 1;
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == '.')
                    goto fraction;
                else if (c == 'e' || c == 'E')
                    goto exponent;
#ifndef WITHOUT_COMPLEX
                else if (c == 'j' || c == 'J')
                    goto imaginary;
#endif
                else if (found_decimal) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
            }
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
        }
        else {
            /* Decimal */
            do {
                c = tok_nextc(tok);
            } while (isdigit(c));
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
            else {
                /* Accept floating point numbers. */
                if (c == '.') {
  fraction:
                    /* Fraction */
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == 'e' || c == 'E') {
  exponent:
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-')
                        c = tok_nextc(tok);
                    if (!isdigit(c)) {
                        tok->done = E_TOKEN;
                        tok_backup(tok, c);
                        return ERRORTOKEN;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
#ifndef WITHOUT_COMPLEX
                if (c == 'j' || c == 'J')
                    /* Imaginary part */
  imaginary:
                    c = tok_nextc(tok);
#endif
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        Py_ssize_t quote2 = tok->cur - tok->start + 1;
        int quote = c;
        int triple = 0;
        int tripcount = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == '\n') {
                if (!triple) {
                    tok->done = E_EOLS;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                tripcount = 0;
                tok->cont_line = 1; /* multiline string. */
            }
            else if (c == EOF) {
                if (triple)
                    tok->done = E_EOFS;
                else
                    tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            else if (c == quote) {
                tripcount++;
                if (tok->cur - tok->start == quote2) {
                    c = tok_nextc(tok);
                    if (c == quote) {
                        triple = 1;
                        tripcount = 0;
                        continue;
                    }
                    tok_backup(tok, c);
                }
                if (!triple || tripcount == 3)
                    break;
            }
            else if (c == '\\') {
                tripcount = 0;
                c = tok_nextc(tok);
                if (c == EOF) {
                    tok->done = E_EOLS;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
            }
            else
                tripcount = 0;
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        c = tok_nextc(tok);
        if (c != '\n') {
            tok->done = E_LINECONT;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
#ifndef PGEN
        if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
            if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
                                   "<> not supported in 3.x; use !=",
                                   tok->filename, tok->lineno,
                                   NULL, NULL)) {
                return ERRORTOKEN;
            }
        }
#endif
        if (token != OP) {
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            } else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        tok->level--;
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}

int
PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
{
    int result = tok_get(tok, p_start, p_end);
    if (tok->decoding_erred) {
        result = ERRORTOKEN;
        tok->done = E_DECODE;
    }
    return result;
}

/* This function is only called from parsetok. However, it cannot live
   there, as it must be empty for PGEN, and we can check for PGEN only
   in this file. */

#if defined(PGEN) || !defined(Py_USING_UNICODE)
char*
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
    return NULL;
}
#else
#ifdef Py_USING_UNICODE
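/* Re-encode LEN bytes of UTF-8 TEXT into encoding ENC ("replace" error
   handling on both sides).  Return a new string object, or NULL with
   the error cleared on failure. */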
static PyObject *
dec_utf8(const char *enc, const char *text, size_t len) {
    PyObject *ret = NULL;
    PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
    if (unicode_text) {
        ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
        Py_DECREF(unicode_text);
    }
    if (!ret) {
        PyErr_Clear();
    }
    return ret;
}
char *
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
{
    char *text = NULL;
    if (tok->encoding) {
        /* convert source to original encoding */
        PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
        if (lineobj != NULL) {
            int linelen = PyString_Size(lineobj);
            const char *line = PyString_AsString(lineobj);
            text = PyObject_MALLOC(linelen + 1);
            if (text != NULL && line != NULL) {
                if (linelen)
                    strncpy(text, line, linelen);
                text[linelen] = '\0';
            }
            Py_DECREF(lineobj);

            /* adjust error offset */
            if (*offset > 1) {
                PyObject *offsetobj = dec_utf8(tok->encoding,
                                               tok->buf, *offset-1);
                if (offsetobj) {
                    *offset = PyString_Size(offsetobj) + 1;
                    Py_DECREF(offsetobj);
                }
            }

        }
    }
    return text;

}
#endif /* defined(Py_USING_UNICODE) */
#endif


#ifdef Py_DEBUG

void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    if (type == NAME || type == NUMBER || type == STRING || type == OP)
        printf("(%.*s)", (int)(end - start), start);
}

#endif