symbian-qemu-0.9.1-12/python-2.6.1/Parser/tokenizer.c
changeset 1 2fb8b9db1c86
       
     1 
       
     2 /* Tokenizer implementation */
       
     3 
       
     4 #include "Python.h"
       
     5 #include "pgenheaders.h"
       
     6 
       
     7 #include <ctype.h>
       
     8 #include <assert.h>
       
     9 
       
    10 #include "tokenizer.h"
       
    11 #include "errcode.h"
       
    12 
       
    13 #ifndef PGEN
       
    14 #include "unicodeobject.h"
       
    15 #include "stringobject.h"
       
    16 #include "fileobject.h"
       
    17 #include "codecs.h"
       
    18 #include "abstract.h"
       
    19 #include "pydebug.h"
       
    20 #endif /* PGEN */
       
    21 
       
    22 extern char *PyOS_Readline(FILE *, FILE *, char *);
       
    23 /* Return malloc'ed string including trailing \n;
       
    24    empty malloc'ed string for EOF;
       
    25    NULL if interrupted */
       
    26 
       
    27 /* Don't ever change this -- it would break the portability of Python code */
       
    28 #define TABSIZE 8
       
    29 
       
    30 /* Forward */
       
    31 static struct tok_state *tok_new(void);
       
    32 static int tok_nextc(struct tok_state *tok);
       
    33 static void tok_backup(struct tok_state *tok, int c);
       
    34 
       
    35 /* Token names */
       
    36 
       
    37 char *_PyParser_TokenNames[] = {
       
    38 	"ENDMARKER",
       
    39 	"NAME",
       
    40 	"NUMBER",
       
    41 	"STRING",
       
    42 	"NEWLINE",
       
    43 	"INDENT",
       
    44 	"DEDENT",
       
    45 	"LPAR",
       
    46 	"RPAR",
       
    47 	"LSQB",
       
    48 	"RSQB",
       
    49 	"COLON",
       
    50 	"COMMA",
       
    51 	"SEMI",
       
    52 	"PLUS",
       
    53 	"MINUS",
       
    54 	"STAR",
       
    55 	"SLASH",
       
    56 	"VBAR",
       
    57 	"AMPER",
       
    58 	"LESS",
       
    59 	"GREATER",
       
    60 	"EQUAL",
       
    61 	"DOT",
       
    62 	"PERCENT",
       
    63 	"BACKQUOTE",
       
    64 	"LBRACE",
       
    65 	"RBRACE",
       
    66 	"EQEQUAL",
       
    67 	"NOTEQUAL",
       
    68 	"LESSEQUAL",
       
    69 	"GREATEREQUAL",
       
    70 	"TILDE",
       
    71 	"CIRCUMFLEX",
       
    72 	"LEFTSHIFT",
       
    73 	"RIGHTSHIFT",
       
    74 	"DOUBLESTAR",
       
    75 	"PLUSEQUAL",
       
    76 	"MINEQUAL",
       
    77 	"STAREQUAL",
       
    78 	"SLASHEQUAL",
       
    79 	"PERCENTEQUAL",
       
    80 	"AMPEREQUAL",
       
    81 	"VBAREQUAL",
       
    82 	"CIRCUMFLEXEQUAL",
       
    83 	"LEFTSHIFTEQUAL",
       
    84 	"RIGHTSHIFTEQUAL",
       
    85 	"DOUBLESTAREQUAL",
       
    86 	"DOUBLESLASH",
       
    87 	"DOUBLESLASHEQUAL",
       
    88 	"AT",
       
    89 	/* This table must match the #defines in token.h! */
       
    90 	"OP",
       
    91 	"<ERRORTOKEN>",
       
    92 	"<N_TOKENS>"
       
    93 };
       
    94 
       
    95 
       
    96 /* Create and initialize a new tok_state structure */
       
    97 
       
    98 static struct tok_state *
       
    99 tok_new(void)
       
   100 {
       
   101 	struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
       
   102                                                 sizeof(struct tok_state));
       
   103 	if (tok == NULL)
       
   104 		return NULL;
       
   105 	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
       
   106 	tok->done = E_OK;
       
   107 	tok->fp = NULL;
       
   108 	tok->tabsize = TABSIZE;
       
   109 	tok->indent = 0;
       
   110 	tok->indstack[0] = 0;
       
   111 	tok->atbol = 1;
       
   112 	tok->pendin = 0;
       
   113 	tok->prompt = tok->nextprompt = NULL;
       
   114 	tok->lineno = 0;
       
   115 	tok->level = 0;
       
   116 	tok->filename = NULL;
       
   117 	tok->altwarning = 0;
       
   118 	tok->alterror = 0;
       
   119 	tok->alttabsize = 1;
       
   120 	tok->altindstack[0] = 0;
       
   121 	tok->decoding_state = 0;
       
   122 	tok->decoding_erred = 0;
       
   123 	tok->read_coding_spec = 0;
       
   124 	tok->encoding = NULL;
       
   125         tok->cont_line = 0;
       
   126 #ifndef PGEN
       
   127 	tok->decoding_readline = NULL;
       
   128 	tok->decoding_buffer = NULL;
       
   129 #endif
       
   130 	return tok;
       
   131 }
       
   132 
       
   133 #ifdef PGEN
       
   134 
       
   135 static char *
       
   136 decoding_fgets(char *s, int size, struct tok_state *tok)
       
   137 {
       
   138 	return fgets(s, size, tok->fp);
       
   139 }
       
   140 
       
   141 static int
       
   142 decoding_feof(struct tok_state *tok)
       
   143 {
       
   144 	return feof(tok->fp);
       
   145 }
       
   146 
       
   147 static const char *
       
   148 decode_str(const char *str, struct tok_state *tok)
       
   149 {
       
   150 	return str;
       
   151 }
       
   152 
       
   153 #else /* PGEN */
       
   154 
       
   155 static char *
       
   156 error_ret(struct tok_state *tok) /* XXX */
       
   157 {
       
   158 	tok->decoding_erred = 1;
       
   159 	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
       
   160 		PyMem_FREE(tok->buf);
       
   161 	tok->buf = NULL;
       
   162 	return NULL;		/* as if it were EOF */
       
   163 }
       
   164 
       
   165 static char *
       
   166 new_string(const char *s, Py_ssize_t len)
       
   167 {
       
   168 	char* result = (char *)PyMem_MALLOC(len + 1);
       
   169 	if (result != NULL) {
       
   170 		memcpy(result, s, len);
       
   171 		result[len] = '\0';
       
   172 	}
       
   173 	return result;
       
   174 }
       
   175 
       
   176 static char *
       
   177 get_normal_name(char *s)	/* for utf-8 and latin-1 */
       
   178 {
       
   179 	char buf[13];
       
   180 	int i;
       
   181 	for (i = 0; i < 12; i++) {
       
   182 		int c = s[i];
       
   183 		if (c == '\0') break;
       
   184 		else if (c == '_') buf[i] = '-';
       
   185 		else buf[i] = tolower(c);
       
   186 	}
       
   187 	buf[i] = '\0';
       
   188 	if (strcmp(buf, "utf-8") == 0 ||
       
   189 	    strncmp(buf, "utf-8-", 6) == 0) return "utf-8";
       
   190 	else if (strcmp(buf, "latin-1") == 0 ||
       
   191 		 strcmp(buf, "iso-8859-1") == 0 ||
       
   192 		 strcmp(buf, "iso-latin-1") == 0 ||
       
   193 		 strncmp(buf, "latin-1-", 8) == 0 ||
       
   194 		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
       
   195 		 strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1";
       
   196 	else return s;
       
   197 }
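/* Illustration: get_normal_name("UTF_8") yields "utf-8", and
   get_normal_name("Latin-1") yields "iso-8859-1"; any other spelling
   is returned unchanged. */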
       
   198 
       
   199 /* Return the coding spec in S, or NULL if none is found.  */
       
   200 
       
   201 static char *
       
   202 get_coding_spec(const char *s, Py_ssize_t size)
       
   203 {
       
   204 	Py_ssize_t i;
       
   205 	/* Coding spec must be in a comment, and that comment must be
       
   206          * the only statement on the source code line. */
       
   207         for (i = 0; i < size - 6; i++) {
       
   208 		if (s[i] == '#')
       
   209 			break;
       
   210 		if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
       
   211 			return NULL;
       
   212 	}
       
   213 	for (; i < size - 6; i++) { /* XXX inefficient search */
       
   214 		const char* t = s + i;
       
   215 		if (strncmp(t, "coding", 6) == 0) {
       
   216 			const char* begin = NULL;
       
   217 			t += 6;
       
   218 			if (t[0] != ':' && t[0] != '=')
       
   219 				continue;
       
   220 			do {
       
   221 				t++;
       
   222 			} while (t[0] == '\x20' || t[0] == '\t');
       
   223 
       
   224 			begin = t;
       
   225 			while (isalnum(Py_CHARMASK(t[0])) ||
       
   226 			       t[0] == '-' || t[0] == '_' || t[0] == '.')
       
   227 				t++;
       
   228 
       
   229 			if (begin < t) {
       
   230 				char* r = new_string(begin, t - begin);
       
   231 				char* q = get_normal_name(r);
       
   232 				if (r != q) {
       
   233 					PyMem_FREE(r);
       
   234 					r = new_string(q, strlen(q));
       
   235 				}
       
   236 				return r;
       
   237 			}
       
   238 		}
       
   239 	}
       
   240 	return NULL;
       
   241 }
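/* Illustration: for a PEP 263 declaration such as
       # -*- coding: iso-8859-15 -*-
   get_coding_spec returns a malloc'ed copy of "iso-8859-15" (normalized
   through get_normal_name); a line with any non-comment text before the
   '#' yields NULL. */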
       
   242 
       
   243 /* Check whether the line contains a coding spec. If it does,
       
   244    invoke the set_readline function for the new encoding.
       
   245    This function receives the tok_state and the new encoding.
       
   246    Return 1 on success, 0 on failure.  */
       
   247 
       
   248 static int
       
   249 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
       
   250 		  int set_readline(struct tok_state *, const char *))
       
   251 {
       
   252 	char * cs;
       
   253 	int r = 1;
       
   254 
       
   255         if (tok->cont_line)
       
   256 		/* It's a continuation line, so it can't be a coding spec. */
       
   257 		return 1;
       
   258 	cs = get_coding_spec(line, size);
       
   259 	if (cs != NULL) {
       
   260 		tok->read_coding_spec = 1;
       
   261 		if (tok->encoding == NULL) {
       
   262 			assert(tok->decoding_state == 1); /* raw */
       
   263 			if (strcmp(cs, "utf-8") == 0 ||
       
   264 			    strcmp(cs, "iso-8859-1") == 0) {
       
   265 				tok->encoding = cs;
       
   266 			} else {
       
   267 #ifdef Py_USING_UNICODE
       
   268 				r = set_readline(tok, cs);
       
   269 				if (r) {
       
   270 					tok->encoding = cs;
       
   271 					tok->decoding_state = -1;
       
   272 				}
       
   273 				else
       
   274 					PyMem_FREE(cs);
       
   275 #else
       
   276                                 /* Without Unicode support, we cannot
       
   277                                    process the coding spec. Since there
       
   278                                    won't be any Unicode literals, that
       
   279                                    won't matter. */
       
   280 				PyMem_FREE(cs);
       
   281 #endif
       
   282 			}
       
   283 		} else {	/* then, compare cs with BOM */
       
   284 			r = (strcmp(tok->encoding, cs) == 0);
       
   285 			PyMem_FREE(cs);
       
   286 		}
       
   287 	}
       
   288 	if (!r) {
       
   289 		cs = tok->encoding;
       
   290 		if (!cs)
       
   291 			cs = "with BOM";
       
   292 		PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
       
   293 	}
       
   294 	return r;
       
   295 }
       
   296 
       
   297 /* See whether the file starts with a BOM. If it does,
       
   298    invoke the set_readline function with the new encoding.
       
   299    Return 1 on success, 0 on failure.  */
       
   300 
       
   301 static int
       
   302 check_bom(int get_char(struct tok_state *),
       
   303 	  void unget_char(int, struct tok_state *),
       
   304 	  int set_readline(struct tok_state *, const char *),
       
   305 	  struct tok_state *tok)
       
   306 {
       
   307 	int ch = get_char(tok);
       
   308 	tok->decoding_state = 1;
       
   309 	if (ch == EOF) {
       
   310 		return 1;
       
   311 	} else if (ch == 0xEF) {
       
   312 		ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
       
   313 		ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
       
   314 #if 0
       
   315 	/* Disable support for UTF-16 BOMs until a decision
       
   316 	   is made whether this needs to be supported.  */
       
   317 	} else if (ch == 0xFE) {
       
   318 		ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
       
   319 		if (!set_readline(tok, "utf-16-be")) return 0;
       
   320 		tok->decoding_state = -1;
       
   321 	} else if (ch == 0xFF) {
       
   322 		ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
       
   323 		if (!set_readline(tok, "utf-16-le")) return 0;
       
   324 		tok->decoding_state = -1;
       
   325 #endif
       
   326 	} else {
       
   327 		unget_char(ch, tok);
       
   328 		return 1;
       
   329 	}
       
   330 	if (tok->encoding != NULL)
       
   331 		PyMem_FREE(tok->encoding);
       
    332 	tok->encoding = new_string("utf-8", 5);	/* resulting string is in utf-8 */
       
   333 	return 1;
       
   334   NON_BOM:
       
   335 	/* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
       
   336 	unget_char(0xFF, tok);	/* XXX this will cause a syntax error */
       
   337 	return 1;
       
   338 }
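/* Note: the only BOM acted upon here is the UTF-8 signature 0xEF 0xBB 0xBF;
   when it is seen, the bytes are consumed and tok->encoding is set to
   "utf-8".  UTF-16 BOMs are recognized only inside the #if 0 block above. */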
       
   339 
       
   340 /* Read a line of text from TOK into S, using the stream in TOK.
       
   341    Return NULL on failure, else S.
       
   342 
       
   343    On entry, tok->decoding_buffer will be one of:
       
   344      1) NULL: need to call tok->decoding_readline to get a new line
       
   345      2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
       
   346            stored the result in tok->decoding_buffer
       
   347      3) PyStringObject *: previous call to fp_readl did not have enough room
       
   348            (in the s buffer) to copy entire contents of the line read
       
   349            by tok->decoding_readline.  tok->decoding_buffer has the overflow.
       
   350            In this case, fp_readl is called in a loop (with an expanded buffer)
       
   351            until the buffer ends with a '\n' (or until the end of the file is
       
   352            reached): see tok_nextc and its calls to decoding_fgets.
       
   353 */
       
   354 
       
   355 static char *
       
   356 fp_readl(char *s, int size, struct tok_state *tok)
       
   357 {
       
   358 #ifndef Py_USING_UNICODE
       
   359 	/* In a non-Unicode built, this should never be called. */
       
   360 	Py_FatalError("fp_readl should not be called in this build.");
       
   361 	return NULL; /* Keep compiler happy (not reachable) */
       
   362 #else
       
   363 	PyObject* utf8 = NULL;
       
   364 	PyObject* buf = tok->decoding_buffer;
       
   365 	char *str;
       
   366 	Py_ssize_t utf8len;
       
   367 
       
   368 	/* Ask for one less byte so we can terminate it */
       
   369 	assert(size > 0);
       
   370 	size--;
       
   371 
       
   372 	if (buf == NULL) {
       
   373 		buf = PyObject_CallObject(tok->decoding_readline, NULL);
       
   374 		if (buf == NULL)
       
   375 			return error_ret(tok);
       
   376 	} else {
       
   377 		tok->decoding_buffer = NULL;
       
   378 		if (PyString_CheckExact(buf))
       
   379 			utf8 = buf;
       
   380 	}
       
   381 	if (utf8 == NULL) {
       
   382 		utf8 = PyUnicode_AsUTF8String(buf);
       
   383 		Py_DECREF(buf);
       
   384 		if (utf8 == NULL)
       
   385 			return error_ret(tok);
       
   386 	}
       
   387 	str = PyString_AsString(utf8);
       
   388 	utf8len = PyString_GET_SIZE(utf8);
       
   389 	if (utf8len > size) {
       
   390 		tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
       
   391 		if (tok->decoding_buffer == NULL) {
       
   392 			Py_DECREF(utf8);
       
   393 			return error_ret(tok);
       
   394 		}
       
   395 		utf8len = size;
       
   396 	}
       
   397 	memcpy(s, str, utf8len);
       
   398 	s[utf8len] = '\0';
       
   399 	Py_DECREF(utf8);
       
   400 	if (utf8len == 0) return NULL; /* EOF */
       
   401 	return s;
       
   402 #endif
       
   403 }
       
   404 
       
   405 /* Set the readline function for TOK to a StreamReader's
       
   406    readline function. The StreamReader is named ENC.
       
   407 
       
   408    This function is called from check_bom and check_coding_spec.
       
   409 
       
   410    ENC is usually identical to the future value of tok->encoding,
       
   411    except for the (currently unsupported) case of UTF-16.
       
   412 
       
   413    Return 1 on success, 0 on failure. */
       
   414 
       
   415 static int
       
   416 fp_setreadl(struct tok_state *tok, const char* enc)
       
   417 {
       
   418 	PyObject *reader, *stream, *readline;
       
   419 
       
   420 	/* XXX: constify filename argument. */
       
   421 	stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
       
   422 	if (stream == NULL)
       
   423 		return 0;
       
   424 
       
   425 	reader = PyCodec_StreamReader(enc, stream, NULL);
       
   426 	Py_DECREF(stream);
       
   427 	if (reader == NULL)
       
   428 		return 0;
       
   429 
       
   430 	readline = PyObject_GetAttrString(reader, "readline");
       
   431 	Py_DECREF(reader);
       
   432 	if (readline == NULL)
       
   433 		return 0;
       
   434 
       
   435 	tok->decoding_readline = readline;
       
   436 	return 1;
       
   437 }
       
   438 
       
   439 /* Fetch the next byte from TOK. */
       
   440 
       
   441 static int fp_getc(struct tok_state *tok) {
       
   442 	return getc(tok->fp);
       
   443 }
       
   444 
       
   445 /* Unfetch the last byte back into TOK.  */
       
   446 
       
   447 static void fp_ungetc(int c, struct tok_state *tok) {
       
   448 	ungetc(c, tok->fp);
       
   449 }
       
   450 
       
   451 /* Read a line of input from TOK. Determine encoding
       
   452    if necessary.  */
       
   453 
       
   454 static char *
       
   455 decoding_fgets(char *s, int size, struct tok_state *tok)
       
   456 {
       
   457 	char *line = NULL;
       
   458 	int badchar = 0;
       
   459 	for (;;) {
       
   460 		if (tok->decoding_state < 0) {
       
   461 			/* We already have a codec associated with
       
   462 			   this input. */
       
   463 			line = fp_readl(s, size, tok);
       
   464 			break;
       
   465 		} else if (tok->decoding_state > 0) {
       
   466 			/* We want a 'raw' read. */
       
   467 			line = Py_UniversalNewlineFgets(s, size,
       
   468 							tok->fp, NULL);
       
   469 			break;
       
   470 		} else {
       
   471 			/* We have not yet determined the encoding.
       
   472 			   If an encoding is found, use the file-pointer
       
   473 			   reader functions from now on. */
       
   474 			if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
       
   475 				return error_ret(tok);
       
   476 			assert(tok->decoding_state != 0);
       
   477 		}
       
   478 	}
       
   479 	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
       
   480 		if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
       
   481 			return error_ret(tok);
       
   482 		}
       
   483 	}
       
   484 #ifndef PGEN
       
   485 	/* The default encoding is ASCII, so make sure we don't have any
       
   486            non-ASCII bytes in it. */
       
   487 	if (line && !tok->encoding) {
       
   488 		unsigned char *c;
       
   489 		for (c = (unsigned char *)line; *c; c++)
       
   490 			if (*c > 127) {
       
   491 				badchar = *c;
       
   492 				break;
       
   493 			}
       
   494 	}
       
   495 	if (badchar) {
       
   496 		char buf[500];
       
   497 		/* Need to add 1 to the line number, since this line
       
   498 		   has not been counted, yet.  */
       
   499 		sprintf(buf,
       
   500 			"Non-ASCII character '\\x%.2x' "
       
   501 			"in file %.200s on line %i, "
       
   502 			"but no encoding declared; "
       
   503 			"see http://www.python.org/peps/pep-0263.html for details",
       
   504 			badchar, tok->filename, tok->lineno + 1);
       
   505 		PyErr_SetString(PyExc_SyntaxError, buf);
       
   506 		return error_ret(tok);
       
   507 	}
       
   508 #endif
       
   509 	return line;
       
   510 }
       
   511 
       
   512 static int
       
   513 decoding_feof(struct tok_state *tok)
       
   514 {
       
   515 	if (tok->decoding_state >= 0) {
       
   516 		return feof(tok->fp);
       
   517 	} else {
       
   518 		PyObject* buf = tok->decoding_buffer;
       
   519 		if (buf == NULL) {
       
   520 			buf = PyObject_CallObject(tok->decoding_readline, NULL);
       
   521 			if (buf == NULL) {
       
   522 				error_ret(tok);
       
   523 				return 1;
       
   524 			} else {
       
   525 				tok->decoding_buffer = buf;
       
   526 			}
       
   527 		}
       
   528 		return PyObject_Length(buf) == 0;
       
   529 	}
       
   530 }
       
   531 
       
   532 /* Fetch a byte from TOK, using the string buffer. */
       
   533 
       
   534 static int
       
   535 buf_getc(struct tok_state *tok) {
       
   536 	return Py_CHARMASK(*tok->str++);
       
   537 }
       
   538 
       
   539 /* Unfetch a byte from TOK, using the string buffer. */
       
   540 
       
   541 static void
       
   542 buf_ungetc(int c, struct tok_state *tok) {
       
   543 	tok->str--;
       
   544 	assert(Py_CHARMASK(*tok->str) == c);	/* tok->cur may point to read-only segment */
       
   545 }
       
   546 
       
   547 /* Set the readline function for TOK to ENC. For the string-based
       
   548    tokenizer, this means to just record the encoding. */
       
   549 
       
   550 static int
       
   551 buf_setreadl(struct tok_state *tok, const char* enc) {
       
   552 	tok->enc = enc;
       
   553 	return 1;
       
   554 }
       
   555 
       
    556 /* Return a UTF-8 encoded Python string object from the
       
   557    C byte string STR, which is encoded with ENC. */
       
   558 
       
   559 #ifdef Py_USING_UNICODE
       
   560 static PyObject *
       
   561 translate_into_utf8(const char* str, const char* enc) {
       
   562 	PyObject *utf8;
       
   563 	PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
       
   564 	if (buf == NULL)
       
   565 		return NULL;
       
   566 	utf8 = PyUnicode_AsUTF8String(buf);
       
   567 	Py_DECREF(buf);
       
   568 	return utf8;
       
   569 }
       
   570 #endif
       
   571 
       
   572 /* Decode a byte string STR for use as the buffer of TOK.
       
   573    Look for encoding declarations inside STR, and record them
       
   574    inside TOK.  */
       
   575 
       
   576 static const char *
       
   577 decode_str(const char *str, struct tok_state *tok)
       
   578 {
       
   579 	PyObject* utf8 = NULL;
       
   580 	const char *s;
       
   581 	const char *newl[2] = {NULL, NULL};
       
   582 	int lineno = 0;
       
   583 	tok->enc = NULL;
       
   584 	tok->str = str;
       
   585 	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
       
   586 		return error_ret(tok);
       
   587 	str = tok->str;		/* string after BOM if any */
       
   588 	assert(str);
       
   589 #ifdef Py_USING_UNICODE
       
   590 	if (tok->enc != NULL) {
       
   591 		utf8 = translate_into_utf8(str, tok->enc);
       
   592 		if (utf8 == NULL)
       
   593 			return error_ret(tok);
       
   594 		str = PyString_AsString(utf8);
       
   595 	}
       
   596 #endif
       
   597 	for (s = str;; s++) {
       
   598 		if (*s == '\0') break;
       
   599 		else if (*s == '\n') {
       
   600 			assert(lineno < 2);
       
   601 			newl[lineno] = s;
       
   602 			lineno++;
       
   603 			if (lineno == 2) break;
       
   604 		}
       
   605 	}
       
   606 	tok->enc = NULL;
       
   607 	/* need to check line 1 and 2 separately since check_coding_spec
       
   608 	   assumes a single line as input */
       
   609 	if (newl[0]) {
       
   610 		if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
       
   611 			return error_ret(tok);
       
   612 		if (tok->enc == NULL && newl[1]) {
       
   613 			if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
       
   614 					       tok, buf_setreadl))
       
   615 				return error_ret(tok);
       
   616 		}
       
   617 	}
       
   618 #ifdef Py_USING_UNICODE
       
   619 	if (tok->enc != NULL) {
       
   620 		assert(utf8 == NULL);
       
   621 		utf8 = translate_into_utf8(str, tok->enc);
       
   622 		if (utf8 == NULL) {
       
   623 			PyErr_Format(PyExc_SyntaxError,
       
   624 				"unknown encoding: %s", tok->enc);
       
   625 			return error_ret(tok);
       
   626 		}
       
   627 		str = PyString_AsString(utf8);
       
   628 	}
       
   629 #endif
       
   630 	assert(tok->decoding_buffer == NULL);
       
   631 	tok->decoding_buffer = utf8; /* CAUTION */
       
   632 	return str;
       
   633 }
       
   634 
       
   635 #endif /* PGEN */
       
   636 
       
   637 /* Set up tokenizer for string */
       
   638 
       
   639 struct tok_state *
       
   640 PyTokenizer_FromString(const char *str)
       
   641 {
       
   642 	struct tok_state *tok = tok_new();
       
   643 	if (tok == NULL)
       
   644 		return NULL;
       
   645 	str = (char *)decode_str(str, tok);
       
   646 	if (str == NULL) {
       
   647 		PyTokenizer_Free(tok);
       
   648 		return NULL;
       
   649 	}
       
   650 
       
   651 	/* XXX: constify members. */
       
   652 	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
       
   653 	return tok;
       
   654 }
       
   655 
       
   656 
       
   657 /* Set up tokenizer for file */
       
   658 
       
   659 struct tok_state *
       
   660 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
       
   661 {
       
   662 	struct tok_state *tok = tok_new();
       
   663 	if (tok == NULL)
       
   664 		return NULL;
       
   665 	if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
       
   666 		PyTokenizer_Free(tok);
       
   667 		return NULL;
       
   668 	}
       
   669 	tok->cur = tok->inp = tok->buf;
       
   670 	tok->end = tok->buf + BUFSIZ;
       
   671 	tok->fp = fp;
       
   672 	tok->prompt = ps1;
       
   673 	tok->nextprompt = ps2;
       
   674 	return tok;
       
   675 }
       
   676 
       
   677 
       
   678 /* Free a tok_state structure */
       
   679 
       
   680 void
       
   681 PyTokenizer_Free(struct tok_state *tok)
       
   682 {
       
   683 	if (tok->encoding != NULL)
       
   684 		PyMem_FREE(tok->encoding);
       
   685 #ifndef PGEN
       
   686 	Py_XDECREF(tok->decoding_readline);
       
   687 	Py_XDECREF(tok->decoding_buffer);
       
   688 #endif
       
   689 	if (tok->fp != NULL && tok->buf != NULL)
       
   690 		PyMem_FREE(tok->buf);
       
   691 	PyMem_FREE(tok);
       
   692 }
       
   693 
       
   694 #if !defined(PGEN) && defined(Py_USING_UNICODE)
       
   695 static int
       
   696 tok_stdin_decode(struct tok_state *tok, char **inp)
       
   697 {
       
   698 	PyObject *enc, *sysstdin, *decoded, *utf8;
       
   699 	const char *encoding;
       
   700 	char *converted;
       
   701 
       
   702 	if (PySys_GetFile((char *)"stdin", NULL) != stdin)
       
   703 		return 0;
       
   704 	sysstdin = PySys_GetObject("stdin");
       
   705 	if (sysstdin == NULL || !PyFile_Check(sysstdin))
       
   706 		return 0;
       
   707 
       
   708 	enc = ((PyFileObject *)sysstdin)->f_encoding;
       
   709 	if (enc == NULL || !PyString_Check(enc))
       
   710 		return 0;
       
   711 	Py_INCREF(enc);
       
   712 
       
   713 	encoding = PyString_AsString(enc);
       
   714 	decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
       
   715 	if (decoded == NULL)
       
   716 		goto error_clear;
       
   717 
       
   718 	utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
       
   719 	Py_DECREF(decoded);
       
   720 	if (utf8 == NULL)
       
   721 		goto error_clear;
       
   722 
       
   723 	assert(PyString_Check(utf8));
       
   724 	converted = new_string(PyString_AS_STRING(utf8),
       
   725 			       PyString_GET_SIZE(utf8));
       
   726 	Py_DECREF(utf8);
       
   727 	if (converted == NULL)
       
   728 		goto error_nomem;
       
   729 
       
   730 	PyMem_FREE(*inp);
       
   731 	*inp = converted;
       
   732 	if (tok->encoding != NULL)
       
   733 		PyMem_FREE(tok->encoding);
       
   734 	tok->encoding = new_string(encoding, strlen(encoding));
       
   735 	if (tok->encoding == NULL)
       
   736 		goto error_nomem;
       
   737 
       
   738 	Py_DECREF(enc);
       
   739 	return 0;
       
   740 
       
   741 error_nomem:
       
   742 	Py_DECREF(enc);
       
   743 	tok->done = E_NOMEM;
       
   744 	return -1;
       
   745 
       
   746 error_clear:
       
    747 	/* Fall back to iso-8859-1, for backward compatibility */
       
   748 	Py_DECREF(enc);
       
   749 	PyErr_Clear();
       
   750 	return 0;
       
   751 }
       
   752 #endif
       
   753 
       
   754 /* Get next char, updating state; error code goes into tok->done */
       
   755 
       
   756 static int
       
   757 tok_nextc(register struct tok_state *tok)
       
   758 {
       
   759 	for (;;) {
       
   760 		if (tok->cur != tok->inp) {
       
   761 			return Py_CHARMASK(*tok->cur++); /* Fast path */
       
   762 		}
       
   763 		if (tok->done != E_OK)
       
   764 			return EOF;
       
   765 		if (tok->fp == NULL) {
       
   766 			char *end = strchr(tok->inp, '\n');
       
   767 			if (end != NULL)
       
   768 				end++;
       
   769 			else {
       
   770 				end = strchr(tok->inp, '\0');
       
   771 				if (end == tok->inp) {
       
   772 					tok->done = E_EOF;
       
   773 					return EOF;
       
   774 				}
       
   775 			}
       
   776 			if (tok->start == NULL)
       
   777 				tok->buf = tok->cur;
       
   778 			tok->line_start = tok->cur;
       
   779 			tok->lineno++;
       
   780 			tok->inp = end;
       
   781 			return Py_CHARMASK(*tok->cur++);
       
   782 		}
       
   783 		if (tok->prompt != NULL) {
       
   784 			char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
       
   785 			if (tok->nextprompt != NULL)
       
   786 				tok->prompt = tok->nextprompt;
       
   787 			if (newtok == NULL)
       
   788 				tok->done = E_INTR;
       
   789 			else if (*newtok == '\0') {
       
   790 				PyMem_FREE(newtok);
       
   791 				tok->done = E_EOF;
       
   792 			}
       
   793 #if !defined(PGEN) && defined(Py_USING_UNICODE)
       
   794 			else if (tok_stdin_decode(tok, &newtok) != 0)
       
   795 				PyMem_FREE(newtok);
       
   796 #endif
       
   797 			else if (tok->start != NULL) {
       
   798 				size_t start = tok->start - tok->buf;
       
   799 				size_t oldlen = tok->cur - tok->buf;
       
   800 				size_t newlen = oldlen + strlen(newtok);
       
   801 				char *buf = tok->buf;
       
   802 				buf = (char *)PyMem_REALLOC(buf, newlen+1);
       
   803 				tok->lineno++;
       
   804 				if (buf == NULL) {
       
   805 					PyMem_FREE(tok->buf);
       
   806 					tok->buf = NULL;
       
   807 					PyMem_FREE(newtok);
       
   808 					tok->done = E_NOMEM;
       
   809 					return EOF;
       
   810 				}
       
   811 				tok->buf = buf;
       
   812 				tok->cur = tok->buf + oldlen;
       
   813 				tok->line_start = tok->cur;
       
   814 				strcpy(tok->buf + oldlen, newtok);
       
   815 				PyMem_FREE(newtok);
       
   816 				tok->inp = tok->buf + newlen;
       
   817 				tok->end = tok->inp + 1;
       
   818 				tok->start = tok->buf + start;
       
   819 			}
       
   820 			else {
       
   821 				tok->lineno++;
       
   822 				if (tok->buf != NULL)
       
   823 					PyMem_FREE(tok->buf);
       
   824 				tok->buf = newtok;
       
   825 				tok->line_start = tok->buf;
       
   826 				tok->cur = tok->buf;
       
   827 				tok->line_start = tok->buf;
       
   828 				tok->inp = strchr(tok->buf, '\0');
       
   829 				tok->end = tok->inp + 1;
       
   830 			}
       
   831 		}
       
   832 		else {
       
   833 			int done = 0;
       
   834 			Py_ssize_t cur = 0;
       
   835 			char *pt;
       
   836 			if (tok->start == NULL) {
       
   837 				if (tok->buf == NULL) {
       
   838 					tok->buf = (char *)
       
   839 						PyMem_MALLOC(BUFSIZ);
       
   840 					if (tok->buf == NULL) {
       
   841 						tok->done = E_NOMEM;
       
   842 						return EOF;
       
   843 					}
       
   844 					tok->end = tok->buf + BUFSIZ;
       
   845 				}
       
   846 				if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
       
   847 					  tok) == NULL) {
       
   848 					tok->done = E_EOF;
       
   849 					done = 1;
       
   850 				}
       
   851 				else {
       
   852 					tok->done = E_OK;
       
   853 					tok->inp = strchr(tok->buf, '\0');
       
   854 					done = tok->inp[-1] == '\n';
       
   855 				}
       
   856 			}
       
   857 			else {
       
   858 				cur = tok->cur - tok->buf;
       
   859 				if (decoding_feof(tok)) {
       
   860 					tok->done = E_EOF;
       
   861 					done = 1;
       
   862 				}
       
   863 				else
       
   864 					tok->done = E_OK;
       
   865 			}
       
   866 			tok->lineno++;
       
   867 			/* Read until '\n' or EOF */
       
   868 			while (!done) {
       
   869 				Py_ssize_t curstart = tok->start == NULL ? -1 :
       
   870 					          tok->start - tok->buf;
       
   871 				Py_ssize_t curvalid = tok->inp - tok->buf;
       
   872 				Py_ssize_t newsize = curvalid + BUFSIZ;
       
   873 				char *newbuf = tok->buf;
       
   874 				newbuf = (char *)PyMem_REALLOC(newbuf,
       
   875 							       newsize);
       
   876 				if (newbuf == NULL) {
       
   877 					tok->done = E_NOMEM;
       
   878 					tok->cur = tok->inp;
       
   879 					return EOF;
       
   880 				}
       
   881 				tok->buf = newbuf;
       
   882 				tok->inp = tok->buf + curvalid;
       
   883 				tok->end = tok->buf + newsize;
       
   884 				tok->start = curstart < 0 ? NULL :
       
   885 					     tok->buf + curstart;
       
   886 				if (decoding_fgets(tok->inp,
       
   887 					       (int)(tok->end - tok->inp),
       
   888 					       tok) == NULL) {
       
   889 					/* Break out early on decoding
       
   890 					   errors, as tok->buf will be NULL
       
   891 					 */
       
   892 					if (tok->decoding_erred)
       
   893 						return EOF;
       
   894 					/* Last line does not end in \n,
       
   895 					   fake one */
       
   896 					strcpy(tok->inp, "\n");
       
   897 				}
       
   898 				tok->inp = strchr(tok->inp, '\0');
       
   899 				done = tok->inp[-1] == '\n';
       
   900 			}
       
   901 			if (tok->buf != NULL) {
       
   902 				tok->cur = tok->buf + cur;
       
   903 				tok->line_start = tok->cur;
       
   904 				/* replace "\r\n" with "\n" */
       
   905 				/* For Mac leave the \r, giving a syntax error */
       
   906 				pt = tok->inp - 2;
       
   907 				if (pt >= tok->buf && *pt == '\r') {
       
   908 					*pt++ = '\n';
       
   909 					*pt = '\0';
       
   910 					tok->inp = pt;
       
   911 				}
       
   912 			}
       
   913 		}
       
   914 		if (tok->done != E_OK) {
       
   915 			if (tok->prompt != NULL)
       
   916 				PySys_WriteStderr("\n");
       
   917 			tok->cur = tok->inp;
       
   918 			return EOF;
       
   919 		}
       
   920 	}
       
   921 	/*NOTREACHED*/
       
   922 }
       
   923 
       
   924 
       
   925 /* Back-up one character */
       
   926 
       
   927 static void
       
   928 tok_backup(register struct tok_state *tok, register int c)
       
   929 {
       
   930 	if (c != EOF) {
       
   931 		if (--tok->cur < tok->buf)
       
   932 			Py_FatalError("tok_backup: begin of buffer");
       
   933 		if (*tok->cur != c)
       
   934 			*tok->cur = c;
       
   935 	}
       
   936 }
       
   937 
       
   938 
       
   939 /* Return the token corresponding to a single character */
       
   940 
       
   941 int
       
   942 PyToken_OneChar(int c)
       
   943 {
       
   944 	switch (c) {
       
   945 	case '(':	return LPAR;
       
   946 	case ')':	return RPAR;
       
   947 	case '[':	return LSQB;
       
   948 	case ']':	return RSQB;
       
   949 	case ':':	return COLON;
       
   950 	case ',':	return COMMA;
       
   951 	case ';':	return SEMI;
       
   952 	case '+':	return PLUS;
       
   953 	case '-':	return MINUS;
       
   954 	case '*':	return STAR;
       
   955 	case '/':	return SLASH;
       
   956 	case '|':	return VBAR;
       
   957 	case '&':	return AMPER;
       
   958 	case '<':	return LESS;
       
   959 	case '>':	return GREATER;
       
   960 	case '=':	return EQUAL;
       
   961 	case '.':	return DOT;
       
   962 	case '%':	return PERCENT;
       
   963 	case '`':	return BACKQUOTE;
       
   964 	case '{':	return LBRACE;
       
   965 	case '}':	return RBRACE;
       
   966 	case '^':	return CIRCUMFLEX;
       
   967 	case '~':	return TILDE;
       
   968 	case '@':       return AT;
       
   969 	default:	return OP;
       
   970 	}
       
   971 }
       
   972 
       
   973 
       
   974 int
       
   975 PyToken_TwoChars(int c1, int c2)
       
   976 {
       
   977 	switch (c1) {
       
   978 	case '=':
       
   979 		switch (c2) {
       
   980 		case '=':	return EQEQUAL;
       
   981 		}
       
   982 		break;
       
   983 	case '!':
       
   984 		switch (c2) {
       
   985 		case '=':	return NOTEQUAL;
       
   986 		}
       
   987 		break;
       
   988 	case '<':
       
   989 		switch (c2) {
       
   990 		case '>':	return NOTEQUAL;
       
   991 		case '=':	return LESSEQUAL;
       
   992 		case '<':	return LEFTSHIFT;
       
   993 		}
       
   994 		break;
       
   995 	case '>':
       
   996 		switch (c2) {
       
   997 		case '=':	return GREATEREQUAL;
       
   998 		case '>':	return RIGHTSHIFT;
       
   999 		}
       
  1000 		break;
       
  1001 	case '+':
       
  1002 		switch (c2) {
       
  1003 		case '=':	return PLUSEQUAL;
       
  1004 		}
       
  1005 		break;
       
  1006 	case '-':
       
  1007 		switch (c2) {
       
  1008 		case '=':	return MINEQUAL;
       
  1009 		}
       
  1010 		break;
       
  1011 	case '*':
       
  1012 		switch (c2) {
       
  1013 		case '*':	return DOUBLESTAR;
       
  1014 		case '=':	return STAREQUAL;
       
  1015 		}
       
  1016 		break;
       
  1017 	case '/':
       
  1018 		switch (c2) {
       
  1019 		case '/':	return DOUBLESLASH;
       
  1020 		case '=':	return SLASHEQUAL;
       
  1021 		}
       
  1022 		break;
       
  1023 	case '|':
       
  1024 		switch (c2) {
       
  1025 		case '=':	return VBAREQUAL;
       
  1026 		}
       
  1027 		break;
       
  1028 	case '%':
       
  1029 		switch (c2) {
       
  1030 		case '=':	return PERCENTEQUAL;
       
  1031 		}
       
  1032 		break;
       
  1033 	case '&':
       
  1034 		switch (c2) {
       
  1035 		case '=':	return AMPEREQUAL;
       
  1036 		}
       
  1037 		break;
       
  1038 	case '^':
       
  1039 		switch (c2) {
       
  1040 		case '=':	return CIRCUMFLEXEQUAL;
       
  1041 		}
       
  1042 		break;
       
  1043 	}
       
  1044 	return OP;
       
  1045 }
       
  1046 
       
  1047 int
       
  1048 PyToken_ThreeChars(int c1, int c2, int c3)
       
  1049 {
       
  1050 	switch (c1) {
       
  1051 	case '<':
       
  1052 		switch (c2) {
       
  1053 		case '<':
       
  1054 			switch (c3) {
       
  1055 			case '=':
       
  1056 				return LEFTSHIFTEQUAL;
       
  1057 			}
       
  1058 			break;
       
  1059 		}
       
  1060 		break;
       
  1061 	case '>':
       
  1062 		switch (c2) {
       
  1063 		case '>':
       
  1064 			switch (c3) {
       
  1065 			case '=':
       
  1066 				return RIGHTSHIFTEQUAL;
       
  1067 			}
       
  1068 			break;
       
  1069 		}
       
  1070 		break;
       
  1071 	case '*':
       
  1072 		switch (c2) {
       
  1073 		case '*':
       
  1074 			switch (c3) {
       
  1075 			case '=':
       
  1076 				return DOUBLESTAREQUAL;
       
  1077 			}
       
  1078 			break;
       
  1079 		}
       
  1080 		break;
       
  1081 	case '/':
       
  1082 		switch (c2) {
       
  1083 		case '/':
       
  1084 			switch (c3) {
       
  1085 			case '=':
       
  1086 				return DOUBLESLASHEQUAL;
       
  1087 			}
       
  1088 			break;
       
  1089 		}
       
  1090 		break;
       
  1091 	}
       
  1092 	return OP;
       
  1093 }
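/* Illustration: the one-, two- and three-character classifiers compose, e.g.
   PyToken_OneChar('*') == STAR, PyToken_TwoChars('*', '=') == STAREQUAL and
   PyToken_ThreeChars('*', '*', '=') == DOUBLESTAREQUAL; any combination not
   listed falls back to the generic OP token. */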
       
  1094 
       
  1095 static int
       
  1096 indenterror(struct tok_state *tok)
       
  1097 {
       
  1098 	if (tok->alterror) {
       
  1099 		tok->done = E_TABSPACE;
       
  1100 		tok->cur = tok->inp;
       
  1101 		return 1;
       
  1102 	}
       
  1103 	if (tok->altwarning) {
       
  1104 		PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
       
  1105                                   "in indentation\n", tok->filename);
       
  1106 		tok->altwarning = 0;
       
  1107 	}
       
  1108 	return 0;
       
  1109 }
       
  1110 
       
  1111 
       
  1112 /* Get next token, after space stripping etc. */
       
  1113 
       
  1114 static int
       
  1115 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
       
  1116 {
       
  1117 	register int c;
       
  1118 	int blankline;
       
  1119 
       
  1120 	*p_start = *p_end = NULL;
       
  1121   nextline:
       
  1122 	tok->start = NULL;
       
  1123 	blankline = 0;
       
  1124 
       
  1125 	/* Get indentation level */
       
  1126 	if (tok->atbol) {
       
  1127 		register int col = 0;
       
  1128 		register int altcol = 0;
       
  1129 		tok->atbol = 0;
       
  1130 		for (;;) {
       
  1131 			c = tok_nextc(tok);
       
  1132 			if (c == ' ')
       
  1133 				col++, altcol++;
       
  1134 			else if (c == '\t') {
       
  1135 				col = (col/tok->tabsize + 1) * tok->tabsize;
       
  1136 				altcol = (altcol/tok->alttabsize + 1)
       
  1137 					* tok->alttabsize;
       
  1138 			}
       
  1139 			else if (c == '\014') /* Control-L (formfeed) */
       
  1140 				col = altcol = 0; /* For Emacs users */
       
  1141 			else
       
  1142 				break;
       
  1143 		}
       
  1144 		tok_backup(tok, c);
       
  1145 		if (c == '#' || c == '\n') {
       
  1146 			/* Lines with only whitespace and/or comments
       
  1147 			   shouldn't affect the indentation and are
       
  1148 			   not passed to the parser as NEWLINE tokens,
       
  1149 			   except *totally* empty lines in interactive
       
  1150 			   mode, which signal the end of a command group. */
       
  1151 			if (col == 0 && c == '\n' && tok->prompt != NULL)
       
  1152 				blankline = 0; /* Let it through */
       
  1153 			else
       
  1154 				blankline = 1; /* Ignore completely */
       
  1155 			/* We can't jump back right here since we still
       
  1156 			   may need to skip to the end of a comment */
       
  1157 		}
       
  1158 		if (!blankline && tok->level == 0) {
       
  1159 			if (col == tok->indstack[tok->indent]) {
       
  1160 				/* No change */
       
  1161 				if (altcol != tok->altindstack[tok->indent]) {
       
  1162 					if (indenterror(tok))
       
  1163 						return ERRORTOKEN;
       
  1164 				}
       
  1165 			}
       
  1166 			else if (col > tok->indstack[tok->indent]) {
       
  1167 				/* Indent -- always one */
       
  1168 				if (tok->indent+1 >= MAXINDENT) {
       
  1169 					tok->done = E_TOODEEP;
       
  1170 					tok->cur = tok->inp;
       
  1171 					return ERRORTOKEN;
       
  1172 				}
       
  1173 				if (altcol <= tok->altindstack[tok->indent]) {
       
  1174 					if (indenterror(tok))
       
  1175 						return ERRORTOKEN;
       
  1176 				}
       
  1177 				tok->pendin++;
       
  1178 				tok->indstack[++tok->indent] = col;
       
  1179 				tok->altindstack[tok->indent] = altcol;
       
  1180 			}
       
  1181 			else /* col < tok->indstack[tok->indent] */ {
       
  1182 				/* Dedent -- any number, must be consistent */
       
  1183 				while (tok->indent > 0 &&
       
  1184 					col < tok->indstack[tok->indent]) {
       
  1185 					tok->pendin--;
       
  1186 					tok->indent--;
       
  1187 				}
       
  1188 				if (col != tok->indstack[tok->indent]) {
       
  1189 					tok->done = E_DEDENT;
       
  1190 					tok->cur = tok->inp;
       
  1191 					return ERRORTOKEN;
       
  1192 				}
       
  1193 				if (altcol != tok->altindstack[tok->indent]) {
       
  1194 					if (indenterror(tok))
       
  1195 						return ERRORTOKEN;
       
  1196 				}
       
  1197 			}
       
  1198 		}
       
  1199 	}
       
  1200 
       
  1201 	tok->start = tok->cur;
       
  1202 
       
  1203 	/* Return pending indents/dedents */
       
  1204 	if (tok->pendin != 0) {
       
  1205 		if (tok->pendin < 0) {
       
  1206 			tok->pendin++;
       
  1207 			return DEDENT;
       
  1208 		}
       
  1209 		else {
       
  1210 			tok->pendin--;
       
  1211 			return INDENT;
       
  1212 		}
       
  1213 	}
       
  1214 
       
  1215  again:
       
  1216 	tok->start = NULL;
       
  1217 	/* Skip spaces */
       
  1218 	do {
       
  1219 		c = tok_nextc(tok);
       
  1220 	} while (c == ' ' || c == '\t' || c == '\014');
       
  1221 
       
  1222 	/* Set start of current token */
       
  1223 	tok->start = tok->cur - 1;
       
  1224 
       
  1225 	/* Skip comment, while looking for tab-setting magic */
       
  1226 	if (c == '#') {
       
  1227 		static char *tabforms[] = {
       
  1228 			"tab-width:",		/* Emacs */
       
  1229 			":tabstop=",		/* vim, full form */
       
  1230 			":ts=",			/* vim, abbreviated form */
       
  1231 			"set tabsize=",		/* will vi never die? */
       
  1232 		/* more templates can be added here to support other editors */
       
  1233 		};
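		/* Illustration: a comment line like "# vim:ts=4" matches the
		   ":ts=" template and, via the search below, sets tok->tabsize
		   to 4 (values outside 1..40 are ignored). */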
       
  1234 		char cbuf[80];
       
  1235 		char *tp, **cp;
       
  1236 		tp = cbuf;
       
  1237 		do {
       
  1238 			*tp++ = c = tok_nextc(tok);
       
  1239 		} while (c != EOF && c != '\n' &&
       
  1240 			 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
       
  1241 		*tp = '\0';
       
  1242 		for (cp = tabforms;
       
  1243 		     cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
       
  1244 		     cp++) {
       
  1245 			if ((tp = strstr(cbuf, *cp))) {
       
  1246 				int newsize = atoi(tp + strlen(*cp));
       
  1247 
       
  1248 				if (newsize >= 1 && newsize <= 40) {
       
  1249 					tok->tabsize = newsize;
       
  1250 					if (Py_VerboseFlag)
       
  1251 					    PySys_WriteStderr(
       
  1252 						"Tab size set to %d\n",
       
  1253 						newsize);
       
  1254 				}
       
  1255 			}
       
  1256 		}
       
  1257 		while (c != EOF && c != '\n')
       
  1258 			c = tok_nextc(tok);
       
  1259 	}
       
  1260 
       
  1261 	/* Check for EOF and errors now */
       
  1262 	if (c == EOF) {
       
  1263 		return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
       
  1264 	}
       
  1265 
       
  1266 	/* Identifier (most frequent token!) */
       
  1267 	if (isalpha(c) || c == '_') {
       
   1268 		/* Process b"", r"", u"", br"" and ur"" prefixes */
       
  1269 		switch (c) {
       
  1270 		case 'b':
       
  1271 		case 'B':
       
  1272 			c = tok_nextc(tok);
       
  1273 			if (c == 'r' || c == 'R')
       
  1274 				c = tok_nextc(tok);
       
  1275 			if (c == '"' || c == '\'')
       
  1276 				goto letter_quote;
       
  1277 			break;
       
  1278 		case 'r':
       
  1279 		case 'R':
       
  1280 			c = tok_nextc(tok);
       
  1281 			if (c == '"' || c == '\'')
       
  1282 				goto letter_quote;
       
  1283 			break;
       
  1284 		case 'u':
       
  1285 		case 'U':
       
  1286 			c = tok_nextc(tok);
       
  1287 			if (c == 'r' || c == 'R')
       
  1288 				c = tok_nextc(tok);
       
  1289 			if (c == '"' || c == '\'')
       
  1290 				goto letter_quote;
       
  1291 			break;
       
  1292 		}
       
  1293 		while (isalnum(c) || c == '_') {
       
  1294 			c = tok_nextc(tok);
       
  1295 		}
       
  1296 		tok_backup(tok, c);
       
  1297 		*p_start = tok->start;
       
  1298 		*p_end = tok->cur;
       
  1299 		return NAME;
       
  1300 	}
       
  1301 
       
  1302 	/* Newline */
       
  1303 	if (c == '\n') {
       
  1304 		tok->atbol = 1;
       
  1305 		if (blankline || tok->level > 0)
       
  1306 			goto nextline;
       
  1307 		*p_start = tok->start;
       
  1308 		*p_end = tok->cur - 1; /* Leave '\n' out of the string */
       
  1309                 tok->cont_line = 0;
       
  1310 		return NEWLINE;
       
  1311 	}
       
  1312 
       
  1313 	/* Period or number starting with period? */
       
  1314 	if (c == '.') {
       
  1315 		c = tok_nextc(tok);
       
  1316 		if (isdigit(c)) {
       
  1317 			goto fraction;
       
  1318 		}
       
  1319 		else {
       
  1320 			tok_backup(tok, c);
       
  1321 			*p_start = tok->start;
       
  1322 			*p_end = tok->cur;
       
  1323 			return DOT;
       
  1324 		}
       
  1325 	}
       
  1326 
       
  1327 	/* Number */
       
  1328 	if (isdigit(c)) {
       
  1329 		if (c == '0') {
       
  1330 			/* Hex, octal or binary -- maybe. */
       
  1331 			c = tok_nextc(tok);
       
  1332 			if (c == '.')
       
  1333 				goto fraction;
       
  1334 #ifndef WITHOUT_COMPLEX
       
  1335 			if (c == 'j' || c == 'J')
       
  1336 				goto imaginary;
       
  1337 #endif
       
  1338 			if (c == 'x' || c == 'X') {
       
  1339 
       
  1340 				/* Hex */
       
  1341 				c = tok_nextc(tok);
       
  1342 				if (!isxdigit(c)) {
       
  1343 					tok->done = E_TOKEN;
       
  1344 					tok_backup(tok, c);
       
  1345 					return ERRORTOKEN;
       
  1346 				}
       
  1347 				do {
       
  1348 					c = tok_nextc(tok);
       
  1349 				} while (isxdigit(c));
       
  1350 			}
       
  1351                         else if (c == 'o' || c == 'O') {
       
  1352 				/* Octal */
       
  1353 				c = tok_nextc(tok);
       
  1354 				if (c < '0' || c >= '8') {
       
  1355 					tok->done = E_TOKEN;
       
  1356 					tok_backup(tok, c);
       
  1357 					return ERRORTOKEN;
       
  1358 				}
       
  1359 				do {
       
  1360 					c = tok_nextc(tok);
       
  1361 				} while ('0' <= c && c < '8');
       
  1362 			}
       
  1363 			else if (c == 'b' || c == 'B') {
       
  1364 				/* Binary */
       
  1365 				c = tok_nextc(tok);
       
  1366 				if (c != '0' && c != '1') {
       
  1367 					tok->done = E_TOKEN;
       
  1368 					tok_backup(tok, c);
       
  1369 					return ERRORTOKEN;
       
  1370 				}
       
  1371 				do {
       
  1372 					c = tok_nextc(tok);
       
  1373 				} while (c == '0' || c == '1');
       
  1374 			}
       
  1375 			else {
       
  1376 				int found_decimal = 0;
       
  1377 				/* Octal; c is first char of it */
       
  1378 				/* There's no 'isoctdigit' macro, sigh */
       
  1379 				while ('0' <= c && c < '8') {
       
  1380 					c = tok_nextc(tok);
       
  1381 				}
       
  1382 				if (isdigit(c)) {
       
  1383 					found_decimal = 1;
       
  1384 					do {
       
  1385 						c = tok_nextc(tok);
       
  1386 					} while (isdigit(c));
       
  1387 				}
       
  1388 				if (c == '.')
       
  1389 					goto fraction;
       
  1390 				else if (c == 'e' || c == 'E')
       
  1391 					goto exponent;
       
  1392 #ifndef WITHOUT_COMPLEX
       
  1393 				else if (c == 'j' || c == 'J')
       
  1394 					goto imaginary;
       
  1395 #endif
       
  1396 				else if (found_decimal) {
       
  1397 					tok->done = E_TOKEN;
       
  1398 					tok_backup(tok, c);
       
  1399 					return ERRORTOKEN;
       
  1400 				}
       
  1401 			}
       
  1402 			if (c == 'l' || c == 'L')
       
  1403 				c = tok_nextc(tok);
       
  1404 		}
       
  1405 		else {
       
  1406 			/* Decimal */
       
  1407 			do {
       
  1408 				c = tok_nextc(tok);
       
  1409 			} while (isdigit(c));
       
  1410 			if (c == 'l' || c == 'L')
       
  1411 				c = tok_nextc(tok);
       
  1412 			else {
       
  1413 				/* Accept floating point numbers. */
       
  1414 				if (c == '.') {
       
  1415 		fraction:
       
  1416 					/* Fraction */
       
  1417 					do {
       
  1418 						c = tok_nextc(tok);
       
  1419 					} while (isdigit(c));
       
  1420 				}
       
  1421 				if (c == 'e' || c == 'E') {
       
  1422 		exponent:
       
  1423 					/* Exponent part */
       
  1424 					c = tok_nextc(tok);
       
  1425 					if (c == '+' || c == '-')
       
  1426 						c = tok_nextc(tok);
       
  1427 					if (!isdigit(c)) {
       
  1428 						tok->done = E_TOKEN;
       
  1429 						tok_backup(tok, c);
       
  1430 						return ERRORTOKEN;
       
  1431 					}
       
  1432 					do {
       
  1433 						c = tok_nextc(tok);
       
  1434 					} while (isdigit(c));
       
  1435 				}
       
  1436 #ifndef WITHOUT_COMPLEX
       
  1437 				if (c == 'j' || c == 'J')
       
  1438 					/* Imaginary part */
       
  1439 		imaginary:
       
  1440 					c = tok_nextc(tok);
       
  1441 #endif
       
  1442 			}
       
  1443 		}
       
  1444 		tok_backup(tok, c);
       
  1445 		*p_start = tok->start;
       
  1446 		*p_end = tok->cur;
       
  1447 		return NUMBER;
       
  1448 	}
       
  1449 
       
  1450   letter_quote:
       
  1451 	/* String */
       
  1452 	if (c == '\'' || c == '"') {
       
  1453 		Py_ssize_t quote2 = tok->cur - tok->start + 1;
       
  1454 		int quote = c;
       
  1455 		int triple = 0;
       
  1456 		int tripcount = 0;
       
  1457 		for (;;) {
       
  1458 			c = tok_nextc(tok);
       
  1459 			if (c == '\n') {
       
  1460 				if (!triple) {
       
  1461 					tok->done = E_EOLS;
       
  1462 					tok_backup(tok, c);
       
  1463 					return ERRORTOKEN;
       
  1464 				}
       
  1465 				tripcount = 0;
       
  1466                                 tok->cont_line = 1; /* multiline string. */
       
  1467 			}
       
  1468 			else if (c == EOF) {
       
  1469 				if (triple)
       
  1470 					tok->done = E_EOFS;
       
  1471 				else
       
  1472 					tok->done = E_EOLS;
       
  1473 				tok->cur = tok->inp;
       
  1474 				return ERRORTOKEN;
       
  1475 			}
       
  1476 			else if (c == quote) {
       
  1477 				tripcount++;
       
  1478 				if (tok->cur - tok->start == quote2) {
       
  1479 					c = tok_nextc(tok);
       
  1480 					if (c == quote) {
       
  1481 						triple = 1;
       
  1482 						tripcount = 0;
       
  1483 						continue;
       
  1484 					}
       
  1485 					tok_backup(tok, c);
       
  1486 				}
       
  1487 				if (!triple || tripcount == 3)
       
  1488 					break;
       
  1489 			}
       
  1490 			else if (c == '\\') {
       
  1491 				tripcount = 0;
       
  1492 				c = tok_nextc(tok);
       
  1493 				if (c == EOF) {
       
  1494 					tok->done = E_EOLS;
       
  1495 					tok->cur = tok->inp;
       
  1496 					return ERRORTOKEN;
       
  1497 				}
       
  1498 			}
       
  1499 			else
       
  1500 				tripcount = 0;
       
  1501 		}
       
  1502 		*p_start = tok->start;
       
  1503 		*p_end = tok->cur;
       
  1504 		return STRING;
       
  1505 	}
       
  1506 
       
  1507 	/* Line continuation */
       
  1508 	if (c == '\\') {
       
  1509 		c = tok_nextc(tok);
       
  1510 		if (c != '\n') {
       
  1511 			tok->done = E_LINECONT;
       
  1512 			tok->cur = tok->inp;
       
  1513 			return ERRORTOKEN;
       
  1514 		}
       
  1515                 tok->cont_line = 1;
       
  1516 		goto again; /* Read next line */
       
  1517 	}
       
  1518 
       
  1519 	/* Check for two-character token */
       
  1520 	{
       
  1521 		int c2 = tok_nextc(tok);
       
  1522 		int token = PyToken_TwoChars(c, c2);
       
  1523 #ifndef PGEN
       
  1524 		if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
       
  1525 			if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
       
  1526 					       "<> not supported in 3.x; use !=",
       
  1527 					       tok->filename, tok->lineno,
       
  1528 					       NULL, NULL)) {
       
  1529 				return ERRORTOKEN;
       
  1530 			}
       
  1531 		}
       
  1532 #endif
       
  1533 		if (token != OP) {
       
  1534 			int c3 = tok_nextc(tok);
       
  1535 			int token3 = PyToken_ThreeChars(c, c2, c3);
       
  1536 			if (token3 != OP) {
       
  1537 				token = token3;
       
  1538 			} else {
       
  1539 				tok_backup(tok, c3);
       
  1540 			}
       
  1541 			*p_start = tok->start;
       
  1542 			*p_end = tok->cur;
       
  1543 			return token;
       
  1544 		}
       
  1545 		tok_backup(tok, c2);
       
  1546 	}
       
  1547 
       
  1548 	/* Keep track of parentheses nesting level */
       
  1549 	switch (c) {
       
  1550 	case '(':
       
  1551 	case '[':
       
  1552 	case '{':
       
  1553 		tok->level++;
       
  1554 		break;
       
  1555 	case ')':
       
  1556 	case ']':
       
  1557 	case '}':
       
  1558 		tok->level--;
       
  1559 		break;
       
  1560 	}
       
  1561 
       
  1562 	/* Punctuation character */
       
  1563 	*p_start = tok->start;
       
  1564 	*p_end = tok->cur;
       
  1565 	return PyToken_OneChar(c);
       
  1566 }
       
  1567 
       
  1568 int
       
  1569 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
       
  1570 {
       
  1571 	int result = tok_get(tok, p_start, p_end);
       
  1572 	if (tok->decoding_erred) {
       
  1573 		result = ERRORTOKEN;
       
  1574 		tok->done = E_DECODE;
       
  1575 	}
       
  1576 	return result;
       
  1577 }
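/* Usage sketch (informative): a caller such as parsetok typically drives this
   module roughly as follows.

	struct tok_state *tok = PyTokenizer_FromString(source);
	char *start, *end;
	int type;
	while (tok != NULL &&
	       (type = PyTokenizer_Get(tok, &start, &end)) != ENDMARKER) {
		if (type == ERRORTOKEN)
			break;
	}
	if (tok != NULL)
		PyTokenizer_Free(tok);

   On ERRORTOKEN, tok->done holds the E_* error code; for ordinary tokens the
   token text lies in [start, end) inside tok->buf.  The state is released
   with PyTokenizer_Free when tokenizing is finished. */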
       
  1578 
       
  1579 /* This function is only called from parsetok. However, it cannot live
       
  1580    there, as it must be empty for PGEN, and we can check for PGEN only
       
  1581    in this file. */
       
  1582 
       
  1583 #if defined(PGEN) || !defined(Py_USING_UNICODE)
       
  1584 char*
       
  1585 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
       
  1586 {
       
  1587 	return NULL;
       
  1588 }
       
  1589 #else
       
  1590 #ifdef Py_USING_UNICODE
       
  1591 static PyObject *
       
  1592 dec_utf8(const char *enc, const char *text, size_t len) {
       
  1593 	PyObject *ret = NULL;	
       
  1594 	PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
       
  1595 	if (unicode_text) {
       
  1596 		ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
       
  1597 		Py_DECREF(unicode_text);
       
  1598 	}
       
  1599 	if (!ret) {
       
  1600 		PyErr_Clear();
       
  1601 	}
       
  1602 	return ret;
       
  1603 }
       
  1604 char *
       
  1605 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
       
  1606 {
       
  1607 	char *text = NULL;
       
  1608 	if (tok->encoding) {
       
   1609 		/* convert source to original encoding */
       
  1610 		PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
       
  1611 		if (lineobj != NULL) {
       
  1612 			int linelen = PyString_Size(lineobj);
       
  1613 			const char *line = PyString_AsString(lineobj);
       
  1614 			text = PyObject_MALLOC(linelen + 1);
       
  1615 			if (text != NULL && line != NULL) {
       
  1616 				if (linelen)
       
  1617 					strncpy(text, line, linelen);
       
  1618 				text[linelen] = '\0';
       
  1619 			}
       
  1620 			Py_DECREF(lineobj);
       
  1621 					
       
  1622 			/* adjust error offset */
       
  1623 			if (*offset > 1) {
       
  1624 				PyObject *offsetobj = dec_utf8(tok->encoding, 
       
  1625 							       tok->buf, *offset-1);
       
  1626 				if (offsetobj) {
       
  1627 					*offset = PyString_Size(offsetobj) + 1;
       
  1628 					Py_DECREF(offsetobj);
       
  1629 				}
       
  1630 			}
       
  1631 			
       
  1632 		}
       
  1633 	}
       
  1634 	return text;
       
  1635 
       
  1636 }
       
  1637 #endif /* defined(Py_USING_UNICODE) */
       
  1638 #endif
       
  1639 
       
  1640 
       
  1641 #ifdef Py_DEBUG
       
  1642 
       
  1643 void
       
  1644 tok_dump(int type, char *start, char *end)
       
  1645 {
       
  1646 	printf("%s", _PyParser_TokenNames[type]);
       
  1647 	if (type == NAME || type == NUMBER || type == STRING || type == OP)
       
  1648 		printf("(%.*s)", (int)(end - start), start);
       
  1649 }
       
  1650 
       
  1651 #endif