symbian-qemu-0.9.1-12/python-win32-2.6.1/lib/tokenize.py
     1 """Tokenization help for Python programs.
       
     2 
       
     3 generate_tokens(readline) is a generator that breaks a stream of
       
     4 text into Python tokens.  It accepts a readline-like method which is called
       
     5 repeatedly to get the next line of input (or "" for EOF).  It generates
       
     6 5-tuples with these members:
       
     7 
       
     8     the token type (see token.py)
       
     9     the token (a string)
       
    10     the starting (row, column) indices of the token (a 2-tuple of ints)
       
    11     the ending (row, column) indices of the token (a 2-tuple of ints)
       
    12     the original line (string)
       
    13 
       
    14 It is designed to match the working of the Python tokenizer exactly, except
       
    15 that it produces COMMENT tokens for comments and gives type OP for all
       
    16 operators
       
    17 
       
    18 Older entry points
       
    19     tokenize_loop(readline, tokeneater)
       
    20     tokenize(readline, tokeneater=printtoken)
       
    21 are the same, except instead of generating tokens, tokeneater is a callback
       
    22 function to which the 5 fields described above are passed as 5 arguments,
       
    23 each time a new token is found."""
       
    24 
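# Illustrative usage sketch (not part of the original module): iterating the
# 5-tuples that generate_tokens() produces for a small piece of source, using
# the standard StringIO module as the readline source.
#
#     from StringIO import StringIO
#     import tokenize
#
#     source = "# a comment\nx = 1\n"
#     for toknum, tokval, start, end, line in \
#             tokenize.generate_tokens(StringIO(source).readline):
#         print "%-10s %-16r %s-%s" % (tokenize.tok_name[toknum], tokval, start, end)
#
# which prints something along the lines of:
#
#     COMMENT    '# a comment'    (1, 0)-(1, 11)
#     NL         '\n'             (1, 11)-(1, 12)
#     NAME       'x'              (2, 0)-(2, 1)
#     OP         '='              (2, 2)-(2, 3)
#     NUMBER     '1'              (2, 4)-(2, 5)
#     NEWLINE    '\n'             (2, 5)-(2, 6)
#     ENDMARKER  ''               (3, 0)-(3, 0)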
       
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro, Raymond Hettinger'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL", "untokenize"]
del x
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

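# A quick illustration of the helpers above (not part of the original module):
# each one joins its arguments into a single regex fragment, e.g.
#
#     group('==', '=')        ->  '(==|=)'
#     maybe(r'[eE][-+]?\d+')  ->  '([eE][-+]?\d+)?'
#     any(r'\\\r?\n')         ->  '(\\\r?\n)*'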
       
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

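# Illustrative sketch of the older callback interface (not part of the original
# module): tokenize() feeds each token to the tokeneater callback instead of
# generating tuples.
#
#     from StringIO import StringIO
#     import tokenize
#
#     def eater(toknum, tokval, start, end, line):
#         if toknum == tokenize.COMMENT:
#             print "comment on line %d: %r" % (start[0], tokval)
#
#     tokenize.tokenize(StringIO("x = 1  # set x\n").readline, eater)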
       
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        # Fallback for two-element (toknum, tokval) tokens: without position
        # information, whitespace can only be reconstructed approximately.
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only these two elements are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

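# Illustrative sketch (not part of the original module): a limited round trip
# through untokenize() that renames a variable.  Only the two-element
# (toknum, tokval) form is kept, so the output spacing is approximate.
#
#     from StringIO import StringIO
#     import tokenize
#
#     def rename(source, old, new):
#         result = []
#         readline = StringIO(source).readline
#         for toknum, tokval, _, _, _ in tokenize.generate_tokens(readline):
#             if toknum == tokenize.NAME and tokval == old:
#                 tokval = new
#             result.append((toknum, tokval))
#         return tokenize.untokenize(result)
#
#     print rename("total = total + 1\n", "total", "subtotal")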
       
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

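# Illustrative sketch (not part of the original module) of how indentation and
# blank lines show up in the token stream: NEWLINE ends a logical line, NL
# marks a non-logical line break, and INDENT/DEDENT bracket each block.
#
#     from StringIO import StringIO
#     import tokenize
#
#     toks = tokenize.generate_tokens(StringIO("if x:\n    y = 1\n\n").readline)
#     print [tokenize.tok_name[t[0]] for t in toks]
#
# which prints roughly:
#
#     ['NAME', 'NAME', 'OP', 'NEWLINE', 'INDENT', 'NAME', 'OP', 'NUMBER',
#      'NEWLINE', 'NL', 'DEDENT', 'ENDMARKER']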
       
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)