symbian-qemu-0.9.1-12/python-2.6.1/Lib/sre_parse.py
changeset 1 2fb8b9db1c86
equal deleted inserted replaced
0:ffa851df0825 1:2fb8b9db1c86
       
     1 #
       
     2 # Secret Labs' Regular Expression Engine
       
     3 #
       
     4 # convert re-style regular expression to sre pattern
       
     5 #
       
     6 # Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
       
     7 #
       
     8 # See the sre.py file for information on usage and redistribution.
       
     9 #
       
    10 
       
    11 """Internal support module for sre"""
       
    12 
       
    13 # XXX: show string offset and offending character for all errors
       
    14 
       
    15 import sys
       
    16 
       
    17 from sre_constants import *
       
    18 
       
    19 def set(seq):
       
    20     s = {}
       
    21     for elem in seq:
       
    22         s[elem] = 1
       
    23     return s
       
    24 
       
# Characters with a special meaning outside a character class; a token
# whose first character is not listed here is parsed as a plain literal.
SPECIAL_CHARS = ".\\[{()*+?^$|"
# Characters that start a repetition operator.
REPEAT_CHARS = "*+?{"

# Membership tables (dicts built by the module-level set() helper).
DIGITS = set("0123456789")

OCTDIGITS = set("01234567")
HEXDIGITS = set("0123456789abcdefABCDEF")

# Characters skipped by the parser when SRE_FLAG_VERBOSE is set.
WHITESPACE = set(" \t\n\r\v\f")

# Simple character escapes: two-character escape token -> (LITERAL, ordinal).
ESCAPES = {
    r"\a": (LITERAL, ord("\a")),
    r"\b": (LITERAL, ord("\b")),
    r"\f": (LITERAL, ord("\f")),
    r"\n": (LITERAL, ord("\n")),
    r"\r": (LITERAL, ord("\r")),
    r"\t": (LITERAL, ord("\t")),
    r"\v": (LITERAL, ord("\v")),
    r"\\": (LITERAL, ord("\\"))
}

# Category escapes: two-character escape token -> position assertion (AT)
# or single-member character set (IN).  Note that r"\b" appears in both
# ESCAPES and CATEGORIES; which meaning applies depends on the table the
# caller consults first.
CATEGORIES = {
    r"\A": (AT, AT_BEGINNING_STRING), # start of string
    r"\b": (AT, AT_BOUNDARY),
    r"\B": (AT, AT_NON_BOUNDARY),
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
    r"\Z": (AT, AT_END_STRING), # end of string
}

# Inline flag letters accepted in a "(?...)" group -> SRE flag bit.
FLAGS = {
    # standard flags
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
    # extensions
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
}
       
    70 
       
    71 class Pattern:
       
    72     # master pattern object.  keeps track of global attributes
       
    73     def __init__(self):
       
    74         self.flags = 0
       
    75         self.open = []
       
    76         self.groups = 1
       
    77         self.groupdict = {}
       
    78     def opengroup(self, name=None):
       
    79         gid = self.groups
       
    80         self.groups = gid + 1
       
    81         if name is not None:
       
    82             ogid = self.groupdict.get(name, None)
       
    83             if ogid is not None:
       
    84                 raise error, ("redefinition of group name %s as group %d; "
       
    85                               "was group %d" % (repr(name), gid,  ogid))
       
    86             self.groupdict[name] = gid
       
    87         self.open.append(gid)
       
    88         return gid
       
    89     def closegroup(self, gid):
       
    90         self.open.remove(gid)
       
    91     def checkgroup(self, gid):
       
    92         return gid < self.groups and gid not in self.open
       
    93 
       
    94 class SubPattern:
       
    95     # a subpattern, in intermediate form
       
    96     def __init__(self, pattern, data=None):
       
    97         self.pattern = pattern
       
    98         if data is None:
       
    99             data = []
       
   100         self.data = data
       
   101         self.width = None
       
   102     def dump(self, level=0):
       
   103         nl = 1
       
   104         seqtypes = type(()), type([])
       
   105         for op, av in self.data:
       
   106             print level*"  " + op,; nl = 0
       
   107             if op == "in":
       
   108                 # member sublanguage
       
   109                 print; nl = 1
       
   110                 for op, a in av:
       
   111                     print (level+1)*"  " + op, a
       
   112             elif op == "branch":
       
   113                 print; nl = 1
       
   114                 i = 0
       
   115                 for a in av[1]:
       
   116                     if i > 0:
       
   117                         print level*"  " + "or"
       
   118                     a.dump(level+1); nl = 1
       
   119                     i = i + 1
       
   120             elif type(av) in seqtypes:
       
   121                 for a in av:
       
   122                     if isinstance(a, SubPattern):
       
   123                         if not nl: print
       
   124                         a.dump(level+1); nl = 1
       
   125                     else:
       
   126                         print a, ; nl = 0
       
   127             else:
       
   128                 print av, ; nl = 0
       
   129             if not nl: print
       
   130     def __repr__(self):
       
   131         return repr(self.data)
       
   132     def __len__(self):
       
   133         return len(self.data)
       
   134     def __delitem__(self, index):
       
   135         del self.data[index]
       
   136     def __getitem__(self, index):
       
   137         if isinstance(index, slice):
       
   138             return SubPattern(self.pattern, self.data[index])
       
   139         return self.data[index]
       
   140     def __setitem__(self, index, code):
       
   141         self.data[index] = code
       
   142     def insert(self, index, code):
       
   143         self.data.insert(index, code)
       
   144     def append(self, code):
       
   145         self.data.append(code)
       
   146     def getwidth(self):
       
   147         # determine the width (min, max) for this subpattern
       
   148         if self.width:
       
   149             return self.width
       
   150         lo = hi = 0L
       
   151         UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
       
   152         REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
       
   153         for op, av in self.data:
       
   154             if op is BRANCH:
       
   155                 i = sys.maxint
       
   156                 j = 0
       
   157                 for av in av[1]:
       
   158                     l, h = av.getwidth()
       
   159                     i = min(i, l)
       
   160                     j = max(j, h)
       
   161                 lo = lo + i
       
   162                 hi = hi + j
       
   163             elif op is CALL:
       
   164                 i, j = av.getwidth()
       
   165                 lo = lo + i
       
   166                 hi = hi + j
       
   167             elif op is SUBPATTERN:
       
   168                 i, j = av[1].getwidth()
       
   169                 lo = lo + i
       
   170                 hi = hi + j
       
   171             elif op in REPEATCODES:
       
   172                 i, j = av[2].getwidth()
       
   173                 lo = lo + long(i) * av[0]
       
   174                 hi = hi + long(j) * av[1]
       
   175             elif op in UNITCODES:
       
   176                 lo = lo + 1
       
   177                 hi = hi + 1
       
   178             elif op == SUCCESS:
       
   179                 break
       
   180         self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
       
   181         return self.width
       
   182 
       
   183 class Tokenizer:
       
   184     def __init__(self, string):
       
   185         self.string = string
       
   186         self.index = 0
       
   187         self.__next()
       
   188     def __next(self):
       
   189         if self.index >= len(self.string):
       
   190             self.next = None
       
   191             return
       
   192         char = self.string[self.index]
       
   193         if char[0] == "\\":
       
   194             try:
       
   195                 c = self.string[self.index + 1]
       
   196             except IndexError:
       
   197                 raise error, "bogus escape (end of line)"
       
   198             char = char + c
       
   199         self.index = self.index + len(char)
       
   200         self.next = char
       
   201     def match(self, char, skip=1):
       
   202         if char == self.next:
       
   203             if skip:
       
   204                 self.__next()
       
   205             return 1
       
   206         return 0
       
   207     def get(self):
       
   208         this = self.next
       
   209         self.__next()
       
   210         return this
       
   211     def tell(self):
       
   212         return self.index, self.next
       
   213     def seek(self, index):
       
   214         self.index, self.next = index
       
   215 
       
   216 def isident(char):
       
   217     return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
       
   218 
       
   219 def isdigit(char):
       
   220     return "0" <= char <= "9"
       
   221 
       
   222 def isname(name):
       
   223     # check that group name is a valid string
       
   224     if not isident(name[0]):
       
   225         return False
       
   226     for char in name[1:]:
       
   227         if not isident(char) and not isdigit(char):
       
   228             return False
       
   229     return True
       
   230 
       
   231 def _class_escape(source, escape):
       
   232     # handle escape code inside character class
       
   233     code = ESCAPES.get(escape)
       
   234     if code:
       
   235         return code
       
   236     code = CATEGORIES.get(escape)
       
   237     if code:
       
   238         return code
       
   239     try:
       
   240         c = escape[1:2]
       
   241         if c == "x":
       
   242             # hexadecimal escape (exactly two digits)
       
   243             while source.next in HEXDIGITS and len(escape) < 4:
       
   244                 escape = escape + source.get()
       
   245             escape = escape[2:]
       
   246             if len(escape) != 2:
       
   247                 raise error, "bogus escape: %s" % repr("\\" + escape)
       
   248             return LITERAL, int(escape, 16) & 0xff
       
   249         elif c in OCTDIGITS:
       
   250             # octal escape (up to three digits)
       
   251             while source.next in OCTDIGITS and len(escape) < 4:
       
   252                 escape = escape + source.get()
       
   253             escape = escape[1:]
       
   254             return LITERAL, int(escape, 8) & 0xff
       
   255         elif c in DIGITS:
       
   256             raise error, "bogus escape: %s" % repr(escape)
       
   257         if len(escape) == 2:
       
   258             return LITERAL, ord(escape[1])
       
   259     except ValueError:
       
   260         pass
       
   261     raise error, "bogus escape: %s" % repr(escape)
       
   262 
       
   263 def _escape(source, escape, state):
       
   264     # handle escape code in expression
       
   265     code = CATEGORIES.get(escape)
       
   266     if code:
       
   267         return code
       
   268     code = ESCAPES.get(escape)
       
   269     if code:
       
   270         return code
       
   271     try:
       
   272         c = escape[1:2]
       
   273         if c == "x":
       
   274             # hexadecimal escape
       
   275             while source.next in HEXDIGITS and len(escape) < 4:
       
   276                 escape = escape + source.get()
       
   277             if len(escape) != 4:
       
   278                 raise ValueError
       
   279             return LITERAL, int(escape[2:], 16) & 0xff
       
   280         elif c == "0":
       
   281             # octal escape
       
   282             while source.next in OCTDIGITS and len(escape) < 4:
       
   283                 escape = escape + source.get()
       
   284             return LITERAL, int(escape[1:], 8) & 0xff
       
   285         elif c in DIGITS:
       
   286             # octal escape *or* decimal group reference (sigh)
       
   287             if source.next in DIGITS:
       
   288                 escape = escape + source.get()
       
   289                 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
       
   290                     source.next in OCTDIGITS):
       
   291                     # got three octal digits; this is an octal escape
       
   292                     escape = escape + source.get()
       
   293                     return LITERAL, int(escape[1:], 8) & 0xff
       
   294             # not an octal escape, so this is a group reference
       
   295             group = int(escape[1:])
       
   296             if group < state.groups:
       
   297                 if not state.checkgroup(group):
       
   298                     raise error, "cannot refer to open group"
       
   299                 return GROUPREF, group
       
   300             raise ValueError
       
   301         if len(escape) == 2:
       
   302             return LITERAL, ord(escape[1])
       
   303     except ValueError:
       
   304         pass
       
   305     raise error, "bogus escape: %s" % repr(escape)
       
   306 
       
   307 def _parse_sub(source, state, nested=1):
       
   308     # parse an alternation: a|b|c
       
   309 
       
   310     items = []
       
   311     itemsappend = items.append
       
   312     sourcematch = source.match
       
   313     while 1:
       
   314         itemsappend(_parse(source, state))
       
   315         if sourcematch("|"):
       
   316             continue
       
   317         if not nested:
       
   318             break
       
   319         if not source.next or sourcematch(")", 0):
       
   320             break
       
   321         else:
       
   322             raise error, "pattern not properly closed"
       
   323 
       
   324     if len(items) == 1:
       
   325         return items[0]
       
   326 
       
   327     subpattern = SubPattern(state)
       
   328     subpatternappend = subpattern.append
       
   329 
       
   330     # check if all items share a common prefix
       
   331     while 1:
       
   332         prefix = None
       
   333         for item in items:
       
   334             if not item:
       
   335                 break
       
   336             if prefix is None:
       
   337                 prefix = item[0]
       
   338             elif item[0] != prefix:
       
   339                 break
       
   340         else:
       
   341             # all subitems start with a common "prefix".
       
   342             # move it out of the branch
       
   343             for item in items:
       
   344                 del item[0]
       
   345             subpatternappend(prefix)
       
   346             continue # check next one
       
   347         break
       
   348 
       
   349     # check if the branch can be replaced by a character set
       
   350     for item in items:
       
   351         if len(item) != 1 or item[0][0] != LITERAL:
       
   352             break
       
   353     else:
       
   354         # we can store this as a character set instead of a
       
   355         # branch (the compiler may optimize this even more)
       
   356         set = []
       
   357         setappend = set.append
       
   358         for item in items:
       
   359             setappend(item[0])
       
   360         subpatternappend((IN, set))
       
   361         return subpattern
       
   362 
       
   363     subpattern.append((BRANCH, (None, items)))
       
   364     return subpattern
       
   365 
       
   366 def _parse_sub_cond(source, state, condgroup):
       
   367     item_yes = _parse(source, state)
       
   368     if source.match("|"):
       
   369         item_no = _parse(source, state)
       
   370         if source.match("|"):
       
   371             raise error, "conditional backref with more than two branches"
       
   372     else:
       
   373         item_no = None
       
   374     if source.next and not source.match(")", 0):
       
   375         raise error, "pattern not properly closed"
       
   376     subpattern = SubPattern(state)
       
   377     subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
       
   378     return subpattern
       
   379 
       
# Membership tables used by _parse() (dicts built by the module-level
# set() helper).
# Tokens that end a subpattern: alternation bar and closing parenthesis.
_PATTERNENDERS = set("|)")
# Characters that, right after "(?", start an assertion.
_ASSERTCHARS = set("=!<")
# Characters that must follow "(?<" in an assertion.
_LOOKBEHINDASSERTCHARS = set("=!")
# Opcodes of the two repeat operators.
_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT])
       
   384 
       
def _parse(source, state):
    """Parse one alternative of a pattern and return it as a SubPattern.

    Tokens are read from source until the lookahead token is "|" or ")"
    (which is left unconsumed) or the input is exhausted.  state is the
    shared pattern state: its flags are read and may be updated by
    inline flags, and groups are opened and closed through it.  Raises
    error for malformed input.
    """
    # parse a simple pattern
    subpattern = SubPattern(state)

    # precompute constants into local variables
    subpatternappend = subpattern.append
    sourceget = source.get
    sourcematch = source.match
    _len = len
    PATTERNENDERS = _PATTERNENDERS
    ASSERTCHARS = _ASSERTCHARS
    LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
    REPEATCODES = _REPEATCODES

    while 1:

        if source.next in PATTERNENDERS:
            break # end of subpattern
        this = sourceget()
        if this is None:
            break # end of pattern

        if state.flags & SRE_FLAG_VERBOSE:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                # discard everything up to the end of the line or input
                while 1:
                    this = sourceget()
                    if this in (None, "\n"):
                        break
                continue

        if this and this[0] not in SPECIAL_CHARS:
            subpatternappend((LITERAL, ord(this)))

        elif this == "[":
            # character set
            set = []
            setappend = set.append
##          if sourcematch(":"):
##              pass # handle character classes
            if sourcematch("^"):
                setappend((NEGATE, None))
            # check remaining characters
            # (start is a snapshot of the set before any member is added,
            # so a "]" seen while the set still equals it is a literal)
            start = set[:]
            while 1:
                this = sourceget()
                if this == "]" and set != start:
                    break
                elif this and this[0] == "\\":
                    code1 = _class_escape(source, this)
                elif this:
                    code1 = LITERAL, ord(this)
                else:
                    raise error, "unexpected end of regular expression"
                if sourcematch("-"):
                    # potential range
                    this = sourceget()
                    if this == "]":
                        # "-" just before the closing bracket is a literal
                        if code1[0] is IN:
                            # unwrap the single member of a category escape
                            code1 = code1[1][0]
                        setappend(code1)
                        setappend((LITERAL, ord("-")))
                        break
                    elif this:
                        if this[0] == "\\":
                            code2 = _class_escape(source, this)
                        else:
                            code2 = LITERAL, ord(this)
                        # both ends of a range must be plain literals
                        if code1[0] != LITERAL or code2[0] != LITERAL:
                            raise error, "bad character range"
                        lo = code1[1]
                        hi = code2[1]
                        if hi < lo:
                            raise error, "bad character range"
                        setappend((RANGE, (lo, hi)))
                    else:
                        raise error, "unexpected end of regular expression"
                else:
                    if code1[0] is IN:
                        # unwrap the single member of a category escape
                        code1 = code1[1][0]
                    setappend(code1)

            # XXX: <fl> should move set optimization to compiler!
            if _len(set)==1 and set[0][0] is LITERAL:
                subpatternappend(set[0]) # optimization
            elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
                subpatternappend((NOT_LITERAL, set[1][1])) # optimization
            else:
                # XXX: <fl> should add charmap optimization here
                subpatternappend((IN, set))

        elif this and this[0] in REPEAT_CHARS:
            # repeat previous item
            if this == "?":
                min, max = 0, 1
            elif this == "*":
                min, max = 0, MAXREPEAT

            elif this == "+":
                min, max = 1, MAXREPEAT
            elif this == "{":
                if source.next == "}":
                    # "{}" is not a repeat; keep the "{" as a literal
                    subpatternappend((LITERAL, ord(this)))
                    continue
                # remember the position so the scan can be undone below
                here = source.tell()
                min, max = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo = lo + source.get()
                if sourcematch(","):
                    while source.next in DIGITS:
                        hi = hi + sourceget()
                else:
                    hi = lo
                if not sourcematch("}"):
                    # not a well-formed {m,n}: treat "{" as a literal and
                    # rewind so the following tokens are parsed normally
                    subpatternappend((LITERAL, ord(this)))
                    source.seek(here)
                    continue
                if lo:
                    min = int(lo)
                if hi:
                    max = int(hi)
                if max < min:
                    raise error, "bad repeat interval"
            else:
                raise error, "not supported"
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                item = None
            if not item or (_len(item) == 1 and item[0][0] == AT):
                raise error, "nothing to repeat"
            if item[0][0] in REPEATCODES:
                raise error, "multiple repeat"
            # a trailing "?" selects the MIN_REPEAT form of the operator
            if sourcematch("?"):
                subpattern[-1] = (MIN_REPEAT, (min, max, item))
            else:
                subpattern[-1] = (MAX_REPEAT, (min, max, item))

        elif this == ".":
            subpatternappend((ANY, None))

        elif this == "(":
            # group is used as a small state code while the group header
            # is examined: 1 = capturing group, 2 = group without an id,
            # 0 = no group body to parse (only a closing ")" may follow)
            group = 1
            name = None
            condgroup = None
            if sourcematch("?"):
                group = 0
                # options
                if sourcematch("P"):
                    # python extensions
                    if sourcematch("<"):
                        # named group: skip forward to end of name
                        name = ""
                        while 1:
                            char = sourceget()
                            if char is None:
                                raise error, "unterminated name"
                            if char == ">":
                                break
                            name = name + char
                        group = 1
                        if not isname(name):
                            raise error, "bad character in group name"
                    elif sourcematch("="):
                        # named backreference
                        name = ""
                        while 1:
                            char = sourceget()
                            if char is None:
                                raise error, "unterminated name"
                            if char == ")":
                                break
                            name = name + char
                        if not isname(name):
                            raise error, "bad character in group name"
                        gid = state.groupdict.get(name)
                        if gid is None:
                            raise error, "unknown group name"
                        subpatternappend((GROUPREF, gid))
                        continue
                    else:
                        char = sourceget()
                        if char is None:
                            raise error, "unexpected end of pattern"
                        raise error, "unknown specifier: ?P%s" % char
                elif sourcematch(":"):
                    # non-capturing group
                    group = 2
                elif sourcematch("#"):
                    # comment
                    while 1:
                        if source.next is None or source.next == ")":
                            break
                        sourceget()
                    if not sourcematch(")"):
                        raise error, "unbalanced parenthesis"
                    continue
                elif source.next in ASSERTCHARS:
                    # lookahead and lookbehind assertions
                    char = sourceget()
                    dir = 1
                    if char == "<":
                        if source.next not in LOOKBEHINDASSERTCHARS:
                            raise error, "syntax error"
                        dir = -1 # lookbehind
                        char = sourceget()
                    p = _parse_sub(source, state)
                    if not sourcematch(")"):
                        raise error, "unbalanced parenthesis"
                    # "=" is a positive assertion, "!" a negative one
                    if char == "=":
                        subpatternappend((ASSERT, (dir, p)))
                    else:
                        subpatternappend((ASSERT_NOT, (dir, p)))
                    continue
                elif sourcematch("("):
                    # conditional backreference group
                    condname = ""
                    while 1:
                        char = sourceget()
                        if char is None:
                            raise error, "unterminated name"
                        if char == ")":
                            break
                        condname = condname + char
                    group = 2
                    # the condition is either a group name or a number
                    if isname(condname):
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            raise error, "unknown group name"
                    else:
                        try:
                            condgroup = int(condname)
                        except ValueError:
                            raise error, "bad character in group name"
                else:
                    # flags
                    if not source.next in FLAGS:
                        raise error, "unexpected end of pattern"
                    while source.next in FLAGS:
                        state.flags = state.flags | FLAGS[sourceget()]
            if group:
                # parse group contents
                if group == 2:
                    # anonymous group
                    group = None
                else:
                    group = state.opengroup(name)
                # NOTE(review): a condition number of 0 is falsy here, so
                # such a group is parsed as a plain group -- confirm this
                # is intended
                if condgroup:
                    p = _parse_sub_cond(source, state, condgroup)
                else:
                    p = _parse_sub(source, state)
                if not sourcematch(")"):
                    raise error, "unbalanced parenthesis"
                if group is not None:
                    state.closegroup(group)
                subpatternappend((SUBPATTERN, (group, p)))
            else:
                # no group body (inline flags): the next token must be
                # the closing parenthesis
                while 1:
                    char = sourceget()
                    if char is None:
                        raise error, "unexpected end of pattern"
                    if char == ")":
                        break
                    raise error, "unknown extension"

        elif this == "^":
            subpatternappend((AT, AT_BEGINNING))

        elif this == "$":
            subpattern.append((AT, AT_END))

        elif this and this[0] == "\\":
            code = _escape(source, this, state)
            subpatternappend(code)

        else:
            raise error, "parser error"

    return subpattern
       
   668 
       
   669 def parse(str, flags=0, pattern=None):
       
   670     # parse 're' pattern into list of (opcode, argument) tuples
       
   671 
       
   672     source = Tokenizer(str)
       
   673 
       
   674     if pattern is None:
       
   675         pattern = Pattern()
       
   676     pattern.flags = flags
       
   677     pattern.str = str
       
   678 
       
   679     p = _parse_sub(source, pattern, 0)
       
   680 
       
   681     tail = source.get()
       
   682     if tail == ")":
       
   683         raise error, "unbalanced parenthesis"
       
   684     elif tail:
       
   685         raise error, "bogus characters at end of regular expression"
       
   686 
       
   687     if flags & SRE_FLAG_DEBUG:
       
   688         p.dump()
       
   689 
       
   690     if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
       
   691         # the VERBOSE flag was switched on inside the pattern.  to be
       
   692         # on the safe side, we'll parse the whole thing again...
       
   693         return parse(str, p.pattern.flags)
       
   694 
       
   695     return p
       
   696 
       
   697 def parse_template(source, pattern):
       
   698     # parse 're' replacement string into list of literals and
       
   699     # group references
       
   700     s = Tokenizer(source)
       
   701     sget = s.get
       
   702     p = []
       
   703     a = p.append
       
   704     def literal(literal, p=p, pappend=a):
       
   705         if p and p[-1][0] is LITERAL:
       
   706             p[-1] = LITERAL, p[-1][1] + literal
       
   707         else:
       
   708             pappend((LITERAL, literal))
       
   709     sep = source[:0]
       
   710     if type(sep) is type(""):
       
   711         makechar = chr
       
   712     else:
       
   713         makechar = unichr
       
   714     while 1:
       
   715         this = sget()
       
   716         if this is None:
       
   717             break # end of replacement string
       
   718         if this and this[0] == "\\":
       
   719             # group
       
   720             c = this[1:2]
       
   721             if c == "g":
       
   722                 name = ""
       
   723                 if s.match("<"):
       
   724                     while 1:
       
   725                         char = sget()
       
   726                         if char is None:
       
   727                             raise error, "unterminated group name"
       
   728                         if char == ">":
       
   729                             break
       
   730                         name = name + char
       
   731                 if not name:
       
   732                     raise error, "bad group name"
       
   733                 try:
       
   734                     index = int(name)
       
   735                     if index < 0:
       
   736                         raise error, "negative group number"
       
   737                 except ValueError:
       
   738                     if not isname(name):
       
   739                         raise error, "bad character in group name"
       
   740                     try:
       
   741                         index = pattern.groupindex[name]
       
   742                     except KeyError:
       
   743                         raise IndexError, "unknown group name"
       
   744                 a((MARK, index))
       
   745             elif c == "0":
       
   746                 if s.next in OCTDIGITS:
       
   747                     this = this + sget()
       
   748                     if s.next in OCTDIGITS:
       
   749                         this = this + sget()
       
   750                 literal(makechar(int(this[1:], 8) & 0xff))
       
   751             elif c in DIGITS:
       
   752                 isoctal = False
       
   753                 if s.next in DIGITS:
       
   754                     this = this + sget()
       
   755                     if (c in OCTDIGITS and this[2] in OCTDIGITS and
       
   756                         s.next in OCTDIGITS):
       
   757                         this = this + sget()
       
   758                         isoctal = True
       
   759                         literal(makechar(int(this[1:], 8) & 0xff))
       
   760                 if not isoctal:
       
   761                     a((MARK, int(this[1:])))
       
   762             else:
       
   763                 try:
       
   764                     this = makechar(ESCAPES[this][1])
       
   765                 except KeyError:
       
   766                     pass
       
   767                 literal(this)
       
   768         else:
       
   769             literal(this)
       
   770     # convert template to groups and literals lists
       
   771     i = 0
       
   772     groups = []
       
   773     groupsappend = groups.append
       
   774     literals = [None] * len(p)
       
   775     for c, s in p:
       
   776         if c is MARK:
       
   777             groupsappend((i, s))
       
   778             # literal[i] is already None
       
   779         else:
       
   780             literals[i] = s
       
   781         i = i + 1
       
   782     return groups, literals
       
   783 
       
   784 def expand_template(template, match):
       
   785     g = match.group
       
   786     sep = match.string[:0]
       
   787     groups, literals = template
       
   788     literals = literals[:]
       
   789     try:
       
   790         for index, group in groups:
       
   791             literals[index] = s = g(group)
       
   792             if s is None:
       
   793                 raise error, "unmatched group"
       
   794     except IndexError:
       
   795         raise error, "invalid group reference"
       
   796     return sep.join(literals)