|
1 # |
|
2 # Secret Labs' Regular Expression Engine |
|
3 # |
|
4 # convert re-style regular expression to sre pattern |
|
5 # |
|
6 # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. |
|
7 # |
|
8 # See the sre.py file for information on usage and redistribution. |
|
9 # |
|
10 |
|
11 """Internal support module for sre""" |
|
12 |
|
13 # XXX: show string offset and offending character for all errors |
|
14 |
|
15 import sys |
|
16 |
|
17 from sre_constants import * |
|
18 |
|
19 def set(seq): |
|
20 s = {} |
|
21 for elem in seq: |
|
22 s[elem] = 1 |
|
23 return s |
|
24 |
|
25 SPECIAL_CHARS = ".\\[{()*+?^$|" |
|
26 REPEAT_CHARS = "*+?{" |
|
27 |
|
28 DIGITS = set("0123456789") |
|
29 |
|
30 OCTDIGITS = set("01234567") |
|
31 HEXDIGITS = set("0123456789abcdefABCDEF") |
|
32 |
|
33 WHITESPACE = set(" \t\n\r\v\f") |
|
34 |
|
35 ESCAPES = { |
|
36 r"\a": (LITERAL, ord("\a")), |
|
37 r"\b": (LITERAL, ord("\b")), |
|
38 r"\f": (LITERAL, ord("\f")), |
|
39 r"\n": (LITERAL, ord("\n")), |
|
40 r"\r": (LITERAL, ord("\r")), |
|
41 r"\t": (LITERAL, ord("\t")), |
|
42 r"\v": (LITERAL, ord("\v")), |
|
43 r"\\": (LITERAL, ord("\\")) |
|
44 } |
|
45 |
|
46 CATEGORIES = { |
|
47 r"\A": (AT, AT_BEGINNING_STRING), # start of string |
|
48 r"\b": (AT, AT_BOUNDARY), |
|
49 r"\B": (AT, AT_NON_BOUNDARY), |
|
50 r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), |
|
51 r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), |
|
52 r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), |
|
53 r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), |
|
54 r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), |
|
55 r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), |
|
56 r"\Z": (AT, AT_END_STRING), # end of string |
|
57 } |
|
58 |
|
59 FLAGS = { |
|
60 # standard flags |
|
61 "i": SRE_FLAG_IGNORECASE, |
|
62 "L": SRE_FLAG_LOCALE, |
|
63 "m": SRE_FLAG_MULTILINE, |
|
64 "s": SRE_FLAG_DOTALL, |
|
65 "x": SRE_FLAG_VERBOSE, |
|
66 # extensions |
|
67 "t": SRE_FLAG_TEMPLATE, |
|
68 "u": SRE_FLAG_UNICODE, |
|
69 } |
|
70 |
|
71 class Pattern: |
|
72 # master pattern object. keeps track of global attributes |
|
73 def __init__(self): |
|
74 self.flags = 0 |
|
75 self.open = [] |
|
76 self.groups = 1 |
|
77 self.groupdict = {} |
|
78 def opengroup(self, name=None): |
|
79 gid = self.groups |
|
80 self.groups = gid + 1 |
|
81 if name is not None: |
|
82 ogid = self.groupdict.get(name, None) |
|
83 if ogid is not None: |
|
84 raise error, ("redefinition of group name %s as group %d; " |
|
85 "was group %d" % (repr(name), gid, ogid)) |
|
86 self.groupdict[name] = gid |
|
87 self.open.append(gid) |
|
88 return gid |
|
89 def closegroup(self, gid): |
|
90 self.open.remove(gid) |
|
91 def checkgroup(self, gid): |
|
92 return gid < self.groups and gid not in self.open |
|
93 |
|
94 class SubPattern: |
|
95 # a subpattern, in intermediate form |
|
96 def __init__(self, pattern, data=None): |
|
97 self.pattern = pattern |
|
98 if data is None: |
|
99 data = [] |
|
100 self.data = data |
|
101 self.width = None |
|
102 def dump(self, level=0): |
|
103 nl = 1 |
|
104 seqtypes = type(()), type([]) |
|
105 for op, av in self.data: |
|
106 print level*" " + op,; nl = 0 |
|
107 if op == "in": |
|
108 # member sublanguage |
|
109 print; nl = 1 |
|
110 for op, a in av: |
|
111 print (level+1)*" " + op, a |
|
112 elif op == "branch": |
|
113 print; nl = 1 |
|
114 i = 0 |
|
115 for a in av[1]: |
|
116 if i > 0: |
|
117 print level*" " + "or" |
|
118 a.dump(level+1); nl = 1 |
|
119 i = i + 1 |
|
120 elif type(av) in seqtypes: |
|
121 for a in av: |
|
122 if isinstance(a, SubPattern): |
|
123 if not nl: print |
|
124 a.dump(level+1); nl = 1 |
|
125 else: |
|
126 print a, ; nl = 0 |
|
127 else: |
|
128 print av, ; nl = 0 |
|
129 if not nl: print |
|
130 def __repr__(self): |
|
131 return repr(self.data) |
|
132 def __len__(self): |
|
133 return len(self.data) |
|
134 def __delitem__(self, index): |
|
135 del self.data[index] |
|
136 def __getitem__(self, index): |
|
137 if isinstance(index, slice): |
|
138 return SubPattern(self.pattern, self.data[index]) |
|
139 return self.data[index] |
|
140 def __setitem__(self, index, code): |
|
141 self.data[index] = code |
|
142 def insert(self, index, code): |
|
143 self.data.insert(index, code) |
|
144 def append(self, code): |
|
145 self.data.append(code) |
|
146 def getwidth(self): |
|
147 # determine the width (min, max) for this subpattern |
|
148 if self.width: |
|
149 return self.width |
|
150 lo = hi = 0L |
|
151 UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY) |
|
152 REPEATCODES = (MIN_REPEAT, MAX_REPEAT) |
|
153 for op, av in self.data: |
|
154 if op is BRANCH: |
|
155 i = sys.maxint |
|
156 j = 0 |
|
157 for av in av[1]: |
|
158 l, h = av.getwidth() |
|
159 i = min(i, l) |
|
160 j = max(j, h) |
|
161 lo = lo + i |
|
162 hi = hi + j |
|
163 elif op is CALL: |
|
164 i, j = av.getwidth() |
|
165 lo = lo + i |
|
166 hi = hi + j |
|
167 elif op is SUBPATTERN: |
|
168 i, j = av[1].getwidth() |
|
169 lo = lo + i |
|
170 hi = hi + j |
|
171 elif op in REPEATCODES: |
|
172 i, j = av[2].getwidth() |
|
173 lo = lo + long(i) * av[0] |
|
174 hi = hi + long(j) * av[1] |
|
175 elif op in UNITCODES: |
|
176 lo = lo + 1 |
|
177 hi = hi + 1 |
|
178 elif op == SUCCESS: |
|
179 break |
|
180 self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint)) |
|
181 return self.width |
|
182 |
|
183 class Tokenizer: |
|
184 def __init__(self, string): |
|
185 self.string = string |
|
186 self.index = 0 |
|
187 self.__next() |
|
188 def __next(self): |
|
189 if self.index >= len(self.string): |
|
190 self.next = None |
|
191 return |
|
192 char = self.string[self.index] |
|
193 if char[0] == "\\": |
|
194 try: |
|
195 c = self.string[self.index + 1] |
|
196 except IndexError: |
|
197 raise error, "bogus escape (end of line)" |
|
198 char = char + c |
|
199 self.index = self.index + len(char) |
|
200 self.next = char |
|
201 def match(self, char, skip=1): |
|
202 if char == self.next: |
|
203 if skip: |
|
204 self.__next() |
|
205 return 1 |
|
206 return 0 |
|
207 def get(self): |
|
208 this = self.next |
|
209 self.__next() |
|
210 return this |
|
211 def tell(self): |
|
212 return self.index, self.next |
|
213 def seek(self, index): |
|
214 self.index, self.next = index |
|
215 |
|
216 def isident(char): |
|
217 return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_" |
|
218 |
|
219 def isdigit(char): |
|
220 return "0" <= char <= "9" |
|
221 |
|
222 def isname(name): |
|
223 # check that group name is a valid string |
|
224 if not isident(name[0]): |
|
225 return False |
|
226 for char in name[1:]: |
|
227 if not isident(char) and not isdigit(char): |
|
228 return False |
|
229 return True |
|
230 |
|
231 def _class_escape(source, escape): |
|
232 # handle escape code inside character class |
|
233 code = ESCAPES.get(escape) |
|
234 if code: |
|
235 return code |
|
236 code = CATEGORIES.get(escape) |
|
237 if code: |
|
238 return code |
|
239 try: |
|
240 c = escape[1:2] |
|
241 if c == "x": |
|
242 # hexadecimal escape (exactly two digits) |
|
243 while source.next in HEXDIGITS and len(escape) < 4: |
|
244 escape = escape + source.get() |
|
245 escape = escape[2:] |
|
246 if len(escape) != 2: |
|
247 raise error, "bogus escape: %s" % repr("\\" + escape) |
|
248 return LITERAL, int(escape, 16) & 0xff |
|
249 elif c in OCTDIGITS: |
|
250 # octal escape (up to three digits) |
|
251 while source.next in OCTDIGITS and len(escape) < 4: |
|
252 escape = escape + source.get() |
|
253 escape = escape[1:] |
|
254 return LITERAL, int(escape, 8) & 0xff |
|
255 elif c in DIGITS: |
|
256 raise error, "bogus escape: %s" % repr(escape) |
|
257 if len(escape) == 2: |
|
258 return LITERAL, ord(escape[1]) |
|
259 except ValueError: |
|
260 pass |
|
261 raise error, "bogus escape: %s" % repr(escape) |
|
262 |
|
263 def _escape(source, escape, state): |
|
264 # handle escape code in expression |
|
265 code = CATEGORIES.get(escape) |
|
266 if code: |
|
267 return code |
|
268 code = ESCAPES.get(escape) |
|
269 if code: |
|
270 return code |
|
271 try: |
|
272 c = escape[1:2] |
|
273 if c == "x": |
|
274 # hexadecimal escape |
|
275 while source.next in HEXDIGITS and len(escape) < 4: |
|
276 escape = escape + source.get() |
|
277 if len(escape) != 4: |
|
278 raise ValueError |
|
279 return LITERAL, int(escape[2:], 16) & 0xff |
|
280 elif c == "0": |
|
281 # octal escape |
|
282 while source.next in OCTDIGITS and len(escape) < 4: |
|
283 escape = escape + source.get() |
|
284 return LITERAL, int(escape[1:], 8) & 0xff |
|
285 elif c in DIGITS: |
|
286 # octal escape *or* decimal group reference (sigh) |
|
287 if source.next in DIGITS: |
|
288 escape = escape + source.get() |
|
289 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and |
|
290 source.next in OCTDIGITS): |
|
291 # got three octal digits; this is an octal escape |
|
292 escape = escape + source.get() |
|
293 return LITERAL, int(escape[1:], 8) & 0xff |
|
294 # not an octal escape, so this is a group reference |
|
295 group = int(escape[1:]) |
|
296 if group < state.groups: |
|
297 if not state.checkgroup(group): |
|
298 raise error, "cannot refer to open group" |
|
299 return GROUPREF, group |
|
300 raise ValueError |
|
301 if len(escape) == 2: |
|
302 return LITERAL, ord(escape[1]) |
|
303 except ValueError: |
|
304 pass |
|
305 raise error, "bogus escape: %s" % repr(escape) |
|
306 |
|
307 def _parse_sub(source, state, nested=1): |
|
308 # parse an alternation: a|b|c |
|
309 |
|
310 items = [] |
|
311 itemsappend = items.append |
|
312 sourcematch = source.match |
|
313 while 1: |
|
314 itemsappend(_parse(source, state)) |
|
315 if sourcematch("|"): |
|
316 continue |
|
317 if not nested: |
|
318 break |
|
319 if not source.next or sourcematch(")", 0): |
|
320 break |
|
321 else: |
|
322 raise error, "pattern not properly closed" |
|
323 |
|
324 if len(items) == 1: |
|
325 return items[0] |
|
326 |
|
327 subpattern = SubPattern(state) |
|
328 subpatternappend = subpattern.append |
|
329 |
|
330 # check if all items share a common prefix |
|
331 while 1: |
|
332 prefix = None |
|
333 for item in items: |
|
334 if not item: |
|
335 break |
|
336 if prefix is None: |
|
337 prefix = item[0] |
|
338 elif item[0] != prefix: |
|
339 break |
|
340 else: |
|
341 # all subitems start with a common "prefix". |
|
342 # move it out of the branch |
|
343 for item in items: |
|
344 del item[0] |
|
345 subpatternappend(prefix) |
|
346 continue # check next one |
|
347 break |
|
348 |
|
349 # check if the branch can be replaced by a character set |
|
350 for item in items: |
|
351 if len(item) != 1 or item[0][0] != LITERAL: |
|
352 break |
|
353 else: |
|
354 # we can store this as a character set instead of a |
|
355 # branch (the compiler may optimize this even more) |
|
356 set = [] |
|
357 setappend = set.append |
|
358 for item in items: |
|
359 setappend(item[0]) |
|
360 subpatternappend((IN, set)) |
|
361 return subpattern |
|
362 |
|
363 subpattern.append((BRANCH, (None, items))) |
|
364 return subpattern |
|
365 |
|
366 def _parse_sub_cond(source, state, condgroup): |
|
367 item_yes = _parse(source, state) |
|
368 if source.match("|"): |
|
369 item_no = _parse(source, state) |
|
370 if source.match("|"): |
|
371 raise error, "conditional backref with more than two branches" |
|
372 else: |
|
373 item_no = None |
|
374 if source.next and not source.match(")", 0): |
|
375 raise error, "pattern not properly closed" |
|
376 subpattern = SubPattern(state) |
|
377 subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) |
|
378 return subpattern |
|
379 |
|
380 _PATTERNENDERS = set("|)") |
|
381 _ASSERTCHARS = set("=!<") |
|
382 _LOOKBEHINDASSERTCHARS = set("=!") |
|
383 _REPEATCODES = set([MIN_REPEAT, MAX_REPEAT]) |
|
384 |
|
385 def _parse(source, state): |
|
386 # parse a simple pattern |
|
387 subpattern = SubPattern(state) |
|
388 |
|
389 # precompute constants into local variables |
|
390 subpatternappend = subpattern.append |
|
391 sourceget = source.get |
|
392 sourcematch = source.match |
|
393 _len = len |
|
394 PATTERNENDERS = _PATTERNENDERS |
|
395 ASSERTCHARS = _ASSERTCHARS |
|
396 LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS |
|
397 REPEATCODES = _REPEATCODES |
|
398 |
|
399 while 1: |
|
400 |
|
401 if source.next in PATTERNENDERS: |
|
402 break # end of subpattern |
|
403 this = sourceget() |
|
404 if this is None: |
|
405 break # end of pattern |
|
406 |
|
407 if state.flags & SRE_FLAG_VERBOSE: |
|
408 # skip whitespace and comments |
|
409 if this in WHITESPACE: |
|
410 continue |
|
411 if this == "#": |
|
412 while 1: |
|
413 this = sourceget() |
|
414 if this in (None, "\n"): |
|
415 break |
|
416 continue |
|
417 |
|
418 if this and this[0] not in SPECIAL_CHARS: |
|
419 subpatternappend((LITERAL, ord(this))) |
|
420 |
|
421 elif this == "[": |
|
422 # character set |
|
423 set = [] |
|
424 setappend = set.append |
|
425 ## if sourcematch(":"): |
|
426 ## pass # handle character classes |
|
427 if sourcematch("^"): |
|
428 setappend((NEGATE, None)) |
|
429 # check remaining characters |
|
430 start = set[:] |
|
431 while 1: |
|
432 this = sourceget() |
|
433 if this == "]" and set != start: |
|
434 break |
|
435 elif this and this[0] == "\\": |
|
436 code1 = _class_escape(source, this) |
|
437 elif this: |
|
438 code1 = LITERAL, ord(this) |
|
439 else: |
|
440 raise error, "unexpected end of regular expression" |
|
441 if sourcematch("-"): |
|
442 # potential range |
|
443 this = sourceget() |
|
444 if this == "]": |
|
445 if code1[0] is IN: |
|
446 code1 = code1[1][0] |
|
447 setappend(code1) |
|
448 setappend((LITERAL, ord("-"))) |
|
449 break |
|
450 elif this: |
|
451 if this[0] == "\\": |
|
452 code2 = _class_escape(source, this) |
|
453 else: |
|
454 code2 = LITERAL, ord(this) |
|
455 if code1[0] != LITERAL or code2[0] != LITERAL: |
|
456 raise error, "bad character range" |
|
457 lo = code1[1] |
|
458 hi = code2[1] |
|
459 if hi < lo: |
|
460 raise error, "bad character range" |
|
461 setappend((RANGE, (lo, hi))) |
|
462 else: |
|
463 raise error, "unexpected end of regular expression" |
|
464 else: |
|
465 if code1[0] is IN: |
|
466 code1 = code1[1][0] |
|
467 setappend(code1) |
|
468 |
|
469 # XXX: <fl> should move set optimization to compiler! |
|
470 if _len(set)==1 and set[0][0] is LITERAL: |
|
471 subpatternappend(set[0]) # optimization |
|
472 elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: |
|
473 subpatternappend((NOT_LITERAL, set[1][1])) # optimization |
|
474 else: |
|
475 # XXX: <fl> should add charmap optimization here |
|
476 subpatternappend((IN, set)) |
|
477 |
|
478 elif this and this[0] in REPEAT_CHARS: |
|
479 # repeat previous item |
|
480 if this == "?": |
|
481 min, max = 0, 1 |
|
482 elif this == "*": |
|
483 min, max = 0, MAXREPEAT |
|
484 |
|
485 elif this == "+": |
|
486 min, max = 1, MAXREPEAT |
|
487 elif this == "{": |
|
488 if source.next == "}": |
|
489 subpatternappend((LITERAL, ord(this))) |
|
490 continue |
|
491 here = source.tell() |
|
492 min, max = 0, MAXREPEAT |
|
493 lo = hi = "" |
|
494 while source.next in DIGITS: |
|
495 lo = lo + source.get() |
|
496 if sourcematch(","): |
|
497 while source.next in DIGITS: |
|
498 hi = hi + sourceget() |
|
499 else: |
|
500 hi = lo |
|
501 if not sourcematch("}"): |
|
502 subpatternappend((LITERAL, ord(this))) |
|
503 source.seek(here) |
|
504 continue |
|
505 if lo: |
|
506 min = int(lo) |
|
507 if hi: |
|
508 max = int(hi) |
|
509 if max < min: |
|
510 raise error, "bad repeat interval" |
|
511 else: |
|
512 raise error, "not supported" |
|
513 # figure out which item to repeat |
|
514 if subpattern: |
|
515 item = subpattern[-1:] |
|
516 else: |
|
517 item = None |
|
518 if not item or (_len(item) == 1 and item[0][0] == AT): |
|
519 raise error, "nothing to repeat" |
|
520 if item[0][0] in REPEATCODES: |
|
521 raise error, "multiple repeat" |
|
522 if sourcematch("?"): |
|
523 subpattern[-1] = (MIN_REPEAT, (min, max, item)) |
|
524 else: |
|
525 subpattern[-1] = (MAX_REPEAT, (min, max, item)) |
|
526 |
|
527 elif this == ".": |
|
528 subpatternappend((ANY, None)) |
|
529 |
|
530 elif this == "(": |
|
531 group = 1 |
|
532 name = None |
|
533 condgroup = None |
|
534 if sourcematch("?"): |
|
535 group = 0 |
|
536 # options |
|
537 if sourcematch("P"): |
|
538 # python extensions |
|
539 if sourcematch("<"): |
|
540 # named group: skip forward to end of name |
|
541 name = "" |
|
542 while 1: |
|
543 char = sourceget() |
|
544 if char is None: |
|
545 raise error, "unterminated name" |
|
546 if char == ">": |
|
547 break |
|
548 name = name + char |
|
549 group = 1 |
|
550 if not isname(name): |
|
551 raise error, "bad character in group name" |
|
552 elif sourcematch("="): |
|
553 # named backreference |
|
554 name = "" |
|
555 while 1: |
|
556 char = sourceget() |
|
557 if char is None: |
|
558 raise error, "unterminated name" |
|
559 if char == ")": |
|
560 break |
|
561 name = name + char |
|
562 if not isname(name): |
|
563 raise error, "bad character in group name" |
|
564 gid = state.groupdict.get(name) |
|
565 if gid is None: |
|
566 raise error, "unknown group name" |
|
567 subpatternappend((GROUPREF, gid)) |
|
568 continue |
|
569 else: |
|
570 char = sourceget() |
|
571 if char is None: |
|
572 raise error, "unexpected end of pattern" |
|
573 raise error, "unknown specifier: ?P%s" % char |
|
574 elif sourcematch(":"): |
|
575 # non-capturing group |
|
576 group = 2 |
|
577 elif sourcematch("#"): |
|
578 # comment |
|
579 while 1: |
|
580 if source.next is None or source.next == ")": |
|
581 break |
|
582 sourceget() |
|
583 if not sourcematch(")"): |
|
584 raise error, "unbalanced parenthesis" |
|
585 continue |
|
586 elif source.next in ASSERTCHARS: |
|
587 # lookahead assertions |
|
588 char = sourceget() |
|
589 dir = 1 |
|
590 if char == "<": |
|
591 if source.next not in LOOKBEHINDASSERTCHARS: |
|
592 raise error, "syntax error" |
|
593 dir = -1 # lookbehind |
|
594 char = sourceget() |
|
595 p = _parse_sub(source, state) |
|
596 if not sourcematch(")"): |
|
597 raise error, "unbalanced parenthesis" |
|
598 if char == "=": |
|
599 subpatternappend((ASSERT, (dir, p))) |
|
600 else: |
|
601 subpatternappend((ASSERT_NOT, (dir, p))) |
|
602 continue |
|
603 elif sourcematch("("): |
|
604 # conditional backreference group |
|
605 condname = "" |
|
606 while 1: |
|
607 char = sourceget() |
|
608 if char is None: |
|
609 raise error, "unterminated name" |
|
610 if char == ")": |
|
611 break |
|
612 condname = condname + char |
|
613 group = 2 |
|
614 if isname(condname): |
|
615 condgroup = state.groupdict.get(condname) |
|
616 if condgroup is None: |
|
617 raise error, "unknown group name" |
|
618 else: |
|
619 try: |
|
620 condgroup = int(condname) |
|
621 except ValueError: |
|
622 raise error, "bad character in group name" |
|
623 else: |
|
624 # flags |
|
625 if not source.next in FLAGS: |
|
626 raise error, "unexpected end of pattern" |
|
627 while source.next in FLAGS: |
|
628 state.flags = state.flags | FLAGS[sourceget()] |
|
629 if group: |
|
630 # parse group contents |
|
631 if group == 2: |
|
632 # anonymous group |
|
633 group = None |
|
634 else: |
|
635 group = state.opengroup(name) |
|
636 if condgroup: |
|
637 p = _parse_sub_cond(source, state, condgroup) |
|
638 else: |
|
639 p = _parse_sub(source, state) |
|
640 if not sourcematch(")"): |
|
641 raise error, "unbalanced parenthesis" |
|
642 if group is not None: |
|
643 state.closegroup(group) |
|
644 subpatternappend((SUBPATTERN, (group, p))) |
|
645 else: |
|
646 while 1: |
|
647 char = sourceget() |
|
648 if char is None: |
|
649 raise error, "unexpected end of pattern" |
|
650 if char == ")": |
|
651 break |
|
652 raise error, "unknown extension" |
|
653 |
|
654 elif this == "^": |
|
655 subpatternappend((AT, AT_BEGINNING)) |
|
656 |
|
657 elif this == "$": |
|
658 subpattern.append((AT, AT_END)) |
|
659 |
|
660 elif this and this[0] == "\\": |
|
661 code = _escape(source, this, state) |
|
662 subpatternappend(code) |
|
663 |
|
664 else: |
|
665 raise error, "parser error" |
|
666 |
|
667 return subpattern |
|
668 |
|
669 def parse(str, flags=0, pattern=None): |
|
670 # parse 're' pattern into list of (opcode, argument) tuples |
|
671 |
|
672 source = Tokenizer(str) |
|
673 |
|
674 if pattern is None: |
|
675 pattern = Pattern() |
|
676 pattern.flags = flags |
|
677 pattern.str = str |
|
678 |
|
679 p = _parse_sub(source, pattern, 0) |
|
680 |
|
681 tail = source.get() |
|
682 if tail == ")": |
|
683 raise error, "unbalanced parenthesis" |
|
684 elif tail: |
|
685 raise error, "bogus characters at end of regular expression" |
|
686 |
|
687 if flags & SRE_FLAG_DEBUG: |
|
688 p.dump() |
|
689 |
|
690 if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: |
|
691 # the VERBOSE flag was switched on inside the pattern. to be |
|
692 # on the safe side, we'll parse the whole thing again... |
|
693 return parse(str, p.pattern.flags) |
|
694 |
|
695 return p |
|
696 |
|
697 def parse_template(source, pattern): |
|
698 # parse 're' replacement string into list of literals and |
|
699 # group references |
|
700 s = Tokenizer(source) |
|
701 sget = s.get |
|
702 p = [] |
|
703 a = p.append |
|
704 def literal(literal, p=p, pappend=a): |
|
705 if p and p[-1][0] is LITERAL: |
|
706 p[-1] = LITERAL, p[-1][1] + literal |
|
707 else: |
|
708 pappend((LITERAL, literal)) |
|
709 sep = source[:0] |
|
710 if type(sep) is type(""): |
|
711 makechar = chr |
|
712 else: |
|
713 makechar = unichr |
|
714 while 1: |
|
715 this = sget() |
|
716 if this is None: |
|
717 break # end of replacement string |
|
718 if this and this[0] == "\\": |
|
719 # group |
|
720 c = this[1:2] |
|
721 if c == "g": |
|
722 name = "" |
|
723 if s.match("<"): |
|
724 while 1: |
|
725 char = sget() |
|
726 if char is None: |
|
727 raise error, "unterminated group name" |
|
728 if char == ">": |
|
729 break |
|
730 name = name + char |
|
731 if not name: |
|
732 raise error, "bad group name" |
|
733 try: |
|
734 index = int(name) |
|
735 if index < 0: |
|
736 raise error, "negative group number" |
|
737 except ValueError: |
|
738 if not isname(name): |
|
739 raise error, "bad character in group name" |
|
740 try: |
|
741 index = pattern.groupindex[name] |
|
742 except KeyError: |
|
743 raise IndexError, "unknown group name" |
|
744 a((MARK, index)) |
|
745 elif c == "0": |
|
746 if s.next in OCTDIGITS: |
|
747 this = this + sget() |
|
748 if s.next in OCTDIGITS: |
|
749 this = this + sget() |
|
750 literal(makechar(int(this[1:], 8) & 0xff)) |
|
751 elif c in DIGITS: |
|
752 isoctal = False |
|
753 if s.next in DIGITS: |
|
754 this = this + sget() |
|
755 if (c in OCTDIGITS and this[2] in OCTDIGITS and |
|
756 s.next in OCTDIGITS): |
|
757 this = this + sget() |
|
758 isoctal = True |
|
759 literal(makechar(int(this[1:], 8) & 0xff)) |
|
760 if not isoctal: |
|
761 a((MARK, int(this[1:]))) |
|
762 else: |
|
763 try: |
|
764 this = makechar(ESCAPES[this][1]) |
|
765 except KeyError: |
|
766 pass |
|
767 literal(this) |
|
768 else: |
|
769 literal(this) |
|
770 # convert template to groups and literals lists |
|
771 i = 0 |
|
772 groups = [] |
|
773 groupsappend = groups.append |
|
774 literals = [None] * len(p) |
|
775 for c, s in p: |
|
776 if c is MARK: |
|
777 groupsappend((i, s)) |
|
778 # literal[i] is already None |
|
779 else: |
|
780 literals[i] = s |
|
781 i = i + 1 |
|
782 return groups, literals |
|
783 |
|
784 def expand_template(template, match): |
|
785 g = match.group |
|
786 sep = match.string[:0] |
|
787 groups, literals = template |
|
788 literals = literals[:] |
|
789 try: |
|
790 for index, group in groups: |
|
791 literals[index] = s = g(group) |
|
792 if s is None: |
|
793 raise error, "unmatched group" |
|
794 except IndexError: |
|
795 raise error, "invalid group reference" |
|
796 return sep.join(literals) |