|
#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

import _sre, sys
import sre_parse
from sre_constants import *

assert _sre.MAGIC == MAGIC, "SRE module mismatch"

if _sre.CODESIZE == 2:
    MAXCODE = 65535
else:
    MAXCODE = 0xFFFFFFFFL
|
def _identityfunction(x):
    return x

def set(seq):
    s = {}
    for elem in seq:
        s[elem] = 1
    return s

_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
_SUCCESS_CODES = set([SUCCESS, FAILURE])
_ASSERT_CODES = set([ASSERT, ASSERT_NOT])
|
def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern
    emit = code.append
    _len = len
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    for op, av in pattern:
        if op in LITERAL_CODES:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = _identityfunction
            skip = _len(code); emit(0)
            _compile_charset(av, flags, code, fixup)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])
            else:
                emit(OPCODES[ANY])
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error, "internal: unsupported template operator"
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            elif _simple(av) and op is not REPEAT:
                if op is MAX_REPEAT:
                    emit(OPCODES[REPEAT_ONE])
                else:
                    emit(OPCODES[MIN_REPEAT_ONE])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            else:
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(OPCODES[op])
        elif op in ASSERT_CODES:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error, "look-behind requires fixed-width pattern"
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            emit(OPCODES[op])
            tail = []
            tailappend = tail.append
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(0) # end of branch
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            emit(OPCODES[op])
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(OPCODES[JUMP])
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise ValueError, ("unsupported operand type", op)
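# A note on the bytecode layout produced above: the recurring idiom
#     skip = _len(code); emit(0)
#     ... emit the body of the construct ...
#     code[skip] = _len(code) - skip
# reserves one code word in front of a construct, then patches it with the
# distance to the construct's end, so the matcher can step over the whole
# construct with a single relative skip.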
|
def _compile_charset(charset, flags, code, fixup=None):
    # compile charset subprogram
    emit = code.append
    if fixup is None:
        fixup = _identityfunction
    for op, av in _optimize_charset(charset, fixup):
        emit(OPCODES[op])
        if op is NEGATE:
            pass
        elif op is LITERAL:
            emit(fixup(av))
        elif op is RANGE:
            emit(fixup(av[0]))
            emit(fixup(av[1]))
        elif op is CHARSET:
            code.extend(av)
        elif op is BIGCHARSET:
            code.extend(av)
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        else:
            raise error, "internal: unsupported set operator"
    emit(OPCODES[FAILURE])
|
def _optimize_charset(charset, fixup):
    # internal: optimize character set
    out = []
    outappend = out.append
    charmap = [0]*256
    try:
        for op, av in charset:
            if op is NEGATE:
                outappend((op, av))
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset # cannot compress
    except IndexError:
        # character set contains unicode characters
        return _optimize_unicode(charset, fixup)
    # compress character map
    i = p = n = 0
    runs = []
    runsappend = runs.append
    for c in charmap:
        if c:
            if n == 0:
                p = i
            n = n + 1
        elif n:
            runsappend((p, n))
            n = 0
        i = i + 1
    if n:
        runsappend((p, n))
    if len(runs) <= 2:
        # use literal/range
        for p, n in runs:
            if n == 1:
                outappend((LITERAL, p))
            else:
                outappend((RANGE, (p, p+n-1)))
        if len(out) < len(charset):
            return out
    else:
        # use bitmap
        data = _mk_bitmap(charmap)
        outappend((CHARSET, data))
        return out
    return charset
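# For example (illustrative only): a parsed set holding the items
# (RANGE, (97, 122)), (LITERAL, 97), (LITERAL, 98) marks one contiguous run
# of charmap bits, so it compresses to the single item [(RANGE, (97, 122))];
# a set whose bits fall into more than two runs is emitted as a CHARSET
# bitmap instead.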
|
def _mk_bitmap(bits):
    data = []
    dataappend = data.append
    if _sre.CODESIZE == 2:
        start = (1, 0)
    else:
        start = (1L, 0L)
    m, v = start
    for c in bits:
        if c:
            v = v + m
        m = m + m
        if m > MAXCODE:
            dataappend(v)
            m, v = start
    return data
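# A worked example (assuming 16-bit code words, i.e. _sre.CODESIZE == 2): a
# 256-entry charmap with only bits 0 and 2 set packs into sixteen words, the
# first holding 1 + 4 = 5 and the remaining fifteen holding 0:
#
#     _mk_bitmap([1, 0, 1] + [0]*253) == [5] + [0]*15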
|
# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 16-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (128 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of chunks (16 words each).

# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only Cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.

# Matching can also be done efficiently: the more significant byte of
# the Unicode character is an index into the chunk-number mapping, and
# the less significant byte is a bit index into the chunk (just like
# CHARSET matching).

# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
# of the basic multilingual plane; an efficient representation
# for all of UTF-16 has not yet been developed. This means,
# in particular, that negated charsets cannot be represented as
# bigcharsets.
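
# As an illustration of the layout described above (a sketch for exposition
# only, not used by the engine; the real test is done in C by _sre on the
# packed word sequence): given the logical pieces that _optimize_unicode
# below builds, i.e. the 256-entry "mapping" of chunk numbers and the list
# of deduplicated 256-bit chunks, membership of a BMP character code works
# as follows.

def _bigcharset_contains(mapping, chunks, char):
    # the high byte selects the chunk, the low byte is a bit index within it
    block = mapping[char >> 8]
    return chunks[block][char & 255]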
|
def _optimize_unicode(charset, fixup):
    try:
        import array
    except ImportError:
        return charset
    charmap = [0]*65536
    negate = 0
    try:
        for op, av in charset:
            if op is NEGATE:
                negate = 1
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in xrange(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could expand category
                return charset # cannot compress
    except IndexError:
        # non-BMP characters
        return charset
    if negate:
        if sys.maxunicode != 65535:
            # XXX: negation does not work with big charsets
            return charset
        for i in xrange(65536):
            charmap[i] = not charmap[i]
    comps = {}
    mapping = [0]*256
    block = 0
    data = []
    for i in xrange(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            block = block + 1
            data = data + _mk_bitmap(chunk)
    header = [block]
    if _sre.CODESIZE == 2:
        code = 'H'
    else:
        code = 'I'
    # Convert block indices to byte array of 256 bytes
    mapping = array.array('b', mapping).tostring()
    # Convert byte array to word array
    mapping = array.array(code, mapping)
    assert mapping.itemsize == _sre.CODESIZE
    header = header + mapping.tolist()
    data[0:0] = header
    return [(BIGCHARSET, data)]
|
def _simple(av):
    # check if av is a "simple" operator
    lo, hi = av[2].getwidth()
    if lo == 0 and hi == MAXREPEAT:
        raise error, "nothing to repeat"
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN
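# In other words, a repeat target counts as "simple" when it always matches
# exactly one character (fixed width 1) and is not a parenthesised group --
# e.g. the body of "a*" or "[bc]+".  _compile uses this to pick the lighter
# REPEAT_ONE / MIN_REPEAT_ONE opcodes over the general REPEAT machinery.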
|
def _compile_info(code, pattern, flags):
    # internal: compile an info block. in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefixappend = prefix.append
    prefix_skip = 0
    charset = [] # not used
    charsetappend = charset.append
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefixappend(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                op, av = av[1][0]
                if op is LITERAL:
                    prefixappend(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charsetappend((op, av))
                elif op is BRANCH:
                    c = []
                    cappend = c.append
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            cappend((op, av))
                        else:
                            break
                    else:
                        charset = c
            elif op is BRANCH:
                c = []
                cappend = c.append
                for p in av[1]:
                    if not p:
                        break
                    op, av = p[0]
                    if op is LITERAL:
                        cappend((op, av))
                    else:
                        break
                else:
                    charset = c
            elif op is IN:
                charset = av
##     if prefix:
##         print "*** PREFIX", prefix, prefix_skip
##     if charset:
##         print "*** CHARSET", charset
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if len(prefix) == prefix_skip == len(pattern.data):
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table
        table = [-1] + ([0]*len(prefix))
        for i in xrange(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip
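# The overlap table built above is the classic Knuth-Morris-Pratt failure
# function: entry i holds the length of the longest proper prefix of
# prefix[:i+1] that is also a suffix of it.  For example (illustrative
# only), a literal prefix "aba" yields the stored table [0, 0, 1].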
|
try:
    unicode
except NameError:
    STRING_TYPES = (type(""),)
else:
    STRING_TYPES = (type(""), type(unicode("")))

def isstring(obj):
    for tp in STRING_TYPES:
        if isinstance(obj, tp):
            return 1
    return 0
|
def _code(p, flags):

    flags = p.pattern.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    code.append(OPCODES[SUCCESS])

    return code
|
def compile(p, flags=0):
    # internal: convert pattern list to internal format

    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    # print code

    # XXX: <fl> get rid of this limitation!
    if p.pattern.groups > 100:
        raise AssertionError(
            "sorry, but this version only supports 100 named groups"
            )

    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k

    return _sre.compile(
        pattern, flags | p.pattern.flags, code,
        p.pattern.groups-1,
        groupindex, indexgroup
        )