|
1 """A parser for SGML, using the derived class as a static DTD.""" |
|
2 |
|
3 # XXX This only supports those SGML features used by HTML. |
|
4 |
|
5 # XXX There should be a way to distinguish between PCDATA (parsed |
|
6 # character data -- the normal case), RCDATA (replaceable character |
|
7 # data -- only char and entity references and end tags are special) |
|
8 # and CDATA (character data -- only end tags are special). RCDATA is |
|
9 # not supported at all. |
|
10 |
|
11 |
|
12 import markupbase |
|
13 import re |
|
14 |
|
15 __all__ = ["SGMLParser", "SGMLParseError"] |
|
16 |
|
# Regular expressions used for parsing

# Next character that may start markup or a reference.
interesting = re.compile(r'[&<]')

# A reference or tag prefix that could still be completed by more input.
incomplete = re.compile(r'&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        r'<([a-zA-Z][^<>]*|'
                        r'/([a-zA-Z][^<>]*)?|'
                        r'![^<>]*)?')

# Named entity reference; group 1 is the name.  The final character class
# consumes the terminator (';' or any other non-alphanumeric character).
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')

# Decimal character reference; group 1 is the digits.  The final character
# class consumes the terminator (';' or any other non-digit).
charref = re.compile(r'&#([0-9]+)[^0-9]')

# Start of a start tag, including the SGML "<>" shorthand.
starttagopen = re.compile(r'<[>a-zA-Z]')

# Start of the SGML short-tag form "<tag/".
shorttagopen = re.compile(r'<[a-zA-Z][-.a-zA-Z0-9]*/')

# Complete short tag "<tag/data/"; group 1 is the tag, group 2 the data.
shorttag = re.compile(r'<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')

# End of a processing instruction.
piclose = re.compile(r'>')

# Either bracket; used to find where a tag stops.
endbracket = re.compile(r'[<>]')

# Tag name.
tagfind = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*')

# One attribute: group 1 is the name, group 2 the optional "= value" part,
# group 3 the value itself (still carrying its quotes, if any).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
|
37 |
|
38 |
|
class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
|
42 |
|
43 |
|
44 # SGML parser base class -- find tags and call handler functions. |
|
45 # Usage: p = SGMLParser(); p.feed(data); ...; p.close(). |
|
46 # The dtd is defined by deriving a class which defines methods |
|
47 # with special names to handle tags: start_foo and end_foo to handle |
|
48 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself. |
|
49 # (Tags are converted to lower case for this purpose.) The data |
|
50 # between tags is passed to the parser by calling self.handle_data() |
|
51 # with some data as argument (the data may be split up in arbitrary |
|
52 # chunks). Entity references are passed by calling |
|
53 # self.handle_entityref() with the entity reference as argument. |
|
54 |
|
class SGMLParser(markupbase.ParserBase):
    """Event-driven SGML/HTML tokenizer.

    Text is supplied with feed() and finished with close().  For each tag
    <foo> the parser looks up start_foo / do_foo / end_foo on the instance
    and calls it; anything without a handler goes to the unknown_*()
    hooks.  Character data is delivered through handle_data().
    """

    # Matches an entity or decimal character reference inside an attribute
    # value: group 1 is the entity name, group 2 the character number and
    # group 3 the optional trailing ';'.  Derived classes may override.
    entity_or_charref = re.compile('&(?:'
      '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
      ')(;?)')

    def __init__(self, verbose=0):
        """Initialize and reset this instance.

        verbose -- when true, report_unbalanced() prints diagnostics.
        """
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.__starttag_text = None
        self.rawdata = ''       # text fed but not yet consumed
        self.stack = []         # open tags that have a start_<tag> handler
        self.lasttag = '???'    # most recent start tag, used by "<>"
        self.nomoretags = 0     # true: everything up to EOF is plain data
        self.literal = 0        # true: only an end tag is special
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Intended for derived classes only.  Any arguments are ignored.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include newlines).  This just appends the text
        to the buffer; all the processing is done by goahead().
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data as if followed by an EOF marker."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError with the given message."""
        raise SGMLParseError(message)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything that is left is plain character data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Deliver the run of plain text before the next '<' or '&'.
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # Start tags are not special in literal mode.
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break       # tag not complete yet
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # An end tag terminates literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete: cannot yet tell whether "</" follows
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    # NOTE(review): parse_comment is not defined in this
                    # class; presumably inherited from
                    # markupbase.ParserBase -- confirm.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # parse_pi() returns a length, not an absolute index.
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    # NOTE(review): parse_declaration is not defined in
                    # this class; presumably inherited from
                    # markupbase.ParserBase -- confirm.
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    # References are not expanded in literal mode.
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumed one terminator character; give
                    # it back unless it was the ';' that closes the ref.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                # A stray '<' or '&': pass it through as data.
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break   # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            # At EOF whatever could not be parsed is flushed as data.
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner:
    _decl_otherchars = '='

    # Internal -- parse processing instr, return length or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    def get_starttag_text(self):
        """Return the text recorded for the most recent start tag, or None."""
        return self.__starttag_text

    # Internal -- handle starttag, return the index just past the tag
    # (an absolute position in rawdata), or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        # As a shortcut way to exit, this isn't so bad, but shouldn't
        # be used to locate the actual end of the start tag since the
        # < or > characters may be embedded in an attribute value.
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Attribute given without a value: use its own name.
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                    attrvalue[:1] == '"' == attrvalue[-1:]):
                    # strip quotes
                    attrvalue = attrvalue[1:-1]
                attrvalue = self.entity_or_charref.sub(
                    self._convert_ref, attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        if rawdata[j] == '>':
            # Consume the closing bracket; a '<' is left for the next tag.
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    # Internal -- convert entity or character reference found in an
    # attribute value.  'match' comes from entity_or_charref.  A reference
    # counts as unknown only when the converter returns None, matching
    # handle_charref() and handle_entityref(); unknown references are
    # left in the text unchanged.
    def _convert_ref(self, match):
        name, number, semicolon = match.groups()
        if number:
            replacement = self.convert_charref(number)
            if replacement is None:
                replacement = '&#%s%s' % (number, semicolon)
            return replacement
        if semicolon:
            replacement = self.convert_entityref(name)
            if replacement is None:
                replacement = '&%s;' % name
            return replacement
        # An entity name without the closing ';' is never expanded.
        return '&%s' % name

    # Internal -- parse endtag, return the index just past the tag
    # (an absolute position in rawdata), or -1 if not terminated
    def parse_endtag(self, i):
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    def finish_shorttag(self, tag, data):
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            # Only tags with a start_<tag> handler are tracked as open.
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        if not tag:
            # Empty end tag "</>": closes the innermost open element.
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                try:
                    method = getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    # A known tag that is not currently open.
                    self.report_unbalanced(tag)
                return
            # Locate the innermost (last) occurrence of the tag.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        # Close every element opened at or after that position.
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        method()

    # Example -- report an unbalanced </...> tag.
    def report_unbalanced(self, tag):
        if self.verbose:
            # Single-argument print(...) gives the same output whether
            # print is a statement or a function.
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: %s' % (self.stack,))

    def convert_charref(self, name):
        """Convert character reference, may be overridden.

        Returns None when name is not a decimal number in 0..255.
        """
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 255:
            return
        return self.convert_codepoint(n)

    def convert_codepoint(self, codepoint):
        """Return the character for an integer code point."""
        return chr(codepoint)

    def handle_charref(self, name):
        """Handle character reference, no need to override."""
        replacement = self.convert_charref(name)
        if replacement is None:
            self.unknown_charref(name)
        else:
            self.handle_data(replacement)

    # Definition of entities -- derived classes may override
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def convert_entityref(self, name):
        """Convert entity references.

        As an alternative to overriding this method; one can tailor the
        results by setting up the self.entitydefs mapping appropriately.
        Returns None when name is not in the mapping.
        """
        table = self.entitydefs
        if name in table:
            return table[name]
        else:
            return

    def handle_entityref(self, name):
        """Handle entity references, no need to override."""
        replacement = self.convert_entityref(name)
        if replacement is None:
            self.unknown_entityref(name)
        else:
            # Reuse the value already looked up instead of converting twice.
            self.handle_data(replacement)

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle declaration, could be overridden
    def handle_decl(self, decl):
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
|
454 |
|
455 |
|
class TestSGMLParser(SGMLParser):
    """Parser that prints every event it receives; used by test().

    Character data is buffered in self.testdata and printed by flush(),
    so adjacent chunks appear as one 'data:' line.  All output uses
    single-argument print(...) calls, which produce the same text whether
    print is a statement or a function.
    """

    def __init__(self, verbose=0):
        # testdata must exist before SGMLParser.__init__ runs.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer data, flushing once its repr() reaches 70 characters."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear any buffered character data."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: ' + repr(data))

    def handle_comment(self, data):
        """Print a comment, eliding the middle of long ones."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment: ' + r)

    def unknown_starttag(self, tag, attrs):
        """Print a start tag together with its attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            # Pieces are separated by single spaces, including one before
            # the closing '>'.
            pieces = ['start tag: <' + tag]
            for name, value in attrs:
                pieces.append(name + '=' + '"' + value + '"')
            pieces.append('>')
            print(' '.join(pieces))

    def unknown_endtag(self, tag):
        """Print an end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Print an entity reference that could not be converted."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Print a character reference that could not be converted."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        """Print an unrecognized declaration."""
        # NOTE(review): nothing in this class or SGMLParser calls this;
        # presumably invoked by markupbase.ParserBase -- confirm.
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Finish parsing, then print whatever data is still buffered."""
        SGMLParser.close(self)
        self.flush()
|
509 |
|
510 |
|
def test(args=None):
    """Parse a file one character at a time and print the events.

    args -- argument list; defaults to sys.argv[1:].  A leading '-s'
            selects the silent SGMLParser instead of TestSGMLParser.  The
            next argument is the file to read ('-' for standard input);
            it defaults to 'test.html'.

    Exits with status 1 if the file cannot be opened.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(filename, 'r')
        except IOError:
            # sys.exc_info() avoids version-specific "except ... as" syntax.
            msg = sys.exc_info()[1]
            print('%s : %s' % (filename, msg))
            sys.exit(1)
        try:
            data = f.read()
        finally:
            # Close the file even if reading fails.
            f.close()

    x = klass()
    # Feeding single characters exercises the incremental parsing paths.
    for c in data:
        x.feed(c)
    x.close()


if __name__ == '__main__':
    test()