|
1 """A parser for SGML, using the derived class as a static DTD.""" |
|
2 |
|
# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).  RCDATA is
# not supported at all.
|
10 |
|
11 |
|
12 from warnings import warnpy3k |
|
13 warnpy3k("the sgmllib module has been removed in Python 3.0", |
|
14 stacklevel=2) |
|
15 del warnpy3k |
|
16 |
|
17 import markupbase |
|
18 import re |
|
19 |
|
20 __all__ = ["SGMLParser", "SGMLParseError"] |
|
21 |
|
# Regular expressions used for parsing

# Either of the two characters that end a run of plain character data.
interesting = re.compile('[&<]')
# A reference, tag, end tag or declaration that may be cut off at the end
# of the buffer; goahead() uses it to decide whether to wait for more data.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        '<([a-zA-Z][^<>]*|'
                           '/([a-zA-Z][^<>]*)?|'
                           '![^<>]*)?')

# Both reference patterns consume one terminating character after the
# name/number, so they never match a reference sitting at the very end
# of the buffer.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by a letter, or the empty-tag shorthand '<>'.
starttagopen = re.compile('<[>a-zA-Z]')
# Opening part of the shorthand form <tag/data/.
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# Complete shorthand form; group 1 is the tag name, group 2 the data.
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# Terminator of a processing instruction.
piclose = re.compile('>')
# First '<' or '>' after a tag has been opened.
endbracket = re.compile('[<>]')
# Tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional '=value' part
# and group 3 the value itself (still quoted if it was quoted).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
|
42 |
|
43 |
|
class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
|
47 |
|
48 |
|
49 # SGML parser base class -- find tags and call handler functions. |
|
50 # Usage: p = SGMLParser(); p.feed(data); ...; p.close(). |
|
51 # The dtd is defined by deriving a class which defines methods |
|
52 # with special names to handle tags: start_foo and end_foo to handle |
|
53 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself. |
|
54 # (Tags are converted to lower case for this purpose.) The data |
|
55 # between tags is passed to the parser by calling self.handle_data() |
|
56 # with some data as argument (the data may be split up in arbitrary |
|
57 # chunks). Entity references are passed by calling |
|
58 # self.handle_entityref() with the entity reference as argument. |
|
59 |
|
60 class SGMLParser(markupbase.ParserBase): |
|
61 # Definition of entities -- derived classes may override |
|
62 entity_or_charref = re.compile('&(?:' |
|
63 '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)' |
|
64 ')(;?)') |
|
65 |
|
66 def __init__(self, verbose=0): |
|
67 """Initialize and reset this instance.""" |
|
68 self.verbose = verbose |
|
69 self.reset() |
|
70 |
|
71 def reset(self): |
|
72 """Reset this instance. Loses all unprocessed data.""" |
|
73 self.__starttag_text = None |
|
74 self.rawdata = '' |
|
75 self.stack = [] |
|
76 self.lasttag = '???' |
|
77 self.nomoretags = 0 |
|
78 self.literal = 0 |
|
79 markupbase.ParserBase.reset(self) |
|
80 |
|
81 def setnomoretags(self): |
|
82 """Enter literal mode (CDATA) till EOF. |
|
83 |
|
84 Intended for derived classes only. |
|
85 """ |
|
86 self.nomoretags = self.literal = 1 |
|
87 |
|
88 def setliteral(self, *args): |
|
89 """Enter literal mode (CDATA). |
|
90 |
|
91 Intended for derived classes only. |
|
92 """ |
|
93 self.literal = 1 |
|
94 |
|
95 def feed(self, data): |
|
96 """Feed some data to the parser. |
|
97 |
|
98 Call this as often as you want, with as little or as much text |
|
99 as you want (may include '\n'). (This just saves the text, |
|
100 all the processing is done by goahead().) |
|
101 """ |
|
102 |
|
103 self.rawdata = self.rawdata + data |
|
104 self.goahead(0) |
|
105 |
|
106 def close(self): |
|
107 """Handle the remaining data.""" |
|
108 self.goahead(1) |
|
109 |
|
110 def error(self, message): |
|
111 raise SGMLParseError(message) |
|
112 |
|
113 # Internal -- handle data as far as reasonable. May leave state |
|
114 # and data to be processed by a subsequent call. If 'end' is |
|
115 # true, force handling all data as if followed by EOF marker. |
|
116 def goahead(self, end): |
|
117 rawdata = self.rawdata |
|
118 i = 0 |
|
119 n = len(rawdata) |
|
120 while i < n: |
|
121 if self.nomoretags: |
|
122 self.handle_data(rawdata[i:n]) |
|
123 i = n |
|
124 break |
|
125 match = interesting.search(rawdata, i) |
|
126 if match: j = match.start() |
|
127 else: j = n |
|
128 if i < j: |
|
129 self.handle_data(rawdata[i:j]) |
|
130 i = j |
|
131 if i == n: break |
|
132 if rawdata[i] == '<': |
|
133 if starttagopen.match(rawdata, i): |
|
134 if self.literal: |
|
135 self.handle_data(rawdata[i]) |
|
136 i = i+1 |
|
137 continue |
|
138 k = self.parse_starttag(i) |
|
139 if k < 0: break |
|
140 i = k |
|
141 continue |
|
142 if rawdata.startswith("</", i): |
|
143 k = self.parse_endtag(i) |
|
144 if k < 0: break |
|
145 i = k |
|
146 self.literal = 0 |
|
147 continue |
|
148 if self.literal: |
|
149 if n > (i + 1): |
|
150 self.handle_data("<") |
|
151 i = i+1 |
|
152 else: |
|
153 # incomplete |
|
154 break |
|
155 continue |
|
156 if rawdata.startswith("<!--", i): |
|
157 # Strictly speaking, a comment is --.*-- |
|
158 # within a declaration tag <!...>. |
|
159 # This should be removed, |
|
160 # and comments handled only in parse_declaration. |
|
161 k = self.parse_comment(i) |
|
162 if k < 0: break |
|
163 i = k |
|
164 continue |
|
165 if rawdata.startswith("<?", i): |
|
166 k = self.parse_pi(i) |
|
167 if k < 0: break |
|
168 i = i+k |
|
169 continue |
|
170 if rawdata.startswith("<!", i): |
|
171 # This is some sort of declaration; in "HTML as |
|
172 # deployed," this should only be the document type |
|
173 # declaration ("<!DOCTYPE html...>"). |
|
174 k = self.parse_declaration(i) |
|
175 if k < 0: break |
|
176 i = k |
|
177 continue |
|
178 elif rawdata[i] == '&': |
|
179 if self.literal: |
|
180 self.handle_data(rawdata[i]) |
|
181 i = i+1 |
|
182 continue |
|
183 match = charref.match(rawdata, i) |
|
184 if match: |
|
185 name = match.group(1) |
|
186 self.handle_charref(name) |
|
187 i = match.end(0) |
|
188 if rawdata[i-1] != ';': i = i-1 |
|
189 continue |
|
190 match = entityref.match(rawdata, i) |
|
191 if match: |
|
192 name = match.group(1) |
|
193 self.handle_entityref(name) |
|
194 i = match.end(0) |
|
195 if rawdata[i-1] != ';': i = i-1 |
|
196 continue |
|
197 else: |
|
198 self.error('neither < nor & ??') |
|
199 # We get here only if incomplete matches but |
|
200 # nothing else |
|
201 match = incomplete.match(rawdata, i) |
|
202 if not match: |
|
203 self.handle_data(rawdata[i]) |
|
204 i = i+1 |
|
205 continue |
|
206 j = match.end(0) |
|
207 if j == n: |
|
208 break # Really incomplete |
|
209 self.handle_data(rawdata[i:j]) |
|
210 i = j |
|
211 # end while |
|
212 if end and i < n: |
|
213 self.handle_data(rawdata[i:n]) |
|
214 i = n |
|
215 self.rawdata = rawdata[i:] |
|
216 # XXX if end: check for empty stack |
|
217 |
|
218 # Extensions for the DOCTYPE scanner: |
|
219 _decl_otherchars = '=' |
|
220 |
|
221 # Internal -- parse processing instr, return length or -1 if not terminated |
|
222 def parse_pi(self, i): |
|
223 rawdata = self.rawdata |
|
224 if rawdata[i:i+2] != '<?': |
|
225 self.error('unexpected call to parse_pi()') |
|
226 match = piclose.search(rawdata, i+2) |
|
227 if not match: |
|
228 return -1 |
|
229 j = match.start(0) |
|
230 self.handle_pi(rawdata[i+2: j]) |
|
231 j = match.end(0) |
|
232 return j-i |
|
233 |
|
234 def get_starttag_text(self): |
|
235 return self.__starttag_text |
|
236 |
|
237 # Internal -- handle starttag, return length or -1 if not terminated |
|
238 def parse_starttag(self, i): |
|
239 self.__starttag_text = None |
|
240 start_pos = i |
|
241 rawdata = self.rawdata |
|
242 if shorttagopen.match(rawdata, i): |
|
243 # SGML shorthand: <tag/data/ == <tag>data</tag> |
|
244 # XXX Can data contain &... (entity or char refs)? |
|
245 # XXX Can data contain < or > (tag characters)? |
|
246 # XXX Can there be whitespace before the first /? |
|
247 match = shorttag.match(rawdata, i) |
|
248 if not match: |
|
249 return -1 |
|
250 tag, data = match.group(1, 2) |
|
251 self.__starttag_text = '<%s/' % tag |
|
252 tag = tag.lower() |
|
253 k = match.end(0) |
|
254 self.finish_shorttag(tag, data) |
|
255 self.__starttag_text = rawdata[start_pos:match.end(1) + 1] |
|
256 return k |
|
257 # XXX The following should skip matching quotes (' or ") |
|
258 # As a shortcut way to exit, this isn't so bad, but shouldn't |
|
259 # be used to locate the actual end of the start tag since the |
|
260 # < or > characters may be embedded in an attribute value. |
|
261 match = endbracket.search(rawdata, i+1) |
|
262 if not match: |
|
263 return -1 |
|
264 j = match.start(0) |
|
265 # Now parse the data between i+1 and j into a tag and attrs |
|
266 attrs = [] |
|
267 if rawdata[i:i+2] == '<>': |
|
268 # SGML shorthand: <> == <last open tag seen> |
|
269 k = j |
|
270 tag = self.lasttag |
|
271 else: |
|
272 match = tagfind.match(rawdata, i+1) |
|
273 if not match: |
|
274 self.error('unexpected call to parse_starttag') |
|
275 k = match.end(0) |
|
276 tag = rawdata[i+1:k].lower() |
|
277 self.lasttag = tag |
|
278 while k < j: |
|
279 match = attrfind.match(rawdata, k) |
|
280 if not match: break |
|
281 attrname, rest, attrvalue = match.group(1, 2, 3) |
|
282 if not rest: |
|
283 attrvalue = attrname |
|
284 else: |
|
285 if (attrvalue[:1] == "'" == attrvalue[-1:] or |
|
286 attrvalue[:1] == '"' == attrvalue[-1:]): |
|
287 # strip quotes |
|
288 attrvalue = attrvalue[1:-1] |
|
289 attrvalue = self.entity_or_charref.sub( |
|
290 self._convert_ref, attrvalue) |
|
291 attrs.append((attrname.lower(), attrvalue)) |
|
292 k = match.end(0) |
|
293 if rawdata[j] == '>': |
|
294 j = j+1 |
|
295 self.__starttag_text = rawdata[start_pos:j] |
|
296 self.finish_starttag(tag, attrs) |
|
297 return j |
|
298 |
|
299 # Internal -- convert entity or character reference |
|
300 def _convert_ref(self, match): |
|
301 if match.group(2): |
|
302 return self.convert_charref(match.group(2)) or \ |
|
303 '&#%s%s' % match.groups()[1:] |
|
304 elif match.group(3): |
|
305 return self.convert_entityref(match.group(1)) or \ |
|
306 '&%s;' % match.group(1) |
|
307 else: |
|
308 return '&%s' % match.group(1) |
|
309 |
|
310 # Internal -- parse endtag |
|
311 def parse_endtag(self, i): |
|
312 rawdata = self.rawdata |
|
313 match = endbracket.search(rawdata, i+1) |
|
314 if not match: |
|
315 return -1 |
|
316 j = match.start(0) |
|
317 tag = rawdata[i+2:j].strip().lower() |
|
318 if rawdata[j] == '>': |
|
319 j = j+1 |
|
320 self.finish_endtag(tag) |
|
321 return j |
|
322 |
|
323 # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>) |
|
324 def finish_shorttag(self, tag, data): |
|
325 self.finish_starttag(tag, []) |
|
326 self.handle_data(data) |
|
327 self.finish_endtag(tag) |
|
328 |
|
329 # Internal -- finish processing of start tag |
|
330 # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag |
|
331 def finish_starttag(self, tag, attrs): |
|
332 try: |
|
333 method = getattr(self, 'start_' + tag) |
|
334 except AttributeError: |
|
335 try: |
|
336 method = getattr(self, 'do_' + tag) |
|
337 except AttributeError: |
|
338 self.unknown_starttag(tag, attrs) |
|
339 return -1 |
|
340 else: |
|
341 self.handle_starttag(tag, method, attrs) |
|
342 return 0 |
|
343 else: |
|
344 self.stack.append(tag) |
|
345 self.handle_starttag(tag, method, attrs) |
|
346 return 1 |
|
347 |
|
348 # Internal -- finish processing of end tag |
|
349 def finish_endtag(self, tag): |
|
350 if not tag: |
|
351 found = len(self.stack) - 1 |
|
352 if found < 0: |
|
353 self.unknown_endtag(tag) |
|
354 return |
|
355 else: |
|
356 if tag not in self.stack: |
|
357 try: |
|
358 method = getattr(self, 'end_' + tag) |
|
359 except AttributeError: |
|
360 self.unknown_endtag(tag) |
|
361 else: |
|
362 self.report_unbalanced(tag) |
|
363 return |
|
364 found = len(self.stack) |
|
365 for i in range(found): |
|
366 if self.stack[i] == tag: found = i |
|
367 while len(self.stack) > found: |
|
368 tag = self.stack[-1] |
|
369 try: |
|
370 method = getattr(self, 'end_' + tag) |
|
371 except AttributeError: |
|
372 method = None |
|
373 if method: |
|
374 self.handle_endtag(tag, method) |
|
375 else: |
|
376 self.unknown_endtag(tag) |
|
377 del self.stack[-1] |
|
378 |
|
379 # Overridable -- handle start tag |
|
380 def handle_starttag(self, tag, method, attrs): |
|
381 method(attrs) |
|
382 |
|
383 # Overridable -- handle end tag |
|
384 def handle_endtag(self, tag, method): |
|
385 method() |
|
386 |
|
387 # Example -- report an unbalanced </...> tag. |
|
388 def report_unbalanced(self, tag): |
|
389 if self.verbose: |
|
390 print '*** Unbalanced </' + tag + '>' |
|
391 print '*** Stack:', self.stack |
|
392 |
|
393 def convert_charref(self, name): |
|
394 """Convert character reference, may be overridden.""" |
|
395 try: |
|
396 n = int(name) |
|
397 except ValueError: |
|
398 return |
|
399 if not 0 <= n <= 255: |
|
400 return |
|
401 return self.convert_codepoint(n) |
|
402 |
|
403 def convert_codepoint(self, codepoint): |
|
404 return chr(codepoint) |
|
405 |
|
406 def handle_charref(self, name): |
|
407 """Handle character reference, no need to override.""" |
|
408 replacement = self.convert_charref(name) |
|
409 if replacement is None: |
|
410 self.unknown_charref(name) |
|
411 else: |
|
412 self.handle_data(replacement) |
|
413 |
|
414 # Definition of entities -- derived classes may override |
|
415 entitydefs = \ |
|
416 {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''} |
|
417 |
|
418 def convert_entityref(self, name): |
|
419 """Convert entity references. |
|
420 |
|
421 As an alternative to overriding this method; one can tailor the |
|
422 results by setting up the self.entitydefs mapping appropriately. |
|
423 """ |
|
424 table = self.entitydefs |
|
425 if name in table: |
|
426 return table[name] |
|
427 else: |
|
428 return |
|
429 |
|
430 def handle_entityref(self, name): |
|
431 """Handle entity references, no need to override.""" |
|
432 replacement = self.convert_entityref(name) |
|
433 if replacement is None: |
|
434 self.unknown_entityref(name) |
|
435 else: |
|
436 self.handle_data(replacement) |
|
437 |
|
438 # Example -- handle data, should be overridden |
|
439 def handle_data(self, data): |
|
440 pass |
|
441 |
|
442 # Example -- handle comment, could be overridden |
|
443 def handle_comment(self, data): |
|
444 pass |
|
445 |
|
446 # Example -- handle declaration, could be overridden |
|
447 def handle_decl(self, decl): |
|
448 pass |
|
449 |
|
450 # Example -- handle processing instruction, could be overridden |
|
451 def handle_pi(self, data): |
|
452 pass |
|
453 |
|
454 # To be overridden -- handlers for unknown objects |
|
455 def unknown_starttag(self, tag, attrs): pass |
|
456 def unknown_endtag(self, tag): pass |
|
457 def unknown_charref(self, ref): pass |
|
458 def unknown_entityref(self, ref): pass |
|
459 |
|
460 |
|
class TestSGMLParser(SGMLParser):
    """Debugging parser that prints each parse event to standard output.

    Character data is buffered in self.testdata and printed in chunks,
    so that it shows up before the event that follows it.
    """

    def __init__(self, verbose=0):
        # testdata must exist before the base __init__ runs.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer data; flush once its repr reaches 70 characters."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered character data, if any."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: ' + repr(data))

    def handle_comment(self, data):
        """Print the comment, eliding the middle of a long repr."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment: ' + r)

    def unknown_starttag(self, tag, attrs):
        """Print the start tag and its attributes on one line."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            # Joining with single spaces reproduces what the chained
            # trailing-comma print statements used to emit.
            pieces = ['start tag: <' + tag]
            for name, value in attrs:
                pieces.append(name + '=' + '"' + value + '"')
            pieces.append('>')
            print(' '.join(pieces))

    def unknown_endtag(self, tag):
        """Print the end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Print an entity reference that could not be converted."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Print a character reference that could not be converted."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        """Print an unrecognized declaration."""
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Finish parsing and print whatever data is still buffered."""
        SGMLParser.close(self)
        self.flush()
|
514 |
|
515 |
|
def test(args = None):
    """Parse a file from the command line and print the parse events.

    args defaults to sys.argv[1:].  A leading '-s' selects the silent
    SGMLParser instead of TestSGMLParser.  The next argument is the
    file to parse ('test.html' if omitted, '-' for standard input).
    Exits with status 1 if the file cannot be opened.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        f = sys.stdin
    else:
        try:
            f = open(filename, 'r')
        except IOError as msg:
            print('%s : %s' % (filename, msg))
            sys.exit(1)

    # Release the handle even if reading fails; never close stdin.
    try:
        data = f.read()
    finally:
        if f is not sys.stdin:
            f.close()

    x = klass()
    # Feed one character at a time to exercise the parser's handling
    # of input split at arbitrary points.
    for c in data:
        x.feed(c)
    x.close()


if __name__ == '__main__':
    test()