|
1 import htmlentitydefs |
|
2 import pprint |
|
3 import re |
|
4 import sgmllib |
|
5 import unittest |
|
6 from test import test_support |
|
7 |
|
8 |
|
class EventCollector(sgmllib.SGMLParser):
    """SGML parser that records every callback it receives as an event tuple.

    Each handler appends a tuple whose first item names the kind of event
    ("starttag", "endtag", "data", "comment", ...) followed by the
    arguments the parser passed to the handler.  The recorded events are
    kept in ``self.events`` and ``self.append`` is a shortcut for adding
    one event to that list.
    """

    def __init__(self):
        self.events = []
        self.append = self.events.append
        sgmllib.SGMLParser.__init__(self)

    def get_events(self):
        """Return the recorded events with adjacent "data" events merged.

        Normalizing the list this way means buffer artefacts don't
        separate runs of contiguous characters.  The merged list also
        replaces ``self.events``.
        """
        merged = []
        previous_kind = None
        for event in self.events:
            kind = event[0]
            if kind == previous_kind == "data":
                merged[-1] = ("data", merged[-1][1] + event[1])
            else:
                merged.append(event)
            previous_kind = kind
        self.events = merged
        # Re-bind the shortcut as well: it used to stay attached to the
        # discarded list, so events recorded after this call were lost.
        self.append = merged.append
        return merged

    # structure markup

    def unknown_starttag(self, tag, attrs):
        """Record a start tag together with its attribute list."""
        self.append(("starttag", tag, attrs))

    def unknown_endtag(self, tag):
        """Record an end tag."""
        self.append(("endtag", tag))

    # all other markup

    def handle_comment(self, data):
        """Record the text of a comment."""
        self.append(("comment", data))

    def handle_charref(self, data):
        """Record a character reference without converting it."""
        self.append(("charref", data))

    def handle_data(self, data):
        """Record a run of character data."""
        self.append(("data", data))

    def handle_decl(self, decl):
        """Record a declaration."""
        self.append(("decl", decl))

    def handle_entityref(self, data):
        """Record an entity reference without converting it."""
        self.append(("entityref", data))

    def handle_pi(self, data):
        """Record a processing instruction."""
        self.append(("pi", data))

    def unknown_decl(self, decl):
        """Record a declaration the parser reported as unknown."""
        self.append(("unknown decl", decl))
|
61 |
|
62 |
|
class CDATAEventCollector(EventCollector):
    """Collector that switches the parser to literal mode inside <cdata>."""

    def start_cdata(self, attrs):
        """Record the <cdata> start tag, then call setliteral()."""
        event = ("starttag", "cdata", attrs)
        self.append(event)
        self.setliteral()
|
67 |
|
68 |
|
class HTMLEntityCollector(EventCollector):
    """Collector that also logs every call to the reference-conversion hooks."""

    entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
        '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)')

    def convert_charref(self, name):
        """Log the call; delegate to the base class unless *name* starts with "x"."""
        event = ("charref", "convert", name)
        self.append(event)
        if name[0] == "x":
            return None
        return EventCollector.convert_charref(self, name)

    def convert_codepoint(self, codepoint):
        """Log the call and invoke the base implementation.

        The base-class result is discarded, so this method returns None.
        """
        event = ("codepoint", "convert", codepoint)
        self.append(event)
        EventCollector.convert_codepoint(self, codepoint)

    def convert_entityref(self, name):
        """Log the call and return whatever the base implementation returns."""
        event = ("entityref", "convert", name)
        self.append(event)
        return EventCollector.convert_entityref(self, name)

    # The two handlers below record that they were called, then pass the
    # call along to the default sgmllib implementation so that its
    # actions can be recorded too.

    def handle_charref(self, data):
        event = ("charref", data)
        self.append(event)
        sgmllib.SGMLParser.handle_charref(self, data)

    def handle_entityref(self, data):
        event = ("entityref", data)
        self.append(event)
        sgmllib.SGMLParser.handle_entityref(self, data)
|
98 |
|
99 |
|
class SGMLParserTestCase(unittest.TestCase):
    """Event-level regression tests for sgmllib.SGMLParser.

    Each test feeds markup to a collector parser and compares the events
    it recorded against an expected list.
    """

    # Parser class instantiated by the helpers below; a test may replace
    # it on the instance to use a more specialised collector.
    collector = EventCollector

    def get_events(self, source):
        """Feed each item of *source* to a fresh collector; return its events.

        *source* is iterated, so a plain string is fed one character at a
        time while a list of strings is fed chunk by chunk.
        """
        parser = self.collector()
        for s in source:
            parser.feed(s)
        parser.close()
        return parser.get_events()

    def check_events(self, source, expected_events):
        """Fail unless parsing *source* yields exactly *expected_events*."""
        events = self.get_events(source)
        if events != expected_events:
            self.fail("received events did not match expected events\n"
                      "Expected:\n" + pprint.pformat(expected_events) +
                      "\nReceived:\n" + pprint.pformat(events))

    def check_parse_error(self, source):
        """Fail unless parsing *source* raises sgmllib.SGMLParseError."""
        # Use the configured collector, consistently with get_events().
        parser = self.collector()
        try:
            parser.feed(source)
            parser.close()
        except sgmllib.SGMLParseError:
            pass
        else:
            self.fail("expected SGMLParseError for %r\nReceived:\n%s"
                      % (source, pprint.pformat(parser.get_events())))

    def test_doctype_decl_internal(self):
        inside = """\
DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
             SYSTEM 'http://www.w3.org/TR/html401/strict.dtd' [
  <!ELEMENT html - O EMPTY>
  <!ATTLIST html
      version CDATA #IMPLIED
      profile CDATA 'DublinCore'>
  <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
  <!ENTITY myEntity 'internal parsed entity'>
  <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
  <!ENTITY % paramEntity 'name|name|name'>
  %paramEntity;
  <!-- comment -->
]"""
        self.check_events(["<!%s>" % inside], [
            ("decl", inside),
            ])

    def test_doctype_decl_external(self):
        inside = "DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'"
        self.check_events("<!%s>" % inside, [
            ("decl", inside),
            ])

    def test_underscore_in_attrname(self):
        # SF bug #436621
        """Make sure attribute names with underscores are accepted"""
        self.check_events("<a has_under _under>", [
            ("starttag", "a", [("has_under", "has_under"),
                               ("_under", "_under")]),
            ])

    def test_underscore_in_tagname(self):
        # SF bug #436621
        """Make sure tag names with underscores are accepted"""
        self.check_events("<has_under></has_under>", [
            ("starttag", "has_under", []),
            ("endtag", "has_under"),
            ])

    def test_quotes_in_unquoted_attrs(self):
        # SF bug #436621
        """Be sure quotes in unquoted attributes are made part of the value"""
        self.check_events("<a href=foo'bar\"baz>", [
            ("starttag", "a", [("href", "foo'bar\"baz")]),
            ])

    def test_xhtml_empty_tag(self):
        """Handling of XHTML-style empty start tags"""
        self.check_events("<br />text<i></i>", [
            ("starttag", "br", []),
            ("data", "text"),
            ("starttag", "i", []),
            ("endtag", "i"),
            ])

    def test_processing_instruction_only(self):
        self.check_events("<?processing instruction>", [
            ("pi", "processing instruction"),
            ])

    def test_bad_nesting(self):
        self.check_events("<a><b></a></b>", [
            ("starttag", "a", []),
            ("starttag", "b", []),
            ("endtag", "a"),
            ("endtag", "b"),
            ])

    def test_bare_ampersands(self):
        self.check_events("this text & contains & ampersands &", [
            ("data", "this text & contains & ampersands &"),
            ])

    def test_bare_pointy_brackets(self):
        self.check_events("this < text > contains < bare>pointy< brackets", [
            ("data", "this < text > contains < bare>pointy< brackets"),
            ])

    def test_attr_syntax(self):
        output = [
          ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", "e")])
          ]
        self.check_events("""<a b='v' c="v" d=v e>""", output)
        self.check_events("""<a b = 'v' c = "v" d = v e>""", output)
        self.check_events("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
        self.check_events("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)

    def test_attr_values(self):
        self.check_events("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
                        [("starttag", "a", [("b", "xxx\n\txxx"),
                                            ("c", "yyy\t\nyyy"),
                                            ("d", "\txyz\n")])
                         ])
        self.check_events("""<a b='' c="">""", [
            ("starttag", "a", [("b", ""), ("c", "")]),
            ])
        # URL construction stuff from RFC 1808:
        safe = "$-_.+"
        extra = "!*'(),"
        reserved = ";/?:@&="
        url = "http://example.com:8080/path/to/file?%s%s%s" % (
            safe, extra, reserved)
        self.check_events("""<e a=%s>""" % url, [
            ("starttag", "e", [("a", url)]),
            ])
        # Regression test for SF patch #669683.
        self.check_events("<e a=rgb(1,2,3)>", [
            ("starttag", "e", [("a", "rgb(1,2,3)")]),
            ])

    def test_attr_values_entities(self):
        """Substitution of entities and charrefs in attribute values"""
        # SF bug #1452246
        # The references in the markup must stay spelled out (&lt;,
        # &#500;, ...): resolving them is the parser's job under test,
        # and a raw "<" or ">" inside the tag would end it early.
        self.check_events("""<a b=&lt; c=&lt;&gt; d=&lt-&gt e='&lt; '
                                f="&xxx;" g='&#32;&#33;' h='&#500;'
                                i='x?a=b&c=d;'
                                j='&amp;#42;' k='&#38;#42;'>""",
            [("starttag", "a", [("b", "<"),
                                ("c", "<>"),
                                ("d", "&lt-&gt"),
                                ("e", "< "),
                                ("f", "&xxx;"),
                                ("g", " !"),
                                ("h", "&#500;"),
                                ("i", "x?a=b&c=d;"),
                                ("j", "&#42;"),
                                ("k", "&#42;"),
                                ])])

    def test_convert_overrides(self):
        # This checks that the character and entity reference
        # conversion helpers are called at the documented times.  No
        # attempt is made to really change what the parser accepts.
        #
        self.collector = HTMLEntityCollector
        self.check_events(('<a title="&ldquo;test&#x201d;">foo</a>'
                           '&foobar;&#42;'), [
            ('entityref', 'convert', 'ldquo'),
            ('charref', 'convert', 'x201d'),
            ('starttag', 'a', [('title', '&ldquo;test&#x201d;')]),
            ('data', 'foo'),
            ('endtag', 'a'),
            ('entityref', 'foobar'),
            ('entityref', 'convert', 'foobar'),
            ('charref', '42'),
            ('charref', 'convert', '42'),
            ('codepoint', 'convert', 42),
            ])

    def test_attr_funky_names(self):
        self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
            ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
            ])

    def test_attr_value_ip6_url(self):
        # http://www.python.org/sf/853506
        self.check_events(("<a href='http://[1080::8:800:200C:417A]/'>"
                           "<a href=http://[1080::8:800:200C:417A]/>"), [
            ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
            ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
            ])

    def test_illegal_declarations(self):
        # This test used to be defined twice in this class with an
        # identical body; only one definition is kept.
        s = 'abc<!spacer type="block" height="25">def'
        self.check_events(s, [
            ("data", "abc"),
            ("unknown decl", 'spacer type="block" height="25"'),
            ("data", "def"),
            ])

    def test_weird_starttags(self):
        self.check_events("<a<a>", [
            ("starttag", "a", []),
            ("starttag", "a", []),
            ])
        self.check_events("</a<a>", [
            ("endtag", "a"),
            ("starttag", "a", []),
            ])

    def test_declaration_junk_chars(self):
        self.check_parse_error("<!DOCTYPE foo $ >")

    def test_get_starttag_text(self):
        s = """<foobar \n one="1"\ttwo=2 >"""
        self.check_events(s, [
            ("starttag", "foobar", [("one", "1"), ("two", "2")]),
            ])

    def test_cdata_content(self):
        s = ("<cdata> <!-- not a comment --> &not-an-entity-ref; </cdata>"
             "<notcdata> <!-- comment --> </notcdata>")
        self.collector = CDATAEventCollector
        self.check_events(s, [
            ("starttag", "cdata", []),
            ("data", " <!-- not a comment --> &not-an-entity-ref; "),
            ("endtag", "cdata"),
            ("starttag", "notcdata", []),
            ("data", " "),
            ("comment", " comment "),
            ("data", " "),
            ("endtag", "notcdata"),
            ])
        s = """<cdata> <not a='start tag'> </cdata>"""
        self.check_events(s, [
            ("starttag", "cdata", []),
            ("data", " <not a='start tag'> "),
            ("endtag", "cdata"),
            ])

    def test_enumerated_attr_type(self):
        s = "<!DOCTYPE doc [<!ATTLIST doc attr (a | b) >]>"
        self.check_events(s, [
            ('decl', 'DOCTYPE doc [<!ATTLIST doc attr (a | b) >]'),
            ])

    def test_read_chunks(self):
        # SF bug #1541697, this caused sgml parser to hang
        # Just verify this code doesn't cause a hang.
        CHUNK = 1024  # increasing this to 8212 makes the problem go away

        f = open(test_support.findfile('sgml_input.html'))
        try:
            fp = sgmllib.SGMLParser()
            while 1:
                data = f.read(CHUNK)
                fp.feed(data)
                # A short read means the end of the file was reached.
                if len(data) != CHUNK:
                    break
        finally:
            # Release the file handle even if the parser raises.
            f.close()

    # XXX These tests have been disabled by prefixing their names with
    # an underscore.  The first two exercise outstanding bugs in the
    # sgmllib module, and the third exhibits questionable behavior
    # that needs to be carefully considered before changing it.

    def _test_starttag_end_boundary(self):
        self.check_events("<a b='<'>", [("starttag", "a", [("b", "<")])])
        self.check_events("<a b='>'>", [("starttag", "a", [("b", ">")])])

    def _test_buffer_artefacts(self):
        output = [("starttag", "a", [("b", "<")])]
        self.check_events(["<a b='<'>"], output)
        self.check_events(["<a ", "b='<'>"], output)
        self.check_events(["<a b", "='<'>"], output)
        self.check_events(["<a b=", "'<'>"], output)
        self.check_events(["<a b='<", "'>"], output)
        self.check_events(["<a b='<'", ">"], output)

        output = [("starttag", "a", [("b", ">")])]
        self.check_events(["<a b='>'>"], output)
        self.check_events(["<a ", "b='>'>"], output)
        self.check_events(["<a b", "='>'>"], output)
        self.check_events(["<a b=", "'>'>"], output)
        self.check_events(["<a b='>", "'>"], output)
        self.check_events(["<a b='>'", ">"], output)

        output = [("comment", "abc")]
        self.check_events(["", "<!--abc-->"], output)
        self.check_events(["<", "!--abc-->"], output)
        self.check_events(["<!", "--abc-->"], output)
        self.check_events(["<!-", "-abc-->"], output)
        self.check_events(["<!--", "abc-->"], output)
        self.check_events(["<!--a", "bc-->"], output)
        self.check_events(["<!--ab", "c-->"], output)
        self.check_events(["<!--abc", "-->"], output)
        self.check_events(["<!--abc-", "->"], output)
        self.check_events(["<!--abc--", ">"], output)
        self.check_events(["<!--abc-->", ""], output)

    def _test_starttag_junk_chars(self):
        self.check_parse_error("<")
        self.check_parse_error("<>")
        self.check_parse_error("</$>")
        self.check_parse_error("</")
        self.check_parse_error("</a")
        self.check_parse_error("<$")
        self.check_parse_error("<$>")
        self.check_parse_error("<!")
        self.check_parse_error("<a $>")
        self.check_parse_error("<a")
        self.check_parse_error("<a foo='bar'")
        self.check_parse_error("<a foo='bar")
        self.check_parse_error("<a foo='>'")
        self.check_parse_error("<a foo='>")
        self.check_parse_error("<a foo=>")
|
432 |
|
433 |
|
def test_main():
    """Run the SGML parser test case through test_support.run_unittest()."""
    test_classes = (SGMLParserTestCase,)
    test_support.run_unittest(*test_classes)
|
436 |
|
437 |
|
# Run the tests when this module is executed directly as a script.
if "__main__" == __name__:
    test_main()