python-2.5.2/win32/Lib/test/test_sgmllib.py
changeset 0 ae805ac0140d
equal deleted inserted replaced
-1:000000000000 0:ae805ac0140d
       
     1 import htmlentitydefs
       
     2 import pprint
       
     3 import re
       
     4 import sgmllib
       
     5 import unittest
       
     6 from test import test_support
       
     7 
       
     8 
       
     9 class EventCollector(sgmllib.SGMLParser):
       
    10 
       
    11     def __init__(self):
       
    12         self.events = []
       
    13         self.append = self.events.append
       
    14         sgmllib.SGMLParser.__init__(self)
       
    15 
       
    16     def get_events(self):
       
    17         # Normalize the list of events so that buffer artefacts don't
       
    18         # separate runs of contiguous characters.
       
    19         L = []
       
    20         prevtype = None
       
    21         for event in self.events:
       
    22             type = event[0]
       
    23             if type == prevtype == "data":
       
    24                 L[-1] = ("data", L[-1][1] + event[1])
       
    25             else:
       
    26                 L.append(event)
       
    27             prevtype = type
       
    28         self.events = L
       
    29         return L
       
    30 
       
    31     # structure markup
       
    32 
       
    33     def unknown_starttag(self, tag, attrs):
       
    34         self.append(("starttag", tag, attrs))
       
    35 
       
    36     def unknown_endtag(self, tag):
       
    37         self.append(("endtag", tag))
       
    38 
       
    39     # all other markup
       
    40 
       
    41     def handle_comment(self, data):
       
    42         self.append(("comment", data))
       
    43 
       
    44     def handle_charref(self, data):
       
    45         self.append(("charref", data))
       
    46 
       
    47     def handle_data(self, data):
       
    48         self.append(("data", data))
       
    49 
       
    50     def handle_decl(self, decl):
       
    51         self.append(("decl", decl))
       
    52 
       
    53     def handle_entityref(self, data):
       
    54         self.append(("entityref", data))
       
    55 
       
    56     def handle_pi(self, data):
       
    57         self.append(("pi", data))
       
    58 
       
    59     def unknown_decl(self, decl):
       
    60         self.append(("unknown decl", decl))
       
    61 
       
    62 
       
    63 class CDATAEventCollector(EventCollector):
       
    64     def start_cdata(self, attrs):
       
    65         self.append(("starttag", "cdata", attrs))
       
    66         self.setliteral()
       
    67 
       
    68 
       
    69 class HTMLEntityCollector(EventCollector):
       
    70 
       
    71     entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
       
    72         '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)')
       
    73 
       
    74     def convert_charref(self, name):
       
    75         self.append(("charref", "convert", name))
       
    76         if name[0] != "x":
       
    77             return EventCollector.convert_charref(self, name)
       
    78 
       
    79     def convert_codepoint(self, codepoint):
       
    80         self.append(("codepoint", "convert", codepoint))
       
    81         EventCollector.convert_codepoint(self, codepoint)
       
    82 
       
    83     def convert_entityref(self, name):
       
    84         self.append(("entityref", "convert", name))
       
    85         return EventCollector.convert_entityref(self, name)
       
    86 
       
    87     # These to record that they were called, then pass the call along
       
    88     # to the default implementation so that it's actions can be
       
    89     # recorded.
       
    90 
       
    91     def handle_charref(self, data):
       
    92         self.append(("charref", data))
       
    93         sgmllib.SGMLParser.handle_charref(self, data)
       
    94 
       
    95     def handle_entityref(self, data):
       
    96         self.append(("entityref", data))
       
    97         sgmllib.SGMLParser.handle_entityref(self, data)
       
    98 
       
    99 
       
   100 class SGMLParserTestCase(unittest.TestCase):
       
   101 
       
   102     collector = EventCollector
       
   103 
       
   104     def get_events(self, source):
       
   105         parser = self.collector()
       
   106         try:
       
   107             for s in source:
       
   108                 parser.feed(s)
       
   109             parser.close()
       
   110         except:
       
   111             #self.events = parser.events
       
   112             raise
       
   113         return parser.get_events()
       
   114 
       
   115     def check_events(self, source, expected_events):
       
   116         try:
       
   117             events = self.get_events(source)
       
   118         except:
       
   119             import sys
       
   120             #print >>sys.stderr, pprint.pformat(self.events)
       
   121             raise
       
   122         if events != expected_events:
       
   123             self.fail("received events did not match expected events\n"
       
   124                       "Expected:\n" + pprint.pformat(expected_events) +
       
   125                       "\nReceived:\n" + pprint.pformat(events))
       
   126 
       
   127     def check_parse_error(self, source):
       
   128         parser = EventCollector()
       
   129         try:
       
   130             parser.feed(source)
       
   131             parser.close()
       
   132         except sgmllib.SGMLParseError:
       
   133             pass
       
   134         else:
       
   135             self.fail("expected SGMLParseError for %r\nReceived:\n%s"
       
   136                       % (source, pprint.pformat(parser.get_events())))
       
   137 
       
   138     def test_doctype_decl_internal(self):
       
   139         inside = """\
       
   140 DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
       
   141              SYSTEM 'http://www.w3.org/TR/html401/strict.dtd' [
       
   142   <!ELEMENT html - O EMPTY>
       
   143   <!ATTLIST html
       
   144       version CDATA #IMPLIED
       
   145       profile CDATA 'DublinCore'>
       
   146   <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
       
   147   <!ENTITY myEntity 'internal parsed entity'>
       
   148   <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
       
   149   <!ENTITY % paramEntity 'name|name|name'>
       
   150   %paramEntity;
       
   151   <!-- comment -->
       
   152 ]"""
       
   153         self.check_events(["<!%s>" % inside], [
       
   154             ("decl", inside),
       
   155             ])
       
   156 
       
   157     def test_doctype_decl_external(self):
       
   158         inside = "DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'"
       
   159         self.check_events("<!%s>" % inside, [
       
   160             ("decl", inside),
       
   161             ])
       
   162 
       
   163     def test_underscore_in_attrname(self):
       
   164         # SF bug #436621
       
   165         """Make sure attribute names with underscores are accepted"""
       
   166         self.check_events("<a has_under _under>", [
       
   167             ("starttag", "a", [("has_under", "has_under"),
       
   168                                ("_under", "_under")]),
       
   169             ])
       
   170 
       
   171     def test_underscore_in_tagname(self):
       
   172         # SF bug #436621
       
   173         """Make sure tag names with underscores are accepted"""
       
   174         self.check_events("<has_under></has_under>", [
       
   175             ("starttag", "has_under", []),
       
   176             ("endtag", "has_under"),
       
   177             ])
       
   178 
       
   179     def test_quotes_in_unquoted_attrs(self):
       
   180         # SF bug #436621
       
   181         """Be sure quotes in unquoted attributes are made part of the value"""
       
   182         self.check_events("<a href=foo'bar\"baz>", [
       
   183             ("starttag", "a", [("href", "foo'bar\"baz")]),
       
   184             ])
       
   185 
       
   186     def test_xhtml_empty_tag(self):
       
   187         """Handling of XHTML-style empty start tags"""
       
   188         self.check_events("<br />text<i></i>", [
       
   189             ("starttag", "br", []),
       
   190             ("data", "text"),
       
   191             ("starttag", "i", []),
       
   192             ("endtag", "i"),
       
   193             ])
       
   194 
       
   195     def test_processing_instruction_only(self):
       
   196         self.check_events("<?processing instruction>", [
       
   197             ("pi", "processing instruction"),
       
   198             ])
       
   199 
       
   200     def test_bad_nesting(self):
       
   201         self.check_events("<a><b></a></b>", [
       
   202             ("starttag", "a", []),
       
   203             ("starttag", "b", []),
       
   204             ("endtag", "a"),
       
   205             ("endtag", "b"),
       
   206             ])
       
   207 
       
   208     def test_bare_ampersands(self):
       
   209         self.check_events("this text & contains & ampersands &", [
       
   210             ("data", "this text & contains & ampersands &"),
       
   211             ])
       
   212 
       
   213     def test_bare_pointy_brackets(self):
       
   214         self.check_events("this < text > contains < bare>pointy< brackets", [
       
   215             ("data", "this < text > contains < bare>pointy< brackets"),
       
   216             ])
       
   217 
       
   218     def test_attr_syntax(self):
       
   219         output = [
       
   220           ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", "e")])
       
   221           ]
       
   222         self.check_events("""<a b='v' c="v" d=v e>""", output)
       
   223         self.check_events("""<a  b = 'v' c = "v" d = v e>""", output)
       
   224         self.check_events("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
       
   225         self.check_events("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)
       
   226 
       
   227     def test_attr_values(self):
       
   228         self.check_events("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
       
   229                         [("starttag", "a", [("b", "xxx\n\txxx"),
       
   230                                             ("c", "yyy\t\nyyy"),
       
   231                                             ("d", "\txyz\n")])
       
   232                          ])
       
   233         self.check_events("""<a b='' c="">""", [
       
   234             ("starttag", "a", [("b", ""), ("c", "")]),
       
   235             ])
       
   236         # URL construction stuff from RFC 1808:
       
   237         safe = "$-_.+"
       
   238         extra = "!*'(),"
       
   239         reserved = ";/?:@&="
       
   240         url = "http://example.com:8080/path/to/file?%s%s%s" % (
       
   241             safe, extra, reserved)
       
   242         self.check_events("""<e a=%s>""" % url, [
       
   243             ("starttag", "e", [("a", url)]),
       
   244             ])
       
   245         # Regression test for SF patch #669683.
       
   246         self.check_events("<e a=rgb(1,2,3)>", [
       
   247             ("starttag", "e", [("a", "rgb(1,2,3)")]),
       
   248             ])
       
   249 
       
   250     def test_attr_values_entities(self):
       
   251         """Substitution of entities and charrefs in attribute values"""
       
   252         # SF bug #1452246
       
   253         self.check_events("""<a b=&lt; c=&lt;&gt; d=&lt-&gt; e='&lt; '
       
   254                                 f="&xxx;" g='&#32;&#33;' h='&#500;'
       
   255                                 i='x?a=b&c=d;'
       
   256                                 j='&amp;#42;' k='&#38;#42;'>""",
       
   257             [("starttag", "a", [("b", "<"),
       
   258                                 ("c", "<>"),
       
   259                                 ("d", "&lt->"),
       
   260                                 ("e", "< "),
       
   261                                 ("f", "&xxx;"),
       
   262                                 ("g", " !"),
       
   263                                 ("h", "&#500;"),
       
   264                                 ("i", "x?a=b&c=d;"),
       
   265                                 ("j", "&#42;"),
       
   266                                 ("k", "&#42;"),
       
   267                                 ])])
       
   268 
       
   269     def test_convert_overrides(self):
       
   270         # This checks that the character and entity reference
       
   271         # conversion helpers are called at the documented times.  No
       
   272         # attempt is made to really change what the parser accepts.
       
   273         #
       
   274         self.collector = HTMLEntityCollector
       
   275         self.check_events(('<a title="&ldquo;test&#x201d;">foo</a>'
       
   276                            '&foobar;&#42;'), [
       
   277             ('entityref', 'convert', 'ldquo'),
       
   278             ('charref', 'convert', 'x201d'),
       
   279             ('starttag', 'a', [('title', '&ldquo;test&#x201d;')]),
       
   280             ('data', 'foo'),
       
   281             ('endtag', 'a'),
       
   282             ('entityref', 'foobar'),
       
   283             ('entityref', 'convert', 'foobar'),
       
   284             ('charref', '42'),
       
   285             ('charref', 'convert', '42'),
       
   286             ('codepoint', 'convert', 42),
       
   287             ])
       
   288 
       
   289     def test_attr_funky_names(self):
       
   290         self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
       
   291             ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
       
   292             ])
       
   293 
       
   294     def test_attr_value_ip6_url(self):
       
   295         # http://www.python.org/sf/853506
       
   296         self.check_events(("<a href='http://[1080::8:800:200C:417A]/'>"
       
   297                            "<a href=http://[1080::8:800:200C:417A]/>"), [
       
   298             ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
       
   299             ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
       
   300             ])
       
   301 
       
   302     def test_illegal_declarations(self):
       
   303         s = 'abc<!spacer type="block" height="25">def'
       
   304         self.check_events(s, [
       
   305             ("data", "abc"),
       
   306             ("unknown decl", 'spacer type="block" height="25"'),
       
   307             ("data", "def"),
       
   308             ])
       
   309 
       
   310     def test_weird_starttags(self):
       
   311         self.check_events("<a<a>", [
       
   312             ("starttag", "a", []),
       
   313             ("starttag", "a", []),
       
   314             ])
       
   315         self.check_events("</a<a>", [
       
   316             ("endtag", "a"),
       
   317             ("starttag", "a", []),
       
   318             ])
       
   319 
       
   320     def test_declaration_junk_chars(self):
       
   321         self.check_parse_error("<!DOCTYPE foo $ >")
       
   322 
       
   323     def test_get_starttag_text(self):
       
   324         s = """<foobar   \n   one="1"\ttwo=2   >"""
       
   325         self.check_events(s, [
       
   326             ("starttag", "foobar", [("one", "1"), ("two", "2")]),
       
   327             ])
       
   328 
       
   329     def test_cdata_content(self):
       
   330         s = ("<cdata> <!-- not a comment --> &not-an-entity-ref; </cdata>"
       
   331              "<notcdata> <!-- comment --> </notcdata>")
       
   332         self.collector = CDATAEventCollector
       
   333         self.check_events(s, [
       
   334             ("starttag", "cdata", []),
       
   335             ("data", " <!-- not a comment --> &not-an-entity-ref; "),
       
   336             ("endtag", "cdata"),
       
   337             ("starttag", "notcdata", []),
       
   338             ("data", " "),
       
   339             ("comment", " comment "),
       
   340             ("data", " "),
       
   341             ("endtag", "notcdata"),
       
   342             ])
       
   343         s = """<cdata> <not a='start tag'> </cdata>"""
       
   344         self.check_events(s, [
       
   345             ("starttag", "cdata", []),
       
   346             ("data", " <not a='start tag'> "),
       
   347             ("endtag", "cdata"),
       
   348             ])
       
   349 
       
   350     def test_illegal_declarations(self):
       
   351         s = 'abc<!spacer type="block" height="25">def'
       
   352         self.check_events(s, [
       
   353             ("data", "abc"),
       
   354             ("unknown decl", 'spacer type="block" height="25"'),
       
   355             ("data", "def"),
       
   356             ])
       
   357 
       
   358     def test_enumerated_attr_type(self):
       
   359         s = "<!DOCTYPE doc [<!ATTLIST doc attr (a | b) >]>"
       
   360         self.check_events(s, [
       
   361             ('decl', 'DOCTYPE doc [<!ATTLIST doc attr (a | b) >]'),
       
   362             ])
       
   363 
       
   364     def test_read_chunks(self):
       
   365         # SF bug #1541697, this caused sgml parser to hang
       
   366         # Just verify this code doesn't cause a hang.
       
   367         CHUNK = 1024  # increasing this to 8212 makes the problem go away
       
   368 
       
   369         f = open(test_support.findfile('sgml_input.html'))
       
   370         fp = sgmllib.SGMLParser()
       
   371         while 1:
       
   372             data = f.read(CHUNK)
       
   373             fp.feed(data)
       
   374             if len(data) != CHUNK:
       
   375                 break
       
   376 
       
   377     # XXX These tests have been disabled by prefixing their names with
       
   378     # an underscore.  The first two exercise outstanding bugs in the
       
   379     # sgmllib module, and the third exhibits questionable behavior
       
   380     # that needs to be carefully considered before changing it.
       
   381 
       
   382     def _test_starttag_end_boundary(self):
       
   383         self.check_events("<a b='<'>", [("starttag", "a", [("b", "<")])])
       
   384         self.check_events("<a b='>'>", [("starttag", "a", [("b", ">")])])
       
   385 
       
   386     def _test_buffer_artefacts(self):
       
   387         output = [("starttag", "a", [("b", "<")])]
       
   388         self.check_events(["<a b='<'>"], output)
       
   389         self.check_events(["<a ", "b='<'>"], output)
       
   390         self.check_events(["<a b", "='<'>"], output)
       
   391         self.check_events(["<a b=", "'<'>"], output)
       
   392         self.check_events(["<a b='<", "'>"], output)
       
   393         self.check_events(["<a b='<'", ">"], output)
       
   394 
       
   395         output = [("starttag", "a", [("b", ">")])]
       
   396         self.check_events(["<a b='>'>"], output)
       
   397         self.check_events(["<a ", "b='>'>"], output)
       
   398         self.check_events(["<a b", "='>'>"], output)
       
   399         self.check_events(["<a b=", "'>'>"], output)
       
   400         self.check_events(["<a b='>", "'>"], output)
       
   401         self.check_events(["<a b='>'", ">"], output)
       
   402 
       
   403         output = [("comment", "abc")]
       
   404         self.check_events(["", "<!--abc-->"], output)
       
   405         self.check_events(["<", "!--abc-->"], output)
       
   406         self.check_events(["<!", "--abc-->"], output)
       
   407         self.check_events(["<!-", "-abc-->"], output)
       
   408         self.check_events(["<!--", "abc-->"], output)
       
   409         self.check_events(["<!--a", "bc-->"], output)
       
   410         self.check_events(["<!--ab", "c-->"], output)
       
   411         self.check_events(["<!--abc", "-->"], output)
       
   412         self.check_events(["<!--abc-", "->"], output)
       
   413         self.check_events(["<!--abc--", ">"], output)
       
   414         self.check_events(["<!--abc-->", ""], output)
       
   415 
       
   416     def _test_starttag_junk_chars(self):
       
   417         self.check_parse_error("<")
       
   418         self.check_parse_error("<>")
       
   419         self.check_parse_error("</$>")
       
   420         self.check_parse_error("</")
       
   421         self.check_parse_error("</a")
       
   422         self.check_parse_error("<$")
       
   423         self.check_parse_error("<$>")
       
   424         self.check_parse_error("<!")
       
   425         self.check_parse_error("<a $>")
       
   426         self.check_parse_error("<a")
       
   427         self.check_parse_error("<a foo='bar'")
       
   428         self.check_parse_error("<a foo='bar")
       
   429         self.check_parse_error("<a foo='>'")
       
   430         self.check_parse_error("<a foo='>")
       
   431         self.check_parse_error("<a foo=>")
       
   432 
       
   433 
       
   434 def test_main():
       
   435     test_support.run_unittest(SGMLParserTestCase)
       
   436 
       
   437 
       
   438 if __name__ == "__main__":
       
   439     test_main()