symbian-qemu-0.9.1-12/python-win32-2.6.1/lib/htmllib.py
changeset 1 2fb8b9db1c86
equal deleted inserted replaced
0:ffa851df0825 1:2fb8b9db1c86
       
     1 """HTML 2.0 parser.
       
     2 
       
     3 See the HTML 2.0 specification:
       
     4 http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
       
     5 """
       
     6 
       
     7 from warnings import warnpy3k
       
     8 warnpy3k("the htmllib module has been removed in Python 3.0",
       
     9          stacklevel=2)
       
    10 del warnpy3k
       
    11 
       
    12 import sgmllib
       
    13 
       
    14 from formatter import AS_IS
       
    15 
       
    16 __all__ = ["HTMLParser", "HTMLParseError"]
       
    17 
       
    18 
       
    19 class HTMLParseError(sgmllib.SGMLParseError):
       
    20     """Error raised when an HTML document can't be parsed."""
       
    21 
       
    22 
       
    23 class HTMLParser(sgmllib.SGMLParser):
       
    24     """This is the basic HTML parser class.
       
    25 
       
    26     It supports all entity names required by the XHTML 1.0 Recommendation.
       
    27     It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
       
    28     elements.
       
    29 
       
    30     """
       
    31 
       
    32     from htmlentitydefs import entitydefs
       
    33 
       
    34     def __init__(self, formatter, verbose=0):
       
    35         """Creates an instance of the HTMLParser class.
       
    36 
       
    37         The formatter parameter is the formatter instance associated with
       
    38         the parser.
       
    39 
       
    40         """
       
    41         sgmllib.SGMLParser.__init__(self, verbose)
       
    42         self.formatter = formatter
       
    43 
       
    44     def error(self, message):
       
    45         raise HTMLParseError(message)
       
    46 
       
    47     def reset(self):
       
    48         sgmllib.SGMLParser.reset(self)
       
    49         self.savedata = None
       
    50         self.isindex = 0
       
    51         self.title = None
       
    52         self.base = None
       
    53         self.anchor = None
       
    54         self.anchorlist = []
       
    55         self.nofill = 0
       
    56         self.list_stack = []
       
    57 
       
    58     # ------ Methods used internally; some may be overridden
       
    59 
       
    60     # --- Formatter interface, taking care of 'savedata' mode;
       
    61     # shouldn't need to be overridden
       
    62 
       
    63     def handle_data(self, data):
       
    64         if self.savedata is not None:
       
    65             self.savedata = self.savedata + data
       
    66         else:
       
    67             if self.nofill:
       
    68                 self.formatter.add_literal_data(data)
       
    69             else:
       
    70                 self.formatter.add_flowing_data(data)
       
    71 
       
    72     # --- Hooks to save data; shouldn't need to be overridden
       
    73 
       
    74     def save_bgn(self):
       
    75         """Begins saving character data in a buffer instead of sending it
       
    76         to the formatter object.
       
    77 
       
    78         Retrieve the stored data via the save_end() method.  Use of the
       
    79         save_bgn() / save_end() pair may not be nested.
       
    80 
       
    81         """
       
    82         self.savedata = ''
       
    83 
       
    84     def save_end(self):
       
    85         """Ends buffering character data and returns all data saved since
       
    86         the preceding call to the save_bgn() method.
       
    87 
       
    88         If the nofill flag is false, whitespace is collapsed to single
       
    89         spaces.  A call to this method without a preceding call to the
       
    90         save_bgn() method will raise a TypeError exception.
       
    91 
       
    92         """
       
    93         data = self.savedata
       
    94         self.savedata = None
       
    95         if not self.nofill:
       
    96             data = ' '.join(data.split())
       
    97         return data
       
    98 
       
    99     # --- Hooks for anchors; should probably be overridden
       
   100 
       
   101     def anchor_bgn(self, href, name, type):
       
   102         """This method is called at the start of an anchor region.
       
   103 
       
   104         The arguments correspond to the attributes of the <A> tag with
       
   105         the same names.  The default implementation maintains a list of
       
   106         hyperlinks (defined by the HREF attribute for <A> tags) within
       
   107         the document.  The list of hyperlinks is available as the data
       
   108         attribute anchorlist.
       
   109 
       
   110         """
       
   111         self.anchor = href
       
   112         if self.anchor:
       
   113             self.anchorlist.append(href)
       
   114 
       
   115     def anchor_end(self):
       
   116         """This method is called at the end of an anchor region.
       
   117 
       
   118         The default implementation adds a textual footnote marker using an
       
   119         index into the list of hyperlinks created by the anchor_bgn()method.
       
   120 
       
   121         """
       
   122         if self.anchor:
       
   123             self.handle_data("[%d]" % len(self.anchorlist))
       
   124             self.anchor = None
       
   125 
       
   126     # --- Hook for images; should probably be overridden
       
   127 
       
   128     def handle_image(self, src, alt, *args):
       
   129         """This method is called to handle images.
       
   130 
       
   131         The default implementation simply passes the alt value to the
       
   132         handle_data() method.
       
   133 
       
   134         """
       
   135         self.handle_data(alt)
       
   136 
       
   137     # --------- Top level elememts
       
   138 
       
   139     def start_html(self, attrs): pass
       
   140     def end_html(self): pass
       
   141 
       
   142     def start_head(self, attrs): pass
       
   143     def end_head(self): pass
       
   144 
       
   145     def start_body(self, attrs): pass
       
   146     def end_body(self): pass
       
   147 
       
   148     # ------ Head elements
       
   149 
       
   150     def start_title(self, attrs):
       
   151         self.save_bgn()
       
   152 
       
   153     def end_title(self):
       
   154         self.title = self.save_end()
       
   155 
       
   156     def do_base(self, attrs):
       
   157         for a, v in attrs:
       
   158             if a == 'href':
       
   159                 self.base = v
       
   160 
       
   161     def do_isindex(self, attrs):
       
   162         self.isindex = 1
       
   163 
       
   164     def do_link(self, attrs):
       
   165         pass
       
   166 
       
   167     def do_meta(self, attrs):
       
   168         pass
       
   169 
       
   170     def do_nextid(self, attrs): # Deprecated
       
   171         pass
       
   172 
       
   173     # ------ Body elements
       
   174 
       
   175     # --- Headings
       
   176 
       
   177     def start_h1(self, attrs):
       
   178         self.formatter.end_paragraph(1)
       
   179         self.formatter.push_font(('h1', 0, 1, 0))
       
   180 
       
   181     def end_h1(self):
       
   182         self.formatter.end_paragraph(1)
       
   183         self.formatter.pop_font()
       
   184 
       
   185     def start_h2(self, attrs):
       
   186         self.formatter.end_paragraph(1)
       
   187         self.formatter.push_font(('h2', 0, 1, 0))
       
   188 
       
   189     def end_h2(self):
       
   190         self.formatter.end_paragraph(1)
       
   191         self.formatter.pop_font()
       
   192 
       
   193     def start_h3(self, attrs):
       
   194         self.formatter.end_paragraph(1)
       
   195         self.formatter.push_font(('h3', 0, 1, 0))
       
   196 
       
   197     def end_h3(self):
       
   198         self.formatter.end_paragraph(1)
       
   199         self.formatter.pop_font()
       
   200 
       
   201     def start_h4(self, attrs):
       
   202         self.formatter.end_paragraph(1)
       
   203         self.formatter.push_font(('h4', 0, 1, 0))
       
   204 
       
   205     def end_h4(self):
       
   206         self.formatter.end_paragraph(1)
       
   207         self.formatter.pop_font()
       
   208 
       
   209     def start_h5(self, attrs):
       
   210         self.formatter.end_paragraph(1)
       
   211         self.formatter.push_font(('h5', 0, 1, 0))
       
   212 
       
   213     def end_h5(self):
       
   214         self.formatter.end_paragraph(1)
       
   215         self.formatter.pop_font()
       
   216 
       
   217     def start_h6(self, attrs):
       
   218         self.formatter.end_paragraph(1)
       
   219         self.formatter.push_font(('h6', 0, 1, 0))
       
   220 
       
   221     def end_h6(self):
       
   222         self.formatter.end_paragraph(1)
       
   223         self.formatter.pop_font()
       
   224 
       
   225     # --- Block Structuring Elements
       
   226 
       
   227     def do_p(self, attrs):
       
   228         self.formatter.end_paragraph(1)
       
   229 
       
   230     def start_pre(self, attrs):
       
   231         self.formatter.end_paragraph(1)
       
   232         self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
       
   233         self.nofill = self.nofill + 1
       
   234 
       
   235     def end_pre(self):
       
   236         self.formatter.end_paragraph(1)
       
   237         self.formatter.pop_font()
       
   238         self.nofill = max(0, self.nofill - 1)
       
   239 
       
   240     def start_xmp(self, attrs):
       
   241         self.start_pre(attrs)
       
   242         self.setliteral('xmp') # Tell SGML parser
       
   243 
       
   244     def end_xmp(self):
       
   245         self.end_pre()
       
   246 
       
   247     def start_listing(self, attrs):
       
   248         self.start_pre(attrs)
       
   249         self.setliteral('listing') # Tell SGML parser
       
   250 
       
   251     def end_listing(self):
       
   252         self.end_pre()
       
   253 
       
   254     def start_address(self, attrs):
       
   255         self.formatter.end_paragraph(0)
       
   256         self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
       
   257 
       
   258     def end_address(self):
       
   259         self.formatter.end_paragraph(0)
       
   260         self.formatter.pop_font()
       
   261 
       
   262     def start_blockquote(self, attrs):
       
   263         self.formatter.end_paragraph(1)
       
   264         self.formatter.push_margin('blockquote')
       
   265 
       
   266     def end_blockquote(self):
       
   267         self.formatter.end_paragraph(1)
       
   268         self.formatter.pop_margin()
       
   269 
       
   270     # --- List Elements
       
   271 
       
   272     def start_ul(self, attrs):
       
   273         self.formatter.end_paragraph(not self.list_stack)
       
   274         self.formatter.push_margin('ul')
       
   275         self.list_stack.append(['ul', '*', 0])
       
   276 
       
   277     def end_ul(self):
       
   278         if self.list_stack: del self.list_stack[-1]
       
   279         self.formatter.end_paragraph(not self.list_stack)
       
   280         self.formatter.pop_margin()
       
   281 
       
   282     def do_li(self, attrs):
       
   283         self.formatter.end_paragraph(0)
       
   284         if self.list_stack:
       
   285             [dummy, label, counter] = top = self.list_stack[-1]
       
   286             top[2] = counter = counter+1
       
   287         else:
       
   288             label, counter = '*', 0
       
   289         self.formatter.add_label_data(label, counter)
       
   290 
       
   291     def start_ol(self, attrs):
       
   292         self.formatter.end_paragraph(not self.list_stack)
       
   293         self.formatter.push_margin('ol')
       
   294         label = '1.'
       
   295         for a, v in attrs:
       
   296             if a == 'type':
       
   297                 if len(v) == 1: v = v + '.'
       
   298                 label = v
       
   299         self.list_stack.append(['ol', label, 0])
       
   300 
       
   301     def end_ol(self):
       
   302         if self.list_stack: del self.list_stack[-1]
       
   303         self.formatter.end_paragraph(not self.list_stack)
       
   304         self.formatter.pop_margin()
       
   305 
       
   306     def start_menu(self, attrs):
       
   307         self.start_ul(attrs)
       
   308 
       
   309     def end_menu(self):
       
   310         self.end_ul()
       
   311 
       
   312     def start_dir(self, attrs):
       
   313         self.start_ul(attrs)
       
   314 
       
   315     def end_dir(self):
       
   316         self.end_ul()
       
   317 
       
   318     def start_dl(self, attrs):
       
   319         self.formatter.end_paragraph(1)
       
   320         self.list_stack.append(['dl', '', 0])
       
   321 
       
   322     def end_dl(self):
       
   323         self.ddpop(1)
       
   324         if self.list_stack: del self.list_stack[-1]
       
   325 
       
   326     def do_dt(self, attrs):
       
   327         self.ddpop()
       
   328 
       
   329     def do_dd(self, attrs):
       
   330         self.ddpop()
       
   331         self.formatter.push_margin('dd')
       
   332         self.list_stack.append(['dd', '', 0])
       
   333 
       
   334     def ddpop(self, bl=0):
       
   335         self.formatter.end_paragraph(bl)
       
   336         if self.list_stack:
       
   337             if self.list_stack[-1][0] == 'dd':
       
   338                 del self.list_stack[-1]
       
   339                 self.formatter.pop_margin()
       
   340 
       
   341     # --- Phrase Markup
       
   342 
       
   343     # Idiomatic Elements
       
   344 
       
   345     def start_cite(self, attrs): self.start_i(attrs)
       
   346     def end_cite(self): self.end_i()
       
   347 
       
   348     def start_code(self, attrs): self.start_tt(attrs)
       
   349     def end_code(self): self.end_tt()
       
   350 
       
   351     def start_em(self, attrs): self.start_i(attrs)
       
   352     def end_em(self): self.end_i()
       
   353 
       
   354     def start_kbd(self, attrs): self.start_tt(attrs)
       
   355     def end_kbd(self): self.end_tt()
       
   356 
       
   357     def start_samp(self, attrs): self.start_tt(attrs)
       
   358     def end_samp(self): self.end_tt()
       
   359 
       
   360     def start_strong(self, attrs): self.start_b(attrs)
       
   361     def end_strong(self): self.end_b()
       
   362 
       
   363     def start_var(self, attrs): self.start_i(attrs)
       
   364     def end_var(self): self.end_i()
       
   365 
       
   366     # Typographic Elements
       
   367 
       
   368     def start_i(self, attrs):
       
   369         self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
       
   370     def end_i(self):
       
   371         self.formatter.pop_font()
       
   372 
       
   373     def start_b(self, attrs):
       
   374         self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
       
   375     def end_b(self):
       
   376         self.formatter.pop_font()
       
   377 
       
   378     def start_tt(self, attrs):
       
   379         self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
       
   380     def end_tt(self):
       
   381         self.formatter.pop_font()
       
   382 
       
   383     def start_a(self, attrs):
       
   384         href = ''
       
   385         name = ''
       
   386         type = ''
       
   387         for attrname, value in attrs:
       
   388             value = value.strip()
       
   389             if attrname == 'href':
       
   390                 href = value
       
   391             if attrname == 'name':
       
   392                 name = value
       
   393             if attrname == 'type':
       
   394                 type = value.lower()
       
   395         self.anchor_bgn(href, name, type)
       
   396 
       
   397     def end_a(self):
       
   398         self.anchor_end()
       
   399 
       
   400     # --- Line Break
       
   401 
       
   402     def do_br(self, attrs):
       
   403         self.formatter.add_line_break()
       
   404 
       
   405     # --- Horizontal Rule
       
   406 
       
   407     def do_hr(self, attrs):
       
   408         self.formatter.add_hor_rule()
       
   409 
       
   410     # --- Image
       
   411 
       
   412     def do_img(self, attrs):
       
   413         align = ''
       
   414         alt = '(image)'
       
   415         ismap = ''
       
   416         src = ''
       
   417         width = 0
       
   418         height = 0
       
   419         for attrname, value in attrs:
       
   420             if attrname == 'align':
       
   421                 align = value
       
   422             if attrname == 'alt':
       
   423                 alt = value
       
   424             if attrname == 'ismap':
       
   425                 ismap = value
       
   426             if attrname == 'src':
       
   427                 src = value
       
   428             if attrname == 'width':
       
   429                 try: width = int(value)
       
   430                 except ValueError: pass
       
   431             if attrname == 'height':
       
   432                 try: height = int(value)
       
   433                 except ValueError: pass
       
   434         self.handle_image(src, alt, ismap, align, width, height)
       
   435 
       
   436     # --- Really Old Unofficial Deprecated Stuff
       
   437 
       
   438     def do_plaintext(self, attrs):
       
   439         self.start_pre(attrs)
       
   440         self.setnomoretags() # Tell SGML parser
       
   441 
       
   442     # --- Unhandled tags
       
   443 
       
   444     def unknown_starttag(self, tag, attrs):
       
   445         pass
       
   446 
       
   447     def unknown_endtag(self, tag):
       
   448         pass
       
   449 
       
   450 
       
   451 def test(args = None):
       
   452     import sys, formatter
       
   453 
       
   454     if not args:
       
   455         args = sys.argv[1:]
       
   456 
       
   457     silent = args and args[0] == '-s'
       
   458     if silent:
       
   459         del args[0]
       
   460 
       
   461     if args:
       
   462         file = args[0]
       
   463     else:
       
   464         file = 'test.html'
       
   465 
       
   466     if file == '-':
       
   467         f = sys.stdin
       
   468     else:
       
   469         try:
       
   470             f = open(file, 'r')
       
   471         except IOError, msg:
       
   472             print file, ":", msg
       
   473             sys.exit(1)
       
   474 
       
   475     data = f.read()
       
   476 
       
   477     if f is not sys.stdin:
       
   478         f.close()
       
   479 
       
   480     if silent:
       
   481         f = formatter.NullFormatter()
       
   482     else:
       
   483         f = formatter.AbstractFormatter(formatter.DumbWriter())
       
   484 
       
   485     p = HTMLParser(f)
       
   486     p.feed(data)
       
   487     p.close()
       
   488 
       
   489 
       
   490 if __name__ == '__main__':
       
   491     test()