|
1 """HTML 2.0 parser. |
|
2 |
|
3 See the HTML 2.0 specification: |
|
4 http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html |
|
5 """ |
|
6 |
|
# Warn that this module no longer exists in Python 3.0, then drop the
# helper so it does not linger in the module namespace.
from warnings import warnpy3k
warnpy3k("the htmllib module has been removed in Python 3.0",
         stacklevel=2)
del warnpy3k

import sgmllib

from formatter import AS_IS

# Public names exported by ``from htmllib import *``.
__all__ = ["HTMLParser", "HTMLParseError"]
|
17 |
|
18 |
|
class HTMLParseError(sgmllib.SGMLParseError):
    """Error raised when an HTML document can't be parsed.

    HTMLParser.error() raises this exception with the message it is given.
    """
|
21 |
|
22 |
|
class HTMLParser(sgmllib.SGMLParser):
    """This is the basic HTML parser class.

    It supports all entity names required by the XHTML 1.0 Recommendation.
    It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
    elements.

    Tag handlers follow the naming scheme start_<tag>(attrs) / end_<tag>()
    for container elements and do_<tag>(attrs) for empty elements; attrs
    is iterated as a sequence of (name, value) pairs.  Rendering is
    delegated to the formatter object given to the constructor.

    """

    # Entity name table exposed as a class attribute.
    # NOTE(review): presumably consulted by the SGMLParser base class when
    # resolving entity references -- confirm against sgmllib.
    from htmlentitydefs import entitydefs

    def __init__(self, formatter, verbose=0):
        """Creates an instance of the HTMLParser class.

        The formatter parameter is the formatter instance associated with
        the parser.  The verbose parameter is passed through unchanged to
        the SGMLParser constructor.

        """
        sgmllib.SGMLParser.__init__(self, verbose)
        self.formatter = formatter

    def error(self, message):
        """Raise HTMLParseError carrying the given message."""
        raise HTMLParseError(message)

    def reset(self):
        """Reset the base parser and all per-document state of this class."""
        sgmllib.SGMLParser.reset(self)
        self.savedata = None    # buffer used between save_bgn()/save_end()
        self.isindex = 0        # set to 1 once <ISINDEX> is seen
        self.title = None       # text collected from <TITLE>
        self.base = None        # HREF of <BASE>
        self.anchor = None      # HREF of the anchor currently open, if any
        self.anchorlist = []    # every non-empty HREF seen so far
        self.nofill = 0         # nesting depth of literal (<PRE>-like) regions
        self.list_stack = []    # open lists as [kind, label, counter] entries

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    #     shouldn't need to be overridden

    def handle_data(self, data):
        """Route character data to the save buffer or to the formatter.

        While saving is active the data is appended to the buffer.
        Otherwise it is sent to the formatter as literal data when the
        nofill flag is set, or as flowing data when it is not.

        """
        if self.savedata is not None:
            self.savedata = self.savedata + data
        elif self.nofill:
            self.formatter.add_literal_data(data)
        else:
            self.formatter.add_flowing_data(data)

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Begins saving character data in a buffer instead of sending it
        to the formatter object.

        Retrieve the stored data via the save_end() method.  Use of the
        save_bgn() / save_end() pair may not be nested.

        """
        self.savedata = ''

    def save_end(self):
        """Ends buffering character data and returns all data saved since
        the preceding call to the save_bgn() method.

        If the nofill flag is false, whitespace is collapsed to single
        spaces.  A call to this method without a preceding call to the
        save_bgn() method will raise a TypeError exception.

        """
        data = self.savedata
        if data is None:
            # Enforce the documented contract explicitly.  Without this
            # check the failure surfaced as an AttributeError from
            # None.split(), or not at all when nofill was set.
            raise TypeError(
                "save_end() called without a preceding save_bgn()")
        self.savedata = None
        if not self.nofill:
            data = ' '.join(data.split())
        return data

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """This method is called at the start of an anchor region.

        The arguments correspond to the attributes of the <A> tag with
        the same names.  The default implementation maintains a list of
        hyperlinks (defined by the HREF attribute for <A> tags) within
        the document.  The list of hyperlinks is available as the data
        attribute anchorlist.  The name and type arguments are not used
        by the default implementation.

        """
        self.anchor = href
        if self.anchor:
            self.anchorlist.append(href)

    def anchor_end(self):
        """This method is called at the end of an anchor region.

        The default implementation adds a textual footnote marker using an
        index into the list of hyperlinks created by the anchor_bgn()
        method.

        """
        if self.anchor:
            self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt, *args):
        """This method is called to handle images.

        The default implementation simply passes the alt value to the
        handle_data() method.

        """
        self.handle_data(alt)

    # --------- Top level elements

    # These structural elements need no processing of their own.

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    # ------ Head elements

    def start_title(self, attrs):
        """Start collecting the document title in the save buffer."""
        self.save_bgn()

    def end_title(self):
        """Store the collected text in the title attribute."""
        self.title = self.save_end()

    def do_base(self, attrs):
        """Record the HREF attribute of <BASE> in the base attribute."""
        for a, v in attrs:
            if a == 'href':
                self.base = v

    def do_isindex(self, attrs):
        """Flag that the document contains an <ISINDEX> element."""
        self.isindex = 1

    def do_link(self, attrs):
        """Ignore <LINK>."""
        pass

    def do_meta(self, attrs):
        """Ignore <META>."""
        pass

    def do_nextid(self, attrs): # Deprecated
        """Ignore <NEXTID>."""
        pass

    # ------ Body elements

    # --- Headings

    def _heading_bgn(self, tag):
        """Shared start handler for <H1>..<H6>; tag is e.g. 'h1'."""
        self.formatter.end_paragraph(1)
        # Same font slot layout as the typographic handlers below: the
        # first slot carries the heading tag, the third (set by start_b)
        # is switched on, the second and fourth are switched off.
        self.formatter.push_font((tag, 0, 1, 0))

    def _heading_end(self):
        """Shared end handler for <H1>..<H6>."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h1(self, attrs):
        self._heading_bgn('h1')

    def end_h1(self):
        self._heading_end()

    def start_h2(self, attrs):
        self._heading_bgn('h2')

    def end_h2(self):
        self._heading_end()

    def start_h3(self, attrs):
        self._heading_bgn('h3')

    def end_h3(self):
        self._heading_end()

    def start_h4(self, attrs):
        self._heading_bgn('h4')

    def end_h4(self):
        self._heading_end()

    def start_h5(self, attrs):
        self._heading_bgn('h5')

    def end_h5(self):
        self._heading_end()

    def start_h6(self, attrs):
        self._heading_bgn('h6')

    def end_h6(self):
        self._heading_end()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        """Handle <P> by ending the current paragraph."""
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        """Open a literal region: switch on the fourth font slot and
        increment the nofill depth."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        self.nofill = self.nofill + 1

    def end_pre(self):
        """Close a literal region; the nofill depth never drops below 0."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()
        self.nofill = max(0, self.nofill - 1)

    def start_xmp(self, attrs):
        """Treat <XMP> like <PRE> and put the parser in literal mode."""
        self.start_pre(attrs)
        self.setliteral('xmp') # Tell SGML parser

    def end_xmp(self):
        self.end_pre()

    def start_listing(self, attrs):
        """Treat <LISTING> like <PRE> and put the parser in literal mode."""
        self.start_pre(attrs)
        self.setliteral('listing') # Tell SGML parser

    def end_listing(self):
        self.end_pre()

    def start_address(self, attrs):
        """Render <ADDRESS> with the second font slot (as in start_i) on."""
        self.formatter.end_paragraph(0)
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_font()

    def start_blockquote(self, attrs):
        """Open a 'blockquote' margin."""
        self.formatter.end_paragraph(1)
        self.formatter.push_margin('blockquote')

    def end_blockquote(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_margin()

    # --- List Elements

    def _list_end(self):
        """Shared end handler for <UL> and <OL>.

        Pops the innermost list_stack entry (if any), ends the paragraph
        (passing a true value only when no list remains open) and pops
        the margin pushed by the matching start handler.

        """
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_ul(self, attrs):
        """Open an unordered list whose items are labelled '*'."""
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        self._list_end()

    def do_li(self, attrs):
        """Emit the label for a list item.

        Inside a list the innermost entry's counter is incremented and
        passed to the formatter together with that list's label; outside
        any list the label '*' with counter 0 is used.

        """
        self.formatter.end_paragraph(0)
        if self.list_stack:
            top = self.list_stack[-1]
            top[2] = top[2] + 1
            label, counter = top[1], top[2]
        else:
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        """Open an ordered list.

        The label format defaults to '1.'.  A TYPE attribute overrides
        it; a one-character value gets a trailing '.' appended.

        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for a, v in attrs:
            if a == 'type':
                if len(v) == 1:
                    v = v + '.'
                label = v
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        self._list_end()

    def start_menu(self, attrs):
        """<MENU> is handled exactly like <UL>."""
        self.start_ul(attrs)

    def end_menu(self):
        self.end_ul()

    def start_dir(self, attrs):
        """<DIR> is handled exactly like <UL>."""
        self.start_ul(attrs)

    def end_dir(self):
        self.end_ul()

    def start_dl(self, attrs):
        """Open a definition list."""
        self.formatter.end_paragraph(1)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        """Close any open <DD> and then the definition list itself."""
        self.ddpop(1)
        if self.list_stack:
            del self.list_stack[-1]

    def do_dt(self, attrs):
        """Start a definition term; closes a preceding <DD> if one is open."""
        self.ddpop()

    def do_dd(self, attrs):
        """Start a definition body with its own 'dd' margin."""
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self, bl=0):
        """End the paragraph and close the innermost <DD>, if one is open.

        bl is passed through to formatter.end_paragraph().

        """
        self.formatter.end_paragraph(bl)
        if self.list_stack and self.list_stack[-1][0] == 'dd':
            del self.list_stack[-1]
            self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    # Each of these delegates to the typographic handler (<I>, <B> or
    # <TT>) it is rendered as.

    def start_cite(self, attrs): self.start_i(attrs)
    def end_cite(self): self.end_i()

    def start_code(self, attrs): self.start_tt(attrs)
    def end_code(self): self.end_tt()

    def start_em(self, attrs): self.start_i(attrs)
    def end_em(self): self.end_i()

    def start_kbd(self, attrs): self.start_tt(attrs)
    def end_kbd(self): self.end_tt()

    def start_samp(self, attrs): self.start_tt(attrs)
    def end_samp(self): self.end_tt()

    def start_strong(self, attrs): self.start_b(attrs)
    def end_strong(self): self.end_b()

    def start_var(self, attrs): self.start_i(attrs)
    def end_var(self): self.end_i()

    # Typographic Elements

    # Each handler switches on one slot of the four-slot font tuple and
    # leaves the others AS_IS: <I> the second, <B> the third, <TT> the
    # fourth.

    def start_i(self, attrs):
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
    def end_i(self):
        self.formatter.pop_font()

    def start_b(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
    def end_b(self):
        self.formatter.pop_font()

    def start_tt(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
    def end_tt(self):
        self.formatter.pop_font()

    def start_a(self, attrs):
        """Extract HREF, NAME and TYPE from <A> and call anchor_bgn().

        Every attribute value is stripped of surrounding whitespace; TYPE
        is additionally lower-cased.  Attributes that are absent are
        passed as ''.

        """
        href = ''
        name = ''
        type_ = ''    # trailing underscore avoids shadowing the builtin
        for attrname, value in attrs:
            value = value.strip()
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                type_ = value.lower()
        self.anchor_bgn(href, name, type_)

    def end_a(self):
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        """Handle <BR> by adding a line break."""
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        """Handle <HR> by adding a horizontal rule."""
        self.formatter.add_hor_rule()

    # --- Image

    def do_img(self, attrs):
        """Collect the <IMG> attributes and call handle_image().

        Defaults are '' for ALIGN, ISMAP and SRC, '(image)' for ALT and 0
        for WIDTH and HEIGHT.  WIDTH/HEIGHT values that are not valid
        integers are ignored, leaving the default in place.

        """
        align = ''
        alt = '(image)'
        ismap = ''
        src = ''
        width = 0
        height = 0
        for attrname, value in attrs:
            if attrname == 'align':
                align = value
            elif attrname == 'alt':
                alt = value
            elif attrname == 'ismap':
                ismap = value
            elif attrname == 'src':
                src = value
            elif attrname == 'width':
                try:
                    width = int(value)
                except ValueError:
                    pass    # keep the default for malformed values
            elif attrname == 'height':
                try:
                    height = int(value)
                except ValueError:
                    pass    # keep the default for malformed values
        self.handle_image(src, alt, ismap, align, width, height)

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        """Treat everything after <PLAINTEXT> as preformatted text."""
        self.start_pre(attrs)
        self.setnomoretags() # Tell SGML parser

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        """Ignore start tags that have no handler."""
        pass

    def unknown_endtag(self, tag):
        """Ignore end tags that have no handler."""
        pass
|
449 |
|
450 |
|
def test(args=None):
    """Command-line test driver: parse one HTML file and render it.

    args is the argument list to use; when it is empty or None,
    sys.argv[1:] is used instead.  The list passed in is never modified.

    Recognised arguments:
        -s          (first argument only) render with a NullFormatter
                    instead of an AbstractFormatter over a DumbWriter
        <filename>  file to parse; '-' reads standard input; defaults
                    to 'test.html'

    If the file cannot be opened, the error is printed and the process
    exits with status 1.

    """
    import sys
    import formatter

    # Work on a private copy so that stripping '-s' below cannot modify
    # a list owned by the caller (sys.argv[1:] is already a fresh list).
    args = list(args) if args else sys.argv[1:]

    silent = bool(args) and args[0] == '-s'
    if silent:
        del args[0]

    filename = args[0] if args else 'test.html'

    if filename == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(filename, 'r')
        except IOError as msg:
            # Same text the print statement "print file, ':', msg" emitted.
            print("%s : %s" % (filename, msg))
            sys.exit(1)
        # Only the open() is guarded above; read errors still propagate,
        # but the file is now closed on the way out.
        try:
            data = f.read()
        finally:
            f.close()

    if silent:
        fmt = formatter.NullFormatter()
    else:
        fmt = formatter.AbstractFormatter(formatter.DumbWriter())

    parser = HTMLParser(fmt)
    parser.feed(data)
    parser.close()
|
488 |
|
489 |
|
# Run the command-line test driver when this file is executed as a script.
if __name__ == '__main__':
    test()