|
1 """A parser for HTML and XHTML.""" |
|
2 |
|
3 # This file is based on sgmllib.py, but the API is slightly different. |
|
4 |
|
5 # XXX There should be a way to distinguish between PCDATA (parsed |
|
6 # character data -- the normal case), RCDATA (replaceable character |
|
7 # data -- only char and entity references and end tags are special) |
|
8 # and CDATA (character data -- only end tags are special). |
|
9 |
|
10 |
|
11 import markupbase |
|
12 import re |
|
13 |
|
# Regular expressions used for parsing
#
# Every pattern is written as a raw string so that regex escapes such as
# ``\s`` reach the ``re`` module untouched instead of being interpreted
# (or warned about) as string-literal escapes.

# Next character that interrupts plain character data: '&' or '<'.
interesting_normal = re.compile(r'[&<]')
# In CDATA mode only '<' followed by '/' or by the end of the string counts.
interesting_cdata = re.compile(r'<(/|\Z)')
# '&' followed by a letter or '#': the start of a (possibly unfinished)
# entity or character reference.
incomplete = re.compile(r'&[a-zA-Z#]')

# Named entity reference; group 1 is the name.  The final character class
# consumes one terminating character (';' or anything non-alphanumeric).
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Decimal or hexadecimal character reference plus one terminating character.
charref = re.compile(r'&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# '<' immediately followed by a letter: the opening of a start tag.
starttagopen = re.compile(r'<[a-zA-Z]')
# Terminator of a processing instruction.
piclose = re.compile(r'>')
# Terminator of a comment: '--', optional whitespace, '>'.
commentclose = re.compile(r'--\s*>')
# Tag name.
tagfind = re.compile(r'[a-zA-Z][-.a-zA-Z0-9:_]*')
# One attribute: group 1 is the name, group 2 the optional '= value' part,
# group 3 the value itself (still carrying its quotes when quoted).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')

# Matches a start tag from '<' up to, but not including, its closing
# '>' or '/>'.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
# Closing '>' of an end tag.
endendtag = re.compile(r'>')
# Complete end tag; group 1 is the tag name.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
|
47 |
|
48 |
|
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg    -- the error message (must be truthy).
        lineno -- first item of *position*, or None when unknown.
        offset -- second item of *position* (0-based), or None when unknown;
                  it is rendered 1-based as the column in str().
    """

    def __init__(self, msg, position=(None, None)):
        """Store the message and the (lineno, offset) pair it refers to."""
        assert msg
        # Hand the constructor arguments to the base class so that ``args``
        # is populated; without this, repr() shows nothing useful and the
        # exception cannot be re-created from ``args`` (e.g. by pickle).
        Exception.__init__(self, msg, position)
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by whichever position parts are known."""
        parts = [self.msg]
        if self.lineno is not None:
            parts.append(", at line %d" % self.lineno)
        if self.offset is not None:
            # The stored offset is 0-based; columns are reported 1-based.
            parts.append(", column %d" % (self.offset + 1))
        return "".join(parts)
|
65 |
|
66 |
|
class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # After a start tag for one of these elements the parser switches to
    # CDATA mode (see set_cdata_mode()).
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        """Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the position from getpos()."""
        raise HTMLParseError(message, self.getpos())

    # Source text of the most recently parsed start tag; None until one has
    # been parsed completely.
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self):
        """Scan for '</' (or a trailing '<') only, ignoring '&' and other '<'."""
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Return to normal scanning, where both '&' and '<' are significant."""
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        startswith = rawdata.startswith
        i = 0
        n = len(rawdata)
        while i < n:
            # Everything up to the next interesting character is plain data.
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    # parse_comment() is inherited; presumably it returns
                    # the end index or a negative value like the parsers
                    # defined in this class.
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    # parse_declaration() is inherited as well.
                    k = self.parse_declaration(i)
                elif (i + 1) < n:
                    # A '<' that starts no known construct is literal data.
                    self.handle_data("<")
                    k = i + 1
                else:
                    # Lone '<' at the very end of the buffer: wait for more.
                    break
                if k < 0:
                    # The construct is not complete in the buffer yet.
                    if end:
                        self.error("EOF in middle of construct")
                    break
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    # Strip the leading '&#' and the one terminating char.
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        # The terminator was not ';', so it is not part of
                        # the reference; leave it for the next iteration.
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        # Same as above: only ';' is consumed with the ref.
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            # At EOF whatever is still unparsed is delivered as data.
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # Keep the unprocessed tail for the next feed()/close().
        self.rawdata = rawdata[i:]

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        # The handler receives the text between '<?' and the closing '>'.
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute without '= value'.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Quoted value: drop the quotes and resolve references.
                # Bare (unquoted) values are passed through unchanged.
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # The error position is taken from self.getpos() inside error().
            self.error("junk characters in start tag: %r"
                       % (rawdata[k:endpos][:20],))
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                # NOTE(review): nextchar == "/" makes the test above always
                # true, so the two statements below are never reached.
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            self.error("malformed start tag")
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            self.error("bad end tag: %r" % (rawdata[i:j],))
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        # Any end tag switches scanning back to normal mode.
        self.clear_cdata_mode()
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        """Report *data* as an unknown declaration through error()."""
        self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    # Name -> character table shared by all instances; built on first use.
    entitydefs = None

    def unescape(self, s):
        """Return *s* with entity and character references replaced.

        Named references that are not in the entity table are left as they
        are.  Numeric references that cannot be converted (no valid digits,
        or a code point unichr() rejects) are left as they are too, instead
        of letting the conversion error escape to the caller.
        """
        if '&' not in s:
            return s

        def replaceEntities(match):
            ref = match.group(1)
            if ref[0] == "#":
                digits = ref[1:]
                try:
                    if digits[0] in ('x', 'X'):
                        code = int(digits[1:], 16)
                    else:
                        code = int(digits)
                    return unichr(code)
                except (ValueError, OverflowError):
                    # The pattern below is loose enough to admit text such
                    # as '&#abc;' or '&#x;'; keep it verbatim.
                    return '&' + ref + ';'
            # Cannot use name2codepoint directly, because HTMLParser supports
            # apos, which is not part of HTML 4
            import htmlentitydefs
            if HTMLParser.entitydefs is None:
                # Build the whole table before publishing it on the class so
                # a failure part-way never leaves a partial table behind.
                table = {'apos': u"'"}
                for name, codepoint in htmlentitydefs.name2codepoint.iteritems():
                    table[name] = unichr(codepoint)
                HTMLParser.entitydefs = table
            try:
                return self.entitydefs[ref]
            except KeyError:
                return '&' + ref + ';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)