|
"""Shared support for scanning document type declarations in HTML and XHTML.

This module is used as a foundation for the HTMLParser and sgmllib
modules (indirectly, for htmllib as well).  It has no documented
public API and should not be used directly.

"""
|
8 |
|
9 import re |
|
10 |
|
# Token matchers used while scanning declarations.  Each one is the bound
# ``match`` method of a compiled pattern and also swallows any whitespace
# that trails the token.
_declname_match = re.compile(
    r'[a-zA-Z][-_.a-zA-Z0-9]*\s*'
).match
_declstringlit_match = re.compile(
    r'(\'[^\']*\'|"[^"]*")\s*'
).match

# Closing delimiters searched for when skipping comments ("-->") and
# standard marked sections ("]]>"); whitespace may separate the pieces.
_commentclose = re.compile(r'--\s*>')
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf
#
# MS Office style marked sections are closed by "]>" rather than "]]>".
_msmarkedsectionclose = re.compile(r']\s*>')

# The compiled patterns are all that is needed; drop the module name.
del re
|
22 |
|
23 |
|
class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The class cannot be instantiated directly.  The scanning methods read
    the buffer from ``self.rawdata`` and call ``self.handle_decl()`` and
    ``self.handle_comment()``; none of these is defined here, so a
    subclass has to supply them.  ``error()`` must be overridden as well.

    Scanning methods take an index into ``self.rawdata`` and return the
    index just past the construct they consumed, or a negative value when
    the construct is not (yet) complete in the buffer.

    ``error()`` is expected to raise.  Should an override return instead,
    every scanning method gives up on the construct and returns -1 rather
    than continuing with inconsistent state.
    """

    def __init__(self):
        """Refuse direct instantiation of the base class."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse error; subclasses must override this."""
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position bookkeeping to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Account for rawdata[i:j] in lineno/offset and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            # Offset restarts after the last newline in the span.
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters tolerated inside a declaration; empty by default.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!..." construct starting at index i.

        Returns the index just past the construct, or a negative value
        if it is incomplete.  A doctype declaration is passed to
        handle_decl(), any other named declaration to unknown_decl().
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+2] == '--':  # comment
            # Locate --.*-- as the body of the comment.  This test has to
            # come before the single-dash test below, which would
            # otherwise swallow every comment and make this unreachable.
            return self.parse_comment(i)
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j] == '[':  # marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)
        # all other declaration elements
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in ("attlist", "linktype", "link", "element"):
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    self.error("unsupported '[' char in %s declaration" % decltype)
                    # j did not advance; bail out instead of looping forever.
                    return -1
                else:
                    self.error("unexpected '[' char in declaration")
                    return -1
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
                return -1
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the "<![" marked section starting at index i.

        Returns the index just past the closing delimiter, or -1 if the
        section is unterminated or its status keyword is not recognised.
        When report is true the section text is passed to unknown_decl().
        """
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
        sectName, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sectName in ("if", "else", "endif"):
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
            # No terminator was searched for, so there is nothing to
            # report; without this return `match` would be unbound.
            return -1
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the "<!--" comment starting at index i.

        Returns the index just past the closing "-->", or -1 if the
        comment is unterminated.  When report is true the comment text is
        passed to handle_comment().
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
            return -1
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the doctype internal subset that starts at index i.

        declstartpos is the index of the enclosing declaration; it is
        used for position updates before an error is reported.  Returns
        the index of the '>' that follows the closing ']', or -1 if the
        subset is incomplete.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                    return -1
                if (j + 4) > n:
                    # end of buffer; incomplete (too short to tell a
                    # comment from a named declaration)
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                    # No handler exists for this name.
                    return -1
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j >= n:
                    return -1
                if rawdata[j] == ">":
                    return j
                self.updatepos(declstartpos, j)
                self.error("unexpected char after internal subset")
                return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
                # j did not advance; bail out instead of looping forever.
                return -1
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration whose name starts at index i.

        Returns the index just past the next '>', or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration whose name starts at index i.

        Returns the index just past the closing '>', or -1 if incomplete.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return -1
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while True:
            # scan a series of attribute descriptions; simplified:
            # name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return -1
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if j + 1 == n:
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration whose name starts at index i.

        Returns the index just past the closing '>', or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while True:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration whose body starts at index i.

        A leading '%' and the whitespace after it are skipped first.
        Returns the index just past the closing '>', or -1 if incomplete.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while True:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while True:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token; return the token and the new position,
    # or (None, -1) if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan a name token (and trailing whitespace) at index i.

        Returns (lower-cased name, index past the token), or (None, -1)
        when the token runs up to the end of the buffer and so might be
        incomplete.  If no name starts at i, error() is called; should
        that return, (None, -1) is returned as well.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        self.updatepos(declstartpos, i)
        self.error("expected name token at %r"
                   % rawdata[declstartpos:declstartpos+20])
        # Keep the (name, position) shape so callers can always unpack.
        return None, -1

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Handle an unrecognised declaration; does nothing by default."""
        pass