|
1 """ |
|
2 SAX driver for the pyexpat C module. This driver works with |
|
3 pyexpat.__version__ == '2.22'. |
|
4 """ |
|
5 |
|
6 version = "0.20" |
|
7 |
|
8 from xml.sax._exceptions import * |
|
9 from xml.sax.handler import feature_validation, feature_namespaces |
|
10 from xml.sax.handler import feature_namespace_prefixes |
|
11 from xml.sax.handler import feature_external_ges, feature_external_pes |
|
12 from xml.sax.handler import feature_string_interning |
|
13 from xml.sax.handler import property_xml_string, property_interning_dict |
|
14 |
|
# Jython ships an xml.parsers.expat module that imports without raising
# ImportError, so the Java platform has to be detected explicitly.
import sys
if sys.platform.startswith("java"):
    raise SAXReaderNotAvailable("expat not available in Java", None)
del sys

# Refuse to load when pyexpat is missing or is too incomplete to create
# parsers; callers see SAXReaderNotAvailable in both cases.
try:
    from xml.parsers import expat
except ImportError:
    raise SAXReaderNotAvailable("expat not supported", None)
else:
    if not hasattr(expat, "ParserCreate"):
        raise SAXReaderNotAvailable("expat not supported", None)
from xml.sax import xmlreader, saxutils, handler

# Short module-level aliases for the attribute containers.
AttributesNSImpl = xmlreader.AttributesNSImpl
AttributesImpl = xmlreader.AttributesImpl

# Where weak references are available they are used to avoid a reference
# cycle between the parser and the content handler; otherwise _mkproxy
# simply hands back the object it was given.
try:
    import _weakref
except ImportError:
    def _mkproxy(obj):
        return obj
else:
    import weakref
    _mkproxy = weakref.proxy
    del weakref, _weakref
|
45 |
|
46 # --- ExpatLocator |
|
47 |
|
class ExpatLocator(xmlreader.Locator):
    """Locator for use with the ExpatParser class.

    This uses a weak reference to the parser object to avoid creating
    a circular reference between the parser and the content handler.

    Because the reference is a proxy, it never compares identical to
    None; once the parser has been garbage collected, touching the proxy
    raises ReferenceError instead.  Every getter therefore treats
    ReferenceError the same way as "no parser available".
    """
    def __init__(self, parser):
        # _mkproxy returns a weak proxy when weak references are
        # available, otherwise the parser object itself.
        self._ref = _mkproxy(parser)

    def getColumnNumber(self):
        """Return the expat error column, or None without a live parser."""
        try:
            expat_parser = self._ref._parser
        except ReferenceError:
            # The owning ExpatParser has been collected.
            return None
        if expat_parser is None:
            return None
        return expat_parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return the expat error line, or 1 without a live parser."""
        try:
            expat_parser = self._ref._parser
        except ReferenceError:
            # The owning ExpatParser has been collected.
            return 1
        if expat_parser is None:
            return 1
        return expat_parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public id of the parser's current input source."""
        parser = self._ref
        if parser is None:
            # Only reachable when no weak proxy was created.
            return None
        try:
            return parser._source.getPublicId()
        except ReferenceError:
            return None

    def getSystemId(self):
        """Return the system id of the parser's current input source."""
        parser = self._ref
        if parser is None:
            # Only reachable when no weak proxy was created.
            return None
        try:
            return parser._source.getSystemId()
        except ReferenceError:
            return None
|
80 |
|
81 |
|
82 # --- ExpatParser |
|
83 |
|
class _ClosedParser(object):
    """Stand-in stored in ExpatParser._parser after a failed close().

    It carries only ErrorLineNumber and ErrorColumnNumber, so the locator
    methods keep reporting the last position while the real expat parser
    (whose handlers point back at the ExpatParser) is dropped.
    """


class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
    """SAX driver for the pyexpat C module.

    The object is both the incremental parser and its own Locator.  A
    fresh expat parser is created by reset(), which feed() calls
    automatically when no parse is in progress.
    """

    def __init__(self, namespaceHandling=0, bufsize=2**16-20):
        """Create a driver.

        namespaceHandling -- initial value of the namespaces feature.
        bufsize -- passed through to IncrementalParser.__init__.
        """
        xmlreader.IncrementalParser.__init__(self, bufsize)
        self._source = xmlreader.InputSource()
        self._parser = None
        self._namespaces = namespaceHandling
        self._lex_handler_prop = None
        self._parsing = 0
        # (parser, source) pairs saved while an external entity is parsed
        self._entity_stack = []
        self._external_ges = 1
        self._interning = None

    # XMLReader methods

    def parse(self, source):
        "Parse an XML document from a URL or an InputSource."
        source = saxutils.prepare_input_source(source)

        self._source = source
        self.reset()
        self._cont_handler.setDocumentLocator(ExpatLocator(self))
        xmlreader.IncrementalParser.parse(self, source)

    def prepareParser(self, source):
        """Tell expat the base for resolving relative system ids."""
        if source.getSystemId() is not None:
            self._parser.SetBase(source.getSystemId())

    # Redefined setContentHandler to allow changing handlers during parsing

    def setContentHandler(self, handler):
        """Install a content handler, rebinding expat callbacks mid-parse."""
        xmlreader.IncrementalParser.setContentHandler(self, handler)
        if self._parsing:
            self._reset_cont_handler()

    def getFeature(self, name):
        """Return the state of a SAX feature.

        Raises SAXNotRecognizedException for unknown feature names.
        """
        if name == feature_namespaces:
            return self._namespaces
        elif name == feature_string_interning:
            return self._interning is not None
        elif name in (feature_validation, feature_external_pes,
                      feature_namespace_prefixes):
            return 0
        elif name == feature_external_ges:
            return self._external_ges
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set a SAX feature.

        Raises SAXNotSupportedException while parsing or when asked to
        enable a feature expat cannot provide, and
        SAXNotRecognizedException for unknown feature names.
        """
        if self._parsing:
            raise SAXNotSupportedException("Cannot set features while parsing")

        if name == feature_namespaces:
            self._namespaces = state
        elif name == feature_external_ges:
            self._external_ges = state
        elif name == feature_string_interning:
            if state:
                # Keep an existing interning dictionary if there is one.
                if self._interning is None:
                    self._interning = {}
            else:
                self._interning = None
        elif name == feature_validation:
            if state:
                raise SAXNotSupportedException(
                    "expat does not support validation")
        elif name == feature_external_pes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not read external parameter entities")
        elif name == feature_namespace_prefixes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not report namespace prefixes")
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of a SAX property.

        The XML-string property is only available while a real expat
        parser exists; otherwise SAXNotSupportedException is raised.
        """
        if name == handler.property_lexical_handler:
            return self._lex_handler_prop
        elif name == property_interning_dict:
            return self._interning
        elif name == property_xml_string:
            parser = self._parser
            if parser is None or isinstance(parser, _ClosedParser):
                raise SAXNotSupportedException(
                    "XML string cannot be returned when not parsing")
            if hasattr(parser, "GetInputContext"):
                return parser.GetInputContext()
            raise SAXNotRecognizedException(
                "This version of expat does not support getting"
                " the XML string")
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set a SAX property; the XML-string property is read-only."""
        if name == handler.property_lexical_handler:
            self._lex_handler_prop = value
            if self._parsing:
                self._reset_lex_handler_prop()
        elif name == property_interning_dict:
            self._interning = value
        elif name == property_xml_string:
            raise SAXNotSupportedException("Property '%s' cannot be set" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    # IncrementalParser methods

    def feed(self, data, isFinal = 0):
        """Pass a chunk of the document to expat.

        Starts a new document (reset + startDocument) when no parse is in
        progress.  expat errors are wrapped in SAXParseException and
        handed to the error handler's fatalError().
        """
        if not self._parsing:
            self.reset()
            self._parsing = 1
            self._cont_handler.startDocument()

        try:
            # The isFinal parameter is internal to the expat reader.
            # When feeding chunks, they are not normally final -
            # except when invoked from close.
            self._parser.Parse(data, isFinal)
        except expat.error as e:
            exc = SAXParseException(expat.ErrorString(e.code), e, self)
            # FIXME: when to invoke error()?
            self._err_handler.fatalError(exc)

    def close(self):
        """Finish the document and release the expat parser.

        Does nothing while an external entity is being completed, and
        nothing when there is no open parser, so calling close() twice
        does not start (and fail) a second, empty document.
        """
        if (self._entity_stack or self._parser is None or
                isinstance(self._parser, _ClosedParser)):
            return
        try:
            self.feed("", isFinal = 1)
            self._cont_handler.endDocument()
            self._parsing = 0
            # break cycle created by expat handlers pointing to our methods
            self._parser = None
        finally:
            self._parsing = 0
            if self._parser is not None:
                # close() failed: drop the real parser but remember where
                # it stopped so the locator methods still report it.
                closed = _ClosedParser()
                closed.ErrorColumnNumber = self._parser.ErrorColumnNumber
                closed.ErrorLineNumber = self._parser.ErrorLineNumber
                self._parser = closed

    def _reset_cont_handler(self):
        # These two callbacks go straight to the content handler, so they
        # must be rebound whenever the handler changes.
        self._parser.ProcessingInstructionHandler = \
                                    self._cont_handler.processingInstruction
        self._parser.CharacterDataHandler = self._cont_handler.characters

    def _reset_lex_handler_prop(self):
        # Bind (or clear) the expat callbacks that serve the lexical
        # handler property.
        lex = self._lex_handler_prop
        parser = self._parser
        if lex is None:
            parser.CommentHandler = None
            parser.StartCdataSectionHandler = None
            parser.EndCdataSectionHandler = None
            parser.StartDoctypeDeclHandler = None
            parser.EndDoctypeDeclHandler = None
        else:
            parser.CommentHandler = lex.comment
            parser.StartCdataSectionHandler = lex.startCDATA
            parser.EndCdataSectionHandler = lex.endCDATA
            parser.StartDoctypeDeclHandler = self.start_doctype_decl
            parser.EndDoctypeDeclHandler = lex.endDTD

    def reset(self):
        """Create a fresh expat parser and wire up all callbacks."""
        if self._namespaces:
            # With a separator expat reports names as
            # "uri local [prefix]" joined by that separator.
            self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
                                              intern=self._interning)
            self._parser.namespace_prefixes = 1
            self._parser.StartElementHandler = self.start_element_ns
            self._parser.EndElementHandler = self.end_element_ns
        else:
            self._parser = expat.ParserCreate(self._source.getEncoding(),
                                              intern = self._interning)
            self._parser.StartElementHandler = self.start_element
            self._parser.EndElementHandler = self.end_element

        self._reset_cont_handler()
        self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
        self._parser.NotationDeclHandler = self.notation_decl
        self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
        self._parser.EndNamespaceDeclHandler = self.end_namespace_decl

        self._decl_handler_prop = None
        if self._lex_handler_prop:
            self._reset_lex_handler_prop()
#         self._parser.DefaultHandler =
#         self._parser.DefaultHandlerExpand =
#         self._parser.NotStandaloneHandler =
        self._parser.ExternalEntityRefHandler = self.external_entity_ref
        try:
            self._parser.SkippedEntityHandler = self.skipped_entity_handler
        except AttributeError:
            # This pyexpat does not support SkippedEntity
            pass
        self._parser.SetParamEntityParsing(
            expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)

        self._parsing = 0
        self._entity_stack = []

    # Locator methods

    def getColumnNumber(self):
        """Return expat's error column, or None when there is no parser."""
        if self._parser is None:
            return None
        return self._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return expat's error line, or 1 when there is no parser."""
        if self._parser is None:
            return 1
        return self._parser.ErrorLineNumber

    def getPublicId(self):
        return self._source.getPublicId()

    def getSystemId(self):
        return self._source.getSystemId()

    # event handlers
    def start_element(self, name, attrs):
        self._cont_handler.startElement(name, AttributesImpl(attrs))

    def end_element(self, name):
        self._cont_handler.endElement(name)

    def _split_ns_name(self, name):
        # Turn an expat namespace-mode element name into a
        # (uri, localname) pair.
        parts = name.split()
        if len(parts) == 1:
            # no namespace
            return (None, name)
        if len(parts) == 3:
            # uri, localname, prefix: the prefix is not part of the pair
            return parts[0], parts[1]
        # default namespace
        return tuple(parts)

    def start_element_ns(self, name, attrs):
        """Report a start tag in namespace mode via startElementNS."""
        pair = self._split_ns_name(name)

        newattrs = {}
        qnames = {}
        for (aname, value) in attrs.items():
            parts = aname.split()
            length = len(parts)
            if length == 1:
                # no namespace
                qname = aname
                apair = (None, aname)
            elif length == 3:
                qname = "%s:%s" % (parts[2], parts[1])
                apair = parts[0], parts[1]
            else:
                # default namespace
                qname = parts[1]
                apair = tuple(parts)

            newattrs[apair] = value
            qnames[apair] = qname

        self._cont_handler.startElementNS(pair, None,
                                          AttributesNSImpl(newattrs, qnames))

    def end_element_ns(self, name):
        """Report an end tag in namespace mode via endElementNS."""
        self._cont_handler.endElementNS(self._split_ns_name(name), None)

    # this is not used (call directly to ContentHandler)
    def processing_instruction(self, target, data):
        self._cont_handler.processingInstruction(target, data)

    # this is not used (call directly to ContentHandler)
    def character_data(self, data):
        self._cont_handler.characters(data)

    def start_namespace_decl(self, prefix, uri):
        self._cont_handler.startPrefixMapping(prefix, uri)

    def end_namespace_decl(self, prefix):
        self._cont_handler.endPrefixMapping(prefix)

    def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
        # expat passes sysid before pubid; startDTD wants pubid first.
        self._lex_handler_prop.startDTD(name, pubid, sysid)

    def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
        self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)

    def notation_decl(self, name, base, sysid, pubid):
        self._dtd_handler.notationDecl(name, pubid, sysid)

    def external_entity_ref(self, context, base, sysid, pubid):
        """Resolve and parse an external entity with a child expat parser.

        Returns 1 on success (or when external general entities are
        disabled) and 0 when parsing the entity raised.
        """
        if not self._external_ges:
            return 1

        source = self._ent_handler.resolveEntity(pubid, sysid)
        source = saxutils.prepare_input_source(source,
                                               self._source.getSystemId() or
                                               "")

        # Swap in the entity's parser and source for the nested parse.
        self._entity_stack.append((self._parser, self._source))
        self._parser = self._parser.ExternalEntityParserCreate(context)
        self._source = source

        try:
            xmlreader.IncrementalParser.parse(self, source)
        except:
            # NOTE(review): on failure the saved parser/source are left on
            # the entity stack and not restored; behaviour kept as-is.
            return 0  # FIXME: save error info here?

        (self._parser, self._source) = self._entity_stack.pop()
        return 1

    def skipped_entity_handler(self, name, is_pe):
        if is_pe:
            # The SAX spec requires to report skipped PEs with a '%'
            name = '%'+name
        self._cont_handler.skippedEntity(name)
|
401 |
|
402 # --- |
|
403 |
|
def create_parser(*args, **kwargs):
    """Build an ExpatParser, forwarding all arguments to its constructor."""
    parser = ExpatParser(*args, **kwargs)
    return parser
|
406 |
|
407 # --- |
|
408 |
|
if __name__ == "__main__":
    # Small manual demo: parse a document and echo it back as XML.
    # XMLGenerator lives in xml.sax.saxutils; it is not exported by the
    # xml.sax package itself, so it must be imported from there.
    import xml.sax.saxutils
    p = create_parser()
    p.setContentHandler(xml.sax.saxutils.XMLGenerator())
    p.setErrorHandler(xml.sax.ErrorHandler())
    p.parse("../../../hamlet.xml")