|
1 """An XML Reader is the SAX 2 name for an XML parser. XML Parsers |
|
2 should be based on this code. """ |
|
3 |
|
4 import handler |
|
5 |
|
6 from _exceptions import SAXNotSupportedException, SAXNotRecognizedException |
|
7 |
|
8 |
|
9 # ===== XMLREADER ===== |
|
10 |
|
class XMLReader:
    """Callback-based interface for reading an XML document.

    A SAX2 driver for an XML parser implements this interface. It lets
    an application query and change parser features and properties,
    register the handlers that receive parse events, and start a parse.

    SAX interfaces are assumed to be synchronous: parse must not return
    before parsing has finished, and a reader must wait for each
    event-handler callback to return before it reports the next event.
    """

    def __init__(self):
        # Every handler slot starts out holding a default instance taken
        # from the handler module, so the getters never return an unset
        # attribute before the application registers its own objects.
        self._cont_handler = handler.ContentHandler()
        self._dtd_handler = handler.DTDHandler()
        self._ent_handler = handler.EntityResolver()
        self._err_handler = handler.ErrorHandler()

    def parse(self, source):
        """Parse an XML document from a system identifier or an InputSource.

        The base class has no parser behind it and always raises
        NotImplementedError; concrete readers override this method.
        """
        raise NotImplementedError("This method must be implemented!")

    # ----- handler registration -----

    def getContentHandler(self):
        """Return the currently registered ContentHandler."""
        return self._cont_handler

    def setContentHandler(self, handler):
        """Register the object that will receive document content events."""
        self._cont_handler = handler

    def getDTDHandler(self):
        """Return the currently registered DTD handler."""
        return self._dtd_handler

    def setDTDHandler(self, handler):
        """Register the object that will receive basic DTD-related events."""
        self._dtd_handler = handler

    def getEntityResolver(self):
        """Return the currently registered EntityResolver."""
        return self._ent_handler

    def setEntityResolver(self, resolver):
        """Register the object that will resolve external entities."""
        self._ent_handler = resolver

    def getErrorHandler(self):
        """Return the currently registered ErrorHandler."""
        return self._err_handler

    def setErrorHandler(self, handler):
        """Register the object that will receive error-message events."""
        self._err_handler = handler

    # ----- locale, features and properties -----

    def setLocale(self, locale):
        """Let an application choose the locale for errors and warnings.

        SAX parsers need not localize their errors and warnings, but a
        parser that cannot honour the requested locale must raise a SAX
        exception. Applications may ask for a locale change in the
        middle of a parse.

        This base implementation supports no locale and always raises
        SAXNotSupportedException.
        """
        message = "Locale support not implemented"
        raise SAXNotSupportedException(message)

    def getFeature(self, name):
        """Look up the state of a SAX2 feature.

        The base class recognizes no features, so every name raises
        SAXNotRecognizedException.
        """
        message = "Feature '%s' not recognized" % name
        raise SAXNotRecognizedException(message)

    def setFeature(self, name, state):
        """Set the state of a SAX2 feature.

        The base class recognizes no features, so every name raises
        SAXNotRecognizedException.
        """
        message = "Feature '%s' not recognized" % name
        raise SAXNotRecognizedException(message)

    def getProperty(self, name):
        """Look up the value of a SAX2 property.

        The base class recognizes no properties, so every name raises
        SAXNotRecognizedException.
        """
        message = "Property '%s' not recognized" % name
        raise SAXNotRecognizedException(message)

    def setProperty(self, name, value):
        """Set the value of a SAX2 property.

        The base class recognizes no properties, so every name raises
        SAXNotRecognizedException.
        """
        message = "Property '%s' not recognized" % name
        raise SAXNotRecognizedException(message)
|
90 |
|
class IncrementalParser(XMLReader):
    """XMLReader extension for parsers that can be fed data in pieces.

    The interface adds the feed, close and reset methods to XMLReader.
    Supporting it is optional, since not every underlying XML parser can
    parse incrementally.

    A freshly created parser is immediately ready to accept data through
    feed. Once parsing has been finished with a call to close, reset
    must be called before the parser accepts new data, whether through
    feed or through parse.

    These methods must _not_ be called during parsing, that is, after
    parse has been called and before it returns.

    As a convenience to SAX 2.0 driver writers, the class implements the
    XMLReader parse method on top of prepareParser, feed and close.
    """

    def __init__(self, bufsize=2**16):
        """Create the parser.

        bufsize is the size argument passed to each read() call that
        parse makes on the input stream.
        """
        self._bufsize = bufsize
        XMLReader.__init__(self)

    def parse(self, source):
        """Parse a whole document by feeding it to the parser in chunks.

        source is passed through saxutils.prepare_input_source, the
        result is handed to prepareParser, and its byte stream is then
        read bufsize units at a time into feed until the stream is
        exhausted, after which close is called.
        """
        import saxutils
        source = saxutils.prepare_input_source(source)

        self.prepareParser(source)
        stream = source.getByteStream()
        chunk = stream.read(self._bufsize)
        # Test truthiness instead of comparing against "": a stream that
        # yields bytes signals end-of-file with b"", which never compares
        # equal to "" on Python 3, so the old comparison looped forever.
        while chunk:
            self.feed(chunk)
            chunk = stream.read(self._bufsize)
        self.close()

    def feed(self, data):
        """Hand raw XML data to the parser and have it parse that data,
        emitting the corresponding events. XML constructs may be split
        across several calls to feed.

        feed may raise SAXException. The base class raises
        NotImplementedError; drivers must override it.
        """
        raise NotImplementedError("This method must be implemented!")

    def prepareParser(self, source):
        """Hook called by parse so the SAX 2.0 driver can prepare itself
        for parsing the given source.

        The base class raises NotImplementedError; drivers must
        override it.
        """
        raise NotImplementedError("prepareParser must be overridden!")

    def close(self):
        """Tell the parser that the entire document has been passed in
        through feed and that no more data will follow. This lets the
        parser run its final checks on the document and empty its
        internal data buffer.

        The parser is not ready to parse another document until reset
        has been called.

        close may raise SAXException. The base class raises
        NotImplementedError; drivers must override it.
        """
        raise NotImplementedError("This method must be implemented!")

    def reset(self):
        """Reset the parser after close so that it is ready to parse new
        documents. The results of calling parse or feed after close
        without calling reset are undefined.

        The base class raises NotImplementedError; drivers must
        override it.
        """
        raise NotImplementedError("This method must be implemented!")
|
160 |
|
161 # ===== LOCATOR ===== |
|
162 |
|
class Locator:
    """Interface that ties a SAX event to a position in the document.

    A locator returns valid results only during calls to
    DocumentHandler methods; at any other time the results are
    unpredictable.

    The methods of this base class return fixed placeholder values
    (-1 for positions, None for identifiers).
    """

    def getColumnNumber(self):
        """Return the column number where the current event ends.

        This base implementation always returns -1.
        """
        return -1

    def getLineNumber(self):
        """Return the line number where the current event ends.

        This base implementation always returns -1.
        """
        return -1

    def getPublicId(self):
        """Return the public identifier for the current event.

        This base implementation always returns None.
        """
        return None

    def getSystemId(self):
        """Return the system identifier for the current event.

        This base implementation always returns None.
        """
        return None
|
184 |
|
185 # ===== INPUTSOURCE ===== |
|
186 |
|
class InputSource:
    """Bundle of everything an XMLReader needs in order to read an entity.

    An instance may carry a public identifier, a system identifier, a
    byte stream (possibly with character encoding information) and/or a
    character stream for the entity.

    Applications create these objects to pass to XMLReader.parse and to
    return from EntityResolver.resolveEntity.

    An InputSource belongs to the application: the XMLReader may not
    modify InputSource objects the application hands to it, although it
    may make copies and modify those.
    """

    def __init__(self, system_id = None):
        # Only the system identifier can be supplied at construction
        # time; everything else starts unset and is filled in through
        # the setter methods.
        self.__system_id = system_id
        self.__public_id = None
        self.__encoding = None
        self.__bytefile = self.__charfile = None

    # ----- identifiers -----

    def setPublicId(self, public_id):
        """Store the public identifier of this InputSource."""
        self.__public_id = public_id

    def getPublicId(self):
        """Return the public identifier of this InputSource."""
        return self.__public_id

    def setSystemId(self, system_id):
        """Store the system identifier of this InputSource."""
        self.__system_id = system_id

    def getSystemId(self):
        """Return the system identifier of this InputSource."""
        return self.__system_id

    # ----- encoding -----

    def setEncoding(self, encoding):
        """Store the character encoding of this InputSource.

        The encoding must be a string acceptable for an XML encoding
        declaration (see section 4.3.3 of the XML recommendation).

        The encoding attribute is ignored when the InputSource also
        contains a character stream.
        """
        self.__encoding = encoding

    def getEncoding(self):
        """Return the character encoding of this InputSource."""
        return self.__encoding

    # ----- streams -----

    def setByteStream(self, bytefile):
        """Store the byte stream for this input source: a Python
        file-like object that does not perform byte-to-character
        conversion.

        The SAX parser ignores the byte stream when a character stream
        is also specified, but prefers it over opening a URI connection
        itself.

        An application that knows the character encoding of the byte
        stream should set it with the setEncoding method.
        """
        self.__bytefile = bytefile

    def getByteStream(self):
        """Return the byte stream for this input source.

        getEncoding returns the character encoding for this byte
        stream, or None if it is unknown.
        """
        return self.__bytefile

    def setCharacterStream(self, charfile):
        """Store the character stream for this input source. (The
        stream must be a Python 2.0 Unicode-wrapped file-like that
        performs conversion to Unicode strings.)

        When a character stream is specified, the SAX parser ignores
        any byte stream and does not attempt to open a URI connection
        to the system identifier.
        """
        self.__charfile = charfile

    def getCharacterStream(self):
        """Return the character stream for this input source."""
        return self.__charfile
|
273 |
|
274 # ===== ATTRIBUTESIMPL ===== |
|
275 |
|
class AttributesImpl:
    """Attribute collection for parsers that do not process namespaces.

    Wraps a {name: value} mapping and exposes it both through SAX-style
    accessor methods and through a read-only, dictionary-like interface.
    Names and qualified names are the same thing in this implementation.
    """

    def __init__(self, attrs):
        """Non-NS-aware implementation.

        attrs should be of the form {name : value}. The mapping is
        stored by reference, not copied.
        """
        self._attrs = attrs

    def getLength(self):
        """Return the number of attributes."""
        return len(self._attrs)

    def getType(self, name):
        """Return the attribute type, which is always "CDATA" here."""
        return "CDATA"

    def getValue(self, name):
        """Return the value of attribute name; raise KeyError if absent."""
        return self._attrs[name]

    def getValueByQName(self, name):
        """Return the value for qualified name name; raise KeyError if absent."""
        return self._attrs[name]

    def getNameByQName(self, name):
        """Return the name for a qualified name (the same string).

        Raises KeyError if no such attribute exists.
        """
        if name not in self._attrs:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise KeyError(name)
        return name

    def getQNameByName(self, name):
        """Return the qualified name for a name (the same string).

        Raises KeyError if no such attribute exists.
        """
        if name not in self._attrs:
            raise KeyError(name)
        return name

    def getNames(self):
        """Return a list of the attribute names."""
        return list(self._attrs.keys())

    def getQNames(self):
        """Return a list of the qualified attribute names."""
        return list(self._attrs.keys())

    def __len__(self):
        return len(self._attrs)

    def __getitem__(self, name):
        return self._attrs[name]

    def keys(self):
        """Return a list of the attribute names."""
        # list() keeps the return type a real list even when the
        # underlying mapping hands back a view object.
        return list(self._attrs.keys())

    def has_key(self, name):
        """Return whether an attribute called name exists."""
        return name in self._attrs

    def __contains__(self, name):
        # Use the `in` operator like has_key does; dict.has_key does not
        # exist on Python 3 mappings.
        return name in self._attrs

    def get(self, name, alternative=None):
        """Return the value of name, or alternative if it is absent."""
        return self._attrs.get(name, alternative)

    def copy(self):
        """Return a new instance of the same class.

        The new object wraps the same underlying mapping; the mapping
        itself is not duplicated.
        """
        return self.__class__(self._attrs)

    def items(self):
        """Return a list of (name, value) pairs."""
        return list(self._attrs.items())

    def values(self):
        """Return a list of the attribute values."""
        return list(self._attrs.values())
|
338 |
|
339 # ===== ATTRIBUTESNSIMPL ===== |
|
340 |
|
class AttributesNSImpl(AttributesImpl):
    """Namespace-aware attribute collection.

    Attributes are keyed by (ns_uri, lname) pairs; a second mapping
    records the qualified name that belongs to each pair.
    """

    def __init__(self, attrs, qnames):
        """NS-aware implementation.

        attrs should be of the form {(ns_uri, lname): value, ...}.
        qnames of the form {(ns_uri, lname): qname, ...}.
        Both mappings are stored by reference, not copied.
        """
        self._attrs = attrs
        self._qnames = qnames

    def getValueByQName(self, name):
        """Return the value of the attribute whose qualified name is name.

        Raises KeyError if no attribute has that qualified name.
        """
        # The qnames mapping goes from (ns_uri, lname) to qname, so a
        # lookup by qname has to scan its entries.
        for (nsname, qname) in self._qnames.items():
            if qname == name:
                return self._attrs[nsname]

        # Call form of raise: valid on both Python 2 and Python 3.
        raise KeyError(name)

    def getNameByQName(self, name):
        """Return the (ns_uri, lname) pair for the qualified name name.

        Raises KeyError if no attribute has that qualified name.
        """
        for (nsname, qname) in self._qnames.items():
            if qname == name:
                return nsname

        raise KeyError(name)

    def getQNameByName(self, name):
        """Return the qualified name for the (ns_uri, lname) pair name.

        Raises KeyError if the pair is not in the qnames mapping.
        """
        return self._qnames[name]

    def getQNames(self):
        """Return a list of the qualified attribute names."""
        return list(self._qnames.values())

    def copy(self):
        """Return a new instance of the same class.

        The new object wraps the same two underlying mappings; the
        mappings themselves are not duplicated.
        """
        return self.__class__(self._attrs, self._qnames)
|
373 |
|
374 |
|
def _test():
    """Smoke test: check that the base classes can be instantiated."""
    for cls in (XMLReader, IncrementalParser, Locator):
        cls()


# Run the smoke test when this module is executed as a script.
if __name__ == "__main__":
    _test()