|
1 # |
|
2 # ElementTree |
|
3 # $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $ |
|
4 # |
|
5 # light-weight XML support for Python 1.5.2 and later. |
|
6 # |
|
7 # history: |
|
8 # 2001-10-20 fl created (from various sources) |
|
9 # 2001-11-01 fl return root from parse method |
|
10 # 2002-02-16 fl sort attributes in lexical order |
|
11 # 2002-04-06 fl TreeBuilder refactoring, added PythonDoc markup |
|
12 # 2002-05-01 fl finished TreeBuilder refactoring |
|
13 # 2002-07-14 fl added basic namespace support to ElementTree.write |
|
14 # 2002-07-25 fl added QName attribute support |
|
15 # 2002-10-20 fl fixed encoding in write |
|
16 # 2002-11-24 fl changed default encoding to ascii; fixed attribute encoding |
|
17 # 2002-11-27 fl accept file objects or file names for parse/write |
|
18 # 2002-12-04 fl moved XMLTreeBuilder back to this module |
|
19 # 2003-01-11 fl fixed entity encoding glitch for us-ascii |
|
20 # 2003-02-13 fl added XML literal factory |
|
21 # 2003-02-21 fl added ProcessingInstruction/PI factory |
|
22 # 2003-05-11 fl added tostring/fromstring helpers |
|
23 # 2003-05-26 fl added ElementPath support |
|
24 # 2003-07-05 fl added makeelement factory method |
|
25 # 2003-07-28 fl added more well-known namespace prefixes |
|
26 # 2003-08-15 fl fixed typo in ElementTree.findtext (Thomas Dartsch) |
|
27 # 2003-09-04 fl fall back on emulator if ElementPath is not installed |
|
28 # 2003-10-31 fl markup updates |
|
29 # 2003-11-15 fl fixed nested namespace bug |
|
30 # 2004-03-28 fl added XMLID helper |
|
31 # 2004-06-02 fl added default support to findtext |
|
32 # 2004-06-08 fl fixed encoding of non-ascii element/attribute names |
|
33 # 2004-08-23 fl take advantage of post-2.1 expat features |
|
34 # 2005-02-01 fl added iterparse implementation |
|
35 # 2005-03-02 fl fixed iterparse support for pre-2.2 versions |
|
36 # |
|
37 # Copyright (c) 1999-2005 by Fredrik Lundh. All rights reserved. |
|
38 # |
|
39 # fredrik@pythonware.com |
|
40 # http://www.pythonware.com |
|
41 # |
|
42 # -------------------------------------------------------------------- |
|
43 # The ElementTree toolkit is |
|
44 # |
|
45 # Copyright (c) 1999-2005 by Fredrik Lundh |
|
46 # |
|
47 # By obtaining, using, and/or copying this software and/or its |
|
48 # associated documentation, you agree that you have read, understood, |
|
49 # and will comply with the following terms and conditions: |
|
50 # |
|
51 # Permission to use, copy, modify, and distribute this software and |
|
52 # its associated documentation for any purpose and without fee is |
|
53 # hereby granted, provided that the above copyright notice appears in |
|
54 # all copies, and that both that copyright notice and this permission |
|
55 # notice appear in supporting documentation, and that the name of |
|
56 # Secret Labs AB or the author not be used in advertising or publicity |
|
57 # pertaining to distribution of the software without specific, written |
|
58 # prior permission. |
|
59 # |
|
60 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD |
|
61 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- |
|
62 # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR |
|
63 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY |
|
64 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, |
|
65 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS |
|
66 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE |
|
67 # OF THIS SOFTWARE. |
|
68 # -------------------------------------------------------------------- |
|
69 |
|
70 # Licensed to PSF under a Contributor Agreement. |
|
71 # See http://www.python.org/2.4/license for licensing details. |
|
72 |
|
73 __all__ = [ |
|
74 # public symbols |
|
75 "Comment", |
|
76 "dump", |
|
77 "Element", "ElementTree", |
|
78 "fromstring", |
|
79 "iselement", "iterparse", |
|
80 "parse", |
|
81 "PI", "ProcessingInstruction", |
|
82 "QName", |
|
83 "SubElement", |
|
84 "tostring", |
|
85 "TreeBuilder", |
|
86 "VERSION", "XML", |
|
87 "XMLParser", "XMLTreeBuilder", |
|
88 ] |
|
89 |
|
90 ## |
|
91 # The <b>Element</b> type is a flexible container object, designed to |
|
92 # store hierarchical data structures in memory. The type can be |
|
93 # described as a cross between a list and a dictionary. |
|
94 # <p> |
|
95 # Each element has a number of properties associated with it: |
|
96 # <ul> |
|
97 # <li>a <i>tag</i>. This is a string identifying what kind of data |
|
98 # this element represents (the element type, in other words).</li> |
|
99 # <li>a number of <i>attributes</i>, stored in a Python dictionary.</li> |
|
100 # <li>a <i>text</i> string.</li> |
|
101 # <li>an optional <i>tail</i> string.</li> |
|
102 # <li>a number of <i>child elements</i>, stored in a Python sequence</li> |
|
103 # </ul> |
|
104 # |
|
105 # To create an element instance, use the {@link #Element} or {@link |
|
106 # #SubElement} factory functions. |
|
107 # <p> |
|
108 # The {@link #ElementTree} class can be used to wrap an element |
|
109 # structure, and convert it from and to XML. |
|
110 ## |
|
111 |
|
112 import string, sys, re |
|
113 |
|
114 class _SimpleElementPath: |
|
115 # emulate pre-1.2 find/findtext/findall behaviour |
|
116 def find(self, element, tag): |
|
117 for elem in element: |
|
118 if elem.tag == tag: |
|
119 return elem |
|
120 return None |
|
121 def findtext(self, element, tag, default=None): |
|
122 for elem in element: |
|
123 if elem.tag == tag: |
|
124 return elem.text or "" |
|
125 return default |
|
126 def findall(self, element, tag): |
|
127 if tag[:3] == ".//": |
|
128 return element.getiterator(tag[3:]) |
|
129 result = [] |
|
130 for elem in element: |
|
131 if elem.tag == tag: |
|
132 result.append(elem) |
|
133 return result |
|
134 |
|
135 try: |
|
136 import ElementPath |
|
137 except ImportError: |
|
138 # FIXME: issue warning in this case? |
|
139 ElementPath = _SimpleElementPath() |
|
140 |
|
141 # TODO: add support for custom namespace resolvers/default namespaces |
|
142 # TODO: add improved support for incremental parsing |
|
143 |
|
144 VERSION = "1.2.6" |
|
145 |
|
146 ## |
|
147 # Internal element class. This class defines the Element interface, |
|
148 # and provides a reference implementation of this interface. |
|
149 # <p> |
|
150 # You should not create instances of this class directly. Use the |
|
151 # appropriate factory functions instead, such as {@link #Element} |
|
152 # and {@link #SubElement}. |
|
153 # |
|
154 # @see Element |
|
155 # @see SubElement |
|
156 # @see Comment |
|
157 # @see ProcessingInstruction |
|
158 |
|
159 class _ElementInterface: |
|
160 # <tag attrib>text<child/>...</tag>tail |
|
161 |
|
162 ## |
|
163 # (Attribute) Element tag. |
|
164 |
|
165 tag = None |
|
166 |
|
167 ## |
|
168 # (Attribute) Element attribute dictionary. Where possible, use |
|
169 # {@link #_ElementInterface.get}, |
|
170 # {@link #_ElementInterface.set}, |
|
171 # {@link #_ElementInterface.keys}, and |
|
172 # {@link #_ElementInterface.items} to access |
|
173 # element attributes. |
|
174 |
|
175 attrib = None |
|
176 |
|
177 ## |
|
178 # (Attribute) Text before first subelement. This is either a |
|
179 # string or the value None, if there was no text. |
|
180 |
|
181 text = None |
|
182 |
|
183 ## |
|
184 # (Attribute) Text after this element's end tag, but before the |
|
185 # next sibling element's start tag. This is either a string or |
|
186 # the value None, if there was no text. |
|
187 |
|
188 tail = None # text after end tag, if any |
|
189 |
|
190 def __init__(self, tag, attrib): |
|
191 self.tag = tag |
|
192 self.attrib = attrib |
|
193 self._children = [] |
|
194 |
|
195 def __repr__(self): |
|
196 return "<Element %s at %x>" % (self.tag, id(self)) |
|
197 |
|
198 ## |
|
199 # Creates a new element object of the same type as this element. |
|
200 # |
|
201 # @param tag Element tag. |
|
202 # @param attrib Element attributes, given as a dictionary. |
|
203 # @return A new element instance. |
|
204 |
|
205 def makeelement(self, tag, attrib): |
|
206 return Element(tag, attrib) |
|
207 |
|
208 ## |
|
209 # Returns the number of subelements. |
|
210 # |
|
211 # @return The number of subelements. |
|
212 |
|
213 def __len__(self): |
|
214 return len(self._children) |
|
215 |
|
216 ## |
|
217 # Returns the given subelement. |
|
218 # |
|
219 # @param index What subelement to return. |
|
220 # @return The given subelement. |
|
221 # @exception IndexError If the given element does not exist. |
|
222 |
|
223 def __getitem__(self, index): |
|
224 return self._children[index] |
|
225 |
|
226 ## |
|
227 # Replaces the given subelement. |
|
228 # |
|
229 # @param index What subelement to replace. |
|
230 # @param element The new element value. |
|
231 # @exception IndexError If the given element does not exist. |
|
232 # @exception AssertionError If element is not a valid object. |
|
233 |
|
234 def __setitem__(self, index, element): |
|
235 assert iselement(element) |
|
236 self._children[index] = element |
|
237 |
|
238 ## |
|
239 # Deletes the given subelement. |
|
240 # |
|
241 # @param index What subelement to delete. |
|
242 # @exception IndexError If the given element does not exist. |
|
243 |
|
244 def __delitem__(self, index): |
|
245 del self._children[index] |
|
246 |
|
247 ## |
|
248 # Returns a list containing subelements in the given range. |
|
249 # |
|
250 # @param start The first subelement to return. |
|
251 # @param stop The first subelement that shouldn't be returned. |
|
252 # @return A sequence object containing subelements. |
|
253 |
|
254 def __getslice__(self, start, stop): |
|
255 return self._children[start:stop] |
|
256 |
|
257 ## |
|
258 # Replaces a number of subelements with elements from a sequence. |
|
259 # |
|
260 # @param start The first subelement to replace. |
|
261 # @param stop The first subelement that shouldn't be replaced. |
|
262 # @param elements A sequence object with zero or more elements. |
|
263 # @exception AssertionError If a sequence member is not a valid object. |
|
264 |
|
265 def __setslice__(self, start, stop, elements): |
|
266 for element in elements: |
|
267 assert iselement(element) |
|
268 self._children[start:stop] = list(elements) |
|
269 |
|
270 ## |
|
271 # Deletes a number of subelements. |
|
272 # |
|
273 # @param start The first subelement to delete. |
|
274 # @param stop The first subelement to leave in there. |
|
275 |
|
276 def __delslice__(self, start, stop): |
|
277 del self._children[start:stop] |
|
278 |
|
279 ## |
|
280 # Adds a subelement to the end of this element. |
|
281 # |
|
282 # @param element The element to add. |
|
283 # @exception AssertionError If a sequence member is not a valid object. |
|
284 |
|
285 def append(self, element): |
|
286 assert iselement(element) |
|
287 self._children.append(element) |
|
288 |
|
289 ## |
|
290 # Inserts a subelement at the given position in this element. |
|
291 # |
|
292 # @param index Where to insert the new subelement. |
|
293 # @exception AssertionError If the element is not a valid object. |
|
294 |
|
295 def insert(self, index, element): |
|
296 assert iselement(element) |
|
297 self._children.insert(index, element) |
|
298 |
|
299 ## |
|
300 # Removes a matching subelement. Unlike the <b>find</b> methods, |
|
301 # this method compares elements based on identity, not on tag |
|
302 # value or contents. |
|
303 # |
|
304 # @param element What element to remove. |
|
305 # @exception ValueError If a matching element could not be found. |
|
306 # @exception AssertionError If the element is not a valid object. |
|
307 |
|
308 def remove(self, element): |
|
309 assert iselement(element) |
|
310 self._children.remove(element) |
|
311 |
|
312 ## |
|
313 # Returns all subelements. The elements are returned in document |
|
314 # order. |
|
315 # |
|
316 # @return A list of subelements. |
|
317 # @defreturn list of Element instances |
|
318 |
|
319 def getchildren(self): |
|
320 return self._children |
|
321 |
|
322 ## |
|
323 # Finds the first matching subelement, by tag name or path. |
|
324 # |
|
325 # @param path What element to look for. |
|
326 # @return The first matching element, or None if no element was found. |
|
327 # @defreturn Element or None |
|
328 |
|
329 def find(self, path): |
|
330 return ElementPath.find(self, path) |
|
331 |
|
332 ## |
|
333 # Finds text for the first matching subelement, by tag name or path. |
|
334 # |
|
335 # @param path What element to look for. |
|
336 # @param default What to return if the element was not found. |
|
337 # @return The text content of the first matching element, or the |
|
338 # default value no element was found. Note that if the element |
|
339 # has is found, but has no text content, this method returns an |
|
340 # empty string. |
|
341 # @defreturn string |
|
342 |
|
343 def findtext(self, path, default=None): |
|
344 return ElementPath.findtext(self, path, default) |
|
345 |
|
346 ## |
|
347 # Finds all matching subelements, by tag name or path. |
|
348 # |
|
349 # @param path What element to look for. |
|
350 # @return A list or iterator containing all matching elements, |
|
351 # in document order. |
|
352 # @defreturn list of Element instances |
|
353 |
|
354 def findall(self, path): |
|
355 return ElementPath.findall(self, path) |
|
356 |
|
357 ## |
|
358 # Resets an element. This function removes all subelements, clears |
|
359 # all attributes, and sets the text and tail attributes to None. |
|
360 |
|
361 def clear(self): |
|
362 self.attrib.clear() |
|
363 self._children = [] |
|
364 self.text = self.tail = None |
|
365 |
|
366 ## |
|
367 # Gets an element attribute. |
|
368 # |
|
369 # @param key What attribute to look for. |
|
370 # @param default What to return if the attribute was not found. |
|
371 # @return The attribute value, or the default value, if the |
|
372 # attribute was not found. |
|
373 # @defreturn string or None |
|
374 |
|
375 def get(self, key, default=None): |
|
376 return self.attrib.get(key, default) |
|
377 |
|
378 ## |
|
379 # Sets an element attribute. |
|
380 # |
|
381 # @param key What attribute to set. |
|
382 # @param value The attribute value. |
|
383 |
|
384 def set(self, key, value): |
|
385 self.attrib[key] = value |
|
386 |
|
387 ## |
|
388 # Gets a list of attribute names. The names are returned in an |
|
389 # arbitrary order (just like for an ordinary Python dictionary). |
|
390 # |
|
391 # @return A list of element attribute names. |
|
392 # @defreturn list of strings |
|
393 |
|
394 def keys(self): |
|
395 return self.attrib.keys() |
|
396 |
|
397 ## |
|
398 # Gets element attributes, as a sequence. The attributes are |
|
399 # returned in an arbitrary order. |
|
400 # |
|
401 # @return A list of (name, value) tuples for all attributes. |
|
402 # @defreturn list of (string, string) tuples |
|
403 |
|
404 def items(self): |
|
405 return self.attrib.items() |
|
406 |
|
407 ## |
|
408 # Creates a tree iterator. The iterator loops over this element |
|
409 # and all subelements, in document order, and returns all elements |
|
410 # with a matching tag. |
|
411 # <p> |
|
412 # If the tree structure is modified during iteration, the result |
|
413 # is undefined. |
|
414 # |
|
415 # @param tag What tags to look for (default is to return all elements). |
|
416 # @return A list or iterator containing all the matching elements. |
|
417 # @defreturn list or iterator |
|
418 |
|
419 def getiterator(self, tag=None): |
|
420 nodes = [] |
|
421 if tag == "*": |
|
422 tag = None |
|
423 if tag is None or self.tag == tag: |
|
424 nodes.append(self) |
|
425 for node in self._children: |
|
426 nodes.extend(node.getiterator(tag)) |
|
427 return nodes |
|
428 |
|
429 # compatibility |
|
430 _Element = _ElementInterface |
|
431 |
|
432 ## |
|
433 # Element factory. This function returns an object implementing the |
|
434 # standard Element interface. The exact class or type of that object |
|
435 # is implementation dependent, but it will always be compatible with |
|
436 # the {@link #_ElementInterface} class in this module. |
|
437 # <p> |
|
438 # The element name, attribute names, and attribute values can be |
|
439 # either 8-bit ASCII strings or Unicode strings. |
|
440 # |
|
441 # @param tag The element name. |
|
442 # @param attrib An optional dictionary, containing element attributes. |
|
443 # @param **extra Additional attributes, given as keyword arguments. |
|
444 # @return An element instance. |
|
445 # @defreturn Element |
|
446 |
|
def Element(tag, attrib={}, **extra):
    # Merge into a private copy, so that neither the caller's dictionary
    # nor the shared default argument is ever modified.
    merged = attrib.copy()
    merged.update(extra)
    return _ElementInterface(tag, merged)
|
451 |
|
452 ## |
|
453 # Subelement factory. This function creates an element instance, and |
|
454 # appends it to an existing element. |
|
455 # <p> |
|
456 # The element name, attribute names, and attribute values can be |
|
457 # either 8-bit ASCII strings or Unicode strings. |
|
458 # |
|
459 # @param parent The parent element. |
|
460 # @param tag The subelement name. |
|
461 # @param attrib An optional dictionary, containing element attributes. |
|
462 # @param **extra Additional attributes, given as keyword arguments. |
|
463 # @return An element instance. |
|
464 # @defreturn Element |
|
465 |
|
def SubElement(parent, tag, attrib={}, **extra):
    # Keyword attributes are merged into a copy of the dictionary; the
    # parent's own makeelement factory then builds the child, which is
    # appended to the parent before being returned.
    merged = attrib.copy()
    merged.update(extra)
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
|
472 |
|
473 ## |
|
474 # Comment element factory. This factory function creates a special |
|
475 # element that will be serialized as an XML comment. |
|
476 # <p> |
|
477 # The comment string can be either an 8-bit ASCII string or a Unicode |
|
478 # string. |
|
479 # |
|
480 # @param text A string containing the comment string. |
|
481 # @return An element instance, representing a comment. |
|
482 # @defreturn Element |
|
483 |
|
def Comment(text=None):
    # The factory function itself is used as the tag; the serializer
    # recognizes comment elements by comparing the tag against it.
    comment = Element(Comment)
    comment.text = text
    return comment
|
488 |
|
489 ## |
|
490 # PI element factory. This factory function creates a special element |
|
491 # that will be serialized as an XML processing instruction. |
|
492 # |
|
493 # @param target A string containing the PI target. |
|
494 # @param text A string containing the PI contents, if any. |
|
495 # @return An element instance, representing a PI. |
|
496 # @defreturn Element |
|
497 |
|
def ProcessingInstruction(target, text=None):
    # The factory function itself is used as the tag.  The element text
    # holds the target, followed by a space and the contents when
    # contents are given.
    pi = Element(ProcessingInstruction)
    if text:
        pi.text = target + " " + text
    else:
        pi.text = target
    return pi
|
504 |
|
505 PI = ProcessingInstruction |
|
506 |
|
507 ## |
|
508 # QName wrapper. This can be used to wrap a QName attribute value, in |
|
509 # order to get proper namespace handling on output. |
|
510 # |
|
511 # @param text A string containing the QName value, in the form {uri}local, |
|
512 # or, if the tag argument is given, the URI part of a QName. |
|
513 # @param tag Optional tag. If given, the first argument is interpreted as |
|
514 # an URI, and this argument is interpreted as a local name. |
|
515 # @return An opaque object, representing the QName. |
|
516 |
|
class QName:
    # Wraps a qualified name stored in "{uri}local" form in the text
    # attribute.  Hashing and comparison both go through that text, so a
    # QName compares with other QNames and with plain strings alike.

    def __init__(self, text_or_uri, tag=None):
        # With a tag, the first argument is the URI part; without one it
        # is stored unchanged.
        if tag:
            self.text = "{%s}%s" % (text_or_uri, tag)
        else:
            self.text = text_or_uri

    def __str__(self):
        return self.text

    def __hash__(self):
        return hash(self.text)

    def __cmp__(self, other):
        if isinstance(other, QName):
            other = other.text
        return cmp(self.text, other)
|
530 |
|
531 ## |
|
532 # ElementTree wrapper class. This class represents an entire element |
|
533 # hierarchy, and adds some extra support for serialization to and from |
|
534 # standard XML. |
|
535 # |
|
536 # @param element Optional root element. |
|
537 # @keyparam file Optional file handle or name. If given, the |
|
538 # tree is initialized with the contents of this XML file. |
|
539 |
|
class ElementTree:

    def __init__(self, element=None, file=None):
        assert element is None or iselement(element)
        self._root = element # first node
        if file:
            self.parse(file)

    ##
    # Gets the root element for this tree.
    #
    # @return An element instance.
    # @defreturn Element

    def getroot(self):
        return self._root

    ##
    # Replaces the root element for this tree.  This discards the
    # current contents of the tree, and replaces it with the given
    # element.  Use with care.
    #
    # @param element An element instance.

    def _setroot(self, element):
        assert iselement(element)
        self._root = element

    ##
    # Loads an external XML document into this element tree.
    # <p>
    # If a file name is given, the file is opened here and closed again
    # before this method returns (also when parsing fails).  A file
    # object passed in by the caller is left open.
    #
    # @param source A file name or file object.
    # @param parser An optional parser instance.  If not given, the
    #     standard {@link XMLTreeBuilder} parser is used.
    # @return The document root element.
    # @defreturn Element

    def parse(self, source, parser=None):
        opened_here = not hasattr(source, "read")
        if opened_here:
            source = open(source, "rb")
        try:
            if not parser:
                parser = XMLTreeBuilder()
            # feed the parser in fixed-size chunks until the source is
            # exhausted
            while 1:
                data = source.read(32768)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            # only close what this method opened itself
            if opened_here:
                source.close()

    ##
    # Creates a tree iterator for the root element.  The iterator loops
    # over all elements in this tree, in document order.
    #
    # @param tag What tags to look for (default is to return all elements)
    # @return An iterator.
    # @defreturn iterator

    def getiterator(self, tag=None):
        assert self._root is not None
        return self._root.getiterator(tag)

    ##
    # Finds the first toplevel element with given tag.
    # Same as getroot().find(path).
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        assert self._root is not None
        # an absolute path is evaluated relative to the root element
        if path[:1] == "/":
            path = "." + path
        return self._root.find(path)

    ##
    # Finds the element text for the first toplevel element with given
    # tag.  Same as getroot().findtext(path).
    #
    # @param path What toplevel element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findtext(path, default)

    ##
    # Finds all toplevel elements with the given tag.
    # Same as getroot().findall(path).
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findall(path)

    ##
    # Writes the element tree to a file, as XML.
    # <p>
    # If a file name is given, the file is opened here and closed again
    # before this method returns (also when serialization fails).  A
    # file object passed in by the caller is left open.
    #
    # @param file A file name, or a file object opened for writing.
    # @param encoding Optional output encoding (default is US-ASCII).

    def write(self, file, encoding="us-ascii"):
        assert self._root is not None
        opened_here = not hasattr(file, "write")
        if opened_here:
            file = open(file, "wb")
        try:
            if not encoding:
                encoding = "us-ascii"
            elif encoding != "utf-8" and encoding != "us-ascii":
                # an XML declaration is only written for encodings other
                # than utf-8 and us-ascii
                file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
            self._write(file, self._root, encoding, {})
        finally:
            # only close what this method opened itself
            if opened_here:
                file.close()

    def _write(self, file, node, encoding, namespaces):
        # Recursively write a node, its subtree and its tail to file.
        # namespaces maps namespace URIs to the prefixes currently in
        # scope; entries added for this node are removed again once the
        # node has been written.
        tag = node.tag
        if tag is Comment:
            file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
        elif tag is ProcessingInstruction:
            file.write("<?%s?>" % _escape_cdata(node.text, encoding))
        else:
            items = node.items()
            xmlns_items = [] # new namespaces in this scope
            try:
                if isinstance(tag, QName) or tag[:1] == "{":
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns: xmlns_items.append(xmlns)
            except TypeError:
                _raise_serialization_error(tag)
            file.write("<" + _encode(tag, encoding))
            if items or xmlns_items:
                items.sort() # lexical order
                for k, v in items:
                    try:
                        if isinstance(k, QName) or k[:1] == "{":
                            k, xmlns = fixtag(k, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(k)
                    try:
                        if isinstance(v, QName):
                            v, xmlns = fixtag(v, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(v)
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
                # namespace declarations follow the ordinary attributes
                for k, v in xmlns_items:
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
            if node.text or len(node):
                file.write(">")
                if node.text:
                    file.write(_escape_cdata(node.text, encoding))
                for n in node:
                    self._write(file, n, encoding, namespaces)
                file.write("</" + _encode(tag, encoding) + ">")
            else:
                file.write(" />")
            # the declarations written above go out of scope here
            for k, v in xmlns_items:
                del namespaces[v]
        if node.tail:
            file.write(_escape_cdata(node.tail, encoding))
|
715 |
|
716 # -------------------------------------------------------------------- |
|
717 # helpers |
|
718 |
|
719 ## |
|
720 # Checks if an object appears to be a valid element object. |
|
721 # |
|
722 # @param element An element instance. |
|
723 # @return A true value if this is an element object. |
|
724 # @defreturn flag |
|
725 |
|
def iselement(element):
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    # Anything that is an _ElementInterface instance, or that merely has
    # a "tag" attribute, is accepted.
    is_native = isinstance(element, _ElementInterface)
    return is_native or hasattr(element, "tag")
|
730 |
|
731 ## |
|
732 # Writes an element tree or element structure to sys.stdout. This |
|
733 # function should be used for debugging only. |
|
734 # <p> |
|
735 # The exact output format is implementation dependent. In this |
|
736 # version, it's written as an ordinary XML file. |
|
737 # |
|
738 # @param elem An element tree or an individual element. |
|
739 |
|
def dump(elem):
    # Debugging aid: a bare element is wrapped in a tree first, the tree
    # is written to sys.stdout, and a newline is added unless the root's
    # tail already ends with one.
    tree = elem
    if not isinstance(tree, ElementTree):
        tree = ElementTree(tree)
    tree.write(sys.stdout)
    trailing = tree.getroot().tail
    if not (trailing and trailing[-1] == "\n"):
        sys.stdout.write("\n")
|
748 |
|
749 def _encode(s, encoding): |
|
750 try: |
|
751 return s.encode(encoding) |
|
752 except AttributeError: |
|
753 return s # 1.5.2: assume the string uses the right encoding |
|
754 |
|
755 if sys.version[:3] == "1.5": |
|
756 _escape = re.compile(r"[&<>\"\x80-\xff]+") # 1.5.2 |
|
757 else: |
|
758 _escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"')) |
|
759 |
|
760 _escape_map = { |
|
761 "&": "&", |
|
762 "<": "<", |
|
763 ">": ">", |
|
764 '"': """, |
|
765 } |
|
766 |
|
# Maps namespace URIs to preferred prefixes.  fixtag consults this table
# for a URI that has no prefix in the current scope, before it falls back
# on a generated "ns<number>" prefix.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
}
|
774 |
|
775 def _raise_serialization_error(text): |
|
776 raise TypeError( |
|
777 "cannot serialize %r (type %s)" % (text, type(text).__name__) |
|
778 ) |
|
779 |
|
def _encode_entity(text, pattern=_escape):
    # Replace every character matched by pattern with an entity: the
    # named one from _escape_map where available, a numeric character
    # reference otherwise.  The result is then encoded as ascii.  A
    # TypeError along the way is reported as a serialization error.
    def escape_entities(m, map=_escape_map):
        pieces = []
        for char in m.group():
            replacement = map.get(char)
            if replacement is None:
                replacement = "&#%d;" % ord(char)
            pieces.append(replacement)
        return string.join(pieces, "")
    try:
        substituted = pattern.sub(escape_entities, text)
        return _encode(substituted, "ascii")
    except TypeError:
        _raise_serialization_error(text)
|
795 |
|
796 # |
|
797 # the following functions assume an ascii-compatible encoding |
|
798 # (or "utf-16") |
|
799 |
|
800 def _escape_cdata(text, encoding=None, replace=string.replace): |
|
801 # escape character data |
|
802 try: |
|
803 if encoding: |
|
804 try: |
|
805 text = _encode(text, encoding) |
|
806 except UnicodeError: |
|
807 return _encode_entity(text) |
|
808 text = replace(text, "&", "&") |
|
809 text = replace(text, "<", "<") |
|
810 text = replace(text, ">", ">") |
|
811 return text |
|
812 except (TypeError, AttributeError): |
|
813 _raise_serialization_error(text) |
|
814 |
|
815 def _escape_attrib(text, encoding=None, replace=string.replace): |
|
816 # escape attribute value |
|
817 try: |
|
818 if encoding: |
|
819 try: |
|
820 text = _encode(text, encoding) |
|
821 except UnicodeError: |
|
822 return _encode_entity(text) |
|
823 text = replace(text, "&", "&") |
|
824 text = replace(text, "'", "'") # FIXME: overkill |
|
825 text = replace(text, "\"", """) |
|
826 text = replace(text, "<", "<") |
|
827 text = replace(text, ">", ">") |
|
828 return text |
|
829 except (TypeError, AttributeError): |
|
830 _raise_serialization_error(text) |
|
831 |
|
def fixtag(tag, namespaces):
    # Turn a decorated tag (of the form {uri}tag, or a QName wrapping
    # one) into a "prefix:tag" string.  Returns that string together
    # with an ("xmlns:prefix", uri) declaration if the URI was not yet
    # in namespaces, or None if no declaration is needed.  A newly used
    # URI is recorded in namespaces as a side effect.
    if isinstance(tag, QName):
        tag = tag.text
    namespace_uri, local = string.split(tag[1:], "}", 1)
    declaration = None
    prefix = namespaces.get(namespace_uri)
    if prefix is None:
        # not in scope yet: use the well-known prefix if there is one,
        # otherwise generate a numbered one
        prefix = _namespace_map.get(namespace_uri)
        if prefix is None:
            prefix = "ns%d" % len(namespaces)
        namespaces[namespace_uri] = prefix
        # the "xml" prefix never gets a declaration
        if prefix != "xml":
            declaration = ("xmlns:%s" % prefix, namespace_uri)
    return "%s:%s" % (prefix, local), declaration
|
851 |
|
852 ## |
|
853 # Parses an XML document into an element tree. |
|
854 # |
|
855 # @param source A filename or file object containing XML data. |
|
856 # @param parser An optional parser instance. If not given, the |
|
857 # standard {@link XMLTreeBuilder} parser is used. |
|
858 # @return An ElementTree instance |
|
859 |
|
def parse(source, parser=None):
    # Build an empty tree, let it load the document, and hand back the
    # tree itself (not the root element).
    document = ElementTree()
    document.parse(source, parser)
    return document
|
864 |
|
865 ## |
|
866 # Parses an XML document into an element tree incrementally, and reports |
|
867 # what's going on to the user. |
|
868 # |
|
869 # @param source A filename or file object containing XML data. |
|
870 # @param events A list of events to report back. If omitted, only "end" |
|
871 # events are reported. |
|
872 # @return A (event, elem) iterator. |
|
873 |
|
874 class iterparse: |
|
875 |
|
    def __init__(self, source, events=None):
        # Prepare incremental parsing of source (a file name or an object
        # with a read method) and install one parser callback per
        # requested event.  events is a sequence of event names ("start",
        # "end", "start-ns", "end-ns"); other names are silently ignored,
        # and None selects "end" only.
        if not hasattr(source, "read"):
            # a file name was given; note that the file opened here is
            # not closed anywhere in this method
            source = open(source, "rb")
        self._file = source
        self._events = []
        self._index = 0
        self.root = self._root = None
        self._parser = XMLTreeBuilder()
        # wire up the parser for event reporting
        # NOTE(review): _parser._parser is presumably the underlying expat
        # parser object -- XMLTreeBuilder is defined outside this view
        parser = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        # Each handler receives the values it needs (event name, append,
        # builder method) as default arguments, which are bound when the
        # handler is defined.
        for event in events:
            if event == "start":
                try:
                    parser.ordered_attributes = 1
                    parser.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
                except AttributeError:
                    # an attribute used above is not available; use the
                    # builder's _start method instead of _start_list
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    # report the URI in ascii-encoded form when possible,
                    # unchanged otherwise
                    try:
                        uri = _encode(uri, "ascii")
                    except UnicodeError:
                        pass
                    append((event, (prefix or "", uri)))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    # the prefix is not reported; the payload is None
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler
|
920 |
|
921 def next(self): |
|
922 while 1: |
|
923 try: |
|
924 item = self._events[self._index] |
|
925 except IndexError: |
|
926 if self._parser is None: |
|
927 self.root = self._root |
|
928 try: |
|
929 raise StopIteration |
|
930 except NameError: |
|
931 raise IndexError |
|
932 # load event buffer |
|
933 del self._events[:] |
|
934 self._index = 0 |
|
935 data = self._file.read(16384) |
|
936 if data: |
|
937 self._parser.feed(data) |
|
938 else: |
|
939 self._root = self._parser.close() |
|
940 self._parser = None |
|
941 else: |
|
942 self._index = self._index + 1 |
|
943 return item |
|
944 |
|
945 try: |
|
946 iter |
|
947 def __iter__(self): |
|
948 return self |
|
949 except NameError: |
|
950 def __getitem__(self, index): |
|
951 return self.next() |
|
952 |
|
953 ## |
|
954 # Parses an XML document from a string constant. This function can |
|
955 # be used to embed "XML literals" in Python code. |
|
956 # |
|
957 # @param text A string containing XML data. |
|
958 # @return An Element instance. |
|
959 # @defreturn Element |
|
960 |
|
def XML(text):
    # Feeds the complete string to a fresh XMLTreeBuilder, and returns
    # whatever the builder's close method produces.
    builder = XMLTreeBuilder()
    builder.feed(text)
    return builder.close()
|
965 |
|
966 ## |
|
967 # Parses an XML document from a string constant, and also returns |
|
968 # a dictionary which maps from element id:s to elements. |
|
969 # |
|
970 # @param text A string containing XML data. |
|
971 # @return A tuple containing an Element instance and a dictionary. |
|
972 # @defreturn (Element, dictionary) |
|
973 |
|
def XMLID(text):
    # Parses the string with a fresh XMLTreeBuilder, then walks the
    # result and records every element that has a non-empty "id"
    # attribute.  Returns a (root, dictionary) tuple.  If the same id
    # value is seen more than once, the element visited last wins.
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    by_id = {}
    for node in root.getiterator():
        key = node.get("id")
        if key:
            by_id[key] = node
    return root, by_id
|
984 |
|
985 ## |
|
986 # Parses an XML document from a string constant. Same as {@link #XML}. |
|
987 # |
|
988 # @def fromstring(text) |
|
989 # @param text A string containing XML data. |
|
990 # @return An Element instance. |
|
991 # @defreturn Element |
|
992 |
|
# module-level alias; fromstring is the very same function object as XML
fromstring = XML
|
994 |
|
995 ## |
|
996 # Generates a string representation of an XML element, including all |
|
997 # subelements. |
|
998 # |
|
999 # @param element An Element instance. |
|
1000 # @return An encoded string containing the XML data. |
|
1001 # @defreturn string |
|
1002 |
|
def tostring(element, encoding=None):
    # Serializes the element by wrapping it in an ElementTree and
    # calling its write method with an in-memory sink.  The encoding
    # argument is handed to write unchanged.  Returns the collected
    # output fragments joined into a single string.
    class _collector:
        # plain object; only used to carry a "write" attribute
        pass
    chunks = []
    sink = _collector()
    sink.write = chunks.append
    ElementTree(element).write(sink, encoding)
    return string.join(chunks, "")
|
1011 |
|
1012 ## |
|
1013 # Generic element structure builder. This builder converts a sequence |
|
1014 # of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link |
|
1015 # #TreeBuilder.end} method calls to a well-formed element structure. |
|
1016 # <p> |
|
1017 # You can use this class to build an element structure using a custom XML |
|
1018 # parser, or a parser for some other XML-like format. |
|
1019 # |
|
1020 # @param element_factory Optional element factory. This factory |
|
1021 # is called to create new Element instances, as necessary. |
|
1022 |
|
class TreeBuilder:

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = _ElementInterface
        self._factory = element_factory

    ##
    # Flushes the parser buffers, and returns the toplevel document
    # element.
    #
    # @return An Element instance.
    # @defreturn Element
    # @exception AssertionError If there are unclosed elements, or if
    #     no element was ever started.

    def close(self):
        assert not self._elem, "missing end tags"
        # identity test; "!= None" would call the element's own
        # comparison hooks, which a custom factory may have overridden
        assert self._last is not None, "missing toplevel element"
        return self._last

    # Moves collected character data to the text attribute (before
    # the first end tag) or tail attribute (after an end tag) of the
    # most recently touched element.  Data collected before any
    # element has been started is dropped.
    def _flush(self):
        if self._data:
            if self._last is not None:
                text = string.join(self._data, "")
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        self._data.append(data)

    ##
    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # attach to the innermost open element
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element
    # @exception AssertionError If the tag does not match the element
    #     being closed.

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last
|
1099 |
|
1100 ## |
|
1101 # Element structure builder for XML source data, based on the |
|
1102 # <b>expat</b> parser. |
|
1103 # |
|
1104 # @keyparam target Target object. If omitted, the builder uses an |
|
1105 # instance of the standard {@link #TreeBuilder} class. |
|
1106 # @keyparam html Predefine HTML entities. This flag is not supported |
|
1107 # by the current implementation. |
|
1108 # @see #ElementTree |
|
1109 # @see #TreeBuilder |
|
1110 |
|
class XMLTreeBuilder:

    def __init__(self, html=0, target=None):
        # note: the html argument is accepted, but not used
        try:
            from xml.parsers import expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
        # second argument is the namespace separator; see _fixname
        self._parser = parser = expat.ParserCreate(None, "}")
        if target is None:
            target = TreeBuilder()
        self._target = target
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            parser.ordered_attributes = 1
            parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None # list of doctype tokens while inside one
        self.entity = {} # entity name -> replacement text

    def _fixtext(self, text):
        # convert text string to ascii, if possible
        try:
            return _encode(text, "ascii")
        except UnicodeError:
            return text

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                # "uri}local" -> "{uri}local"
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    def _start(self, tag, attrib_in):
        # start handler for dictionary-style attributes
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = self._fixtext(value)
        return self._target.start(tag, attrib)

    def _start_list(self, tag, attrib_in):
        # start handler for attributes passed as a flat
        # [name, value, name, value, ...] sequence
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i+1])
        return self._target.start(tag, attrib)

    def _data(self, text):
        return self._target.data(self._fixtext(text))

    def _end(self, tag):
        return self._target.end(self._fixname(tag))

    def _default(self, text):
        # Handles everything not covered by the other callbacks:
        # entity references are resolved via the entity dictionary,
        # and doctype declarations are collected token by token and
        # passed to the doctype method.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities.  only the lookup is
            # guarded, so a KeyError raised by the target itself is
            # not mistaken for an undefined entity.
            try:
                replacement = self.entity[text[1:-1]]
            except KeyError:
                from xml.parsers import expat
                raise expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                    self._parser.ErrorColumnNumber)
                    )
            self._target.data(replacement)
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = string.strip(text)
            if not text:
                return
            self._doctype.append(text)
            count = len(self._doctype)
            if count > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and count == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and count == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    # not enough tokens yet (or an unknown form)
                    return
                if pubid:
                    pubid = pubid[1:-1] # drop first and last character
                self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Handles a doctype declaration.  The default implementation does
    # nothing.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        pass

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        self._parser.Parse(data, 0)

    ##
    # Finishes feeding data to the parser.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        self._parser.Parse("", 1) # end of data
        tree = self._target.close()
        del self._target, self._parser # get rid of circular references
        return tree
|
1258 |
|
# compatibility: XMLParser is a second name for the XMLTreeBuilder class
XMLParser = XMLTreeBuilder