symbian-qemu-0.9.1-12/python-win32-2.6.1/lib/xml/dom/expatbuilder.py
changeset 1 2fb8b9db1c86
equal deleted inserted replaced
0:ffa851df0825 1:2fb8b9db1c86
       
     1 """Facility to use the Expat parser to load a minidom instance
       
     2 from a string or file.
       
     3 
       
     4 This avoids all the overhead of SAX and pulldom to gain performance.
       
     5 """
       
     6 
       
     7 # Warning!
       
     8 #
       
     9 # This module is tightly bound to the implementation details of the
       
    10 # minidom DOM and can't be used with other DOM implementations.  This
       
    11 # is due, in part, to a lack of appropriate methods in the DOM (there is
       
    12 # no way to create Entity and Notation nodes via the DOM Level 2
       
     13 # interface), and for performance.  The latter is the cause of some fairly
       
    14 # cryptic code.
       
    15 #
       
    16 # Performance hacks:
       
    17 #
       
    18 #   -  .character_data_handler() has an extra case in which continuing
       
    19 #      data is appended to an existing Text node; this can be a
       
    20 #      speedup since pyexpat can break up character data into multiple
       
    21 #      callbacks even though we set the buffer_text attribute on the
       
    22 #      parser.  This also gives us the advantage that we don't need a
       
    23 #      separate normalization pass.
       
    24 #
       
    25 #   -  Determining that a node exists is done using an identity comparison
       
    26 #      with None rather than a truth test; this avoids searching for and
       
    27 #      calling any methods on the node object if it exists.  (A rather
       
    28 #      nice speedup is achieved this way as well!)
       
    29 
       
    30 from xml.dom import xmlbuilder, minidom, Node
       
    31 from xml.dom import EMPTY_NAMESPACE, EMPTY_PREFIX, XMLNS_NAMESPACE
       
    32 from xml.parsers import expat
       
    33 from xml.dom.minidom import _append_child, _set_attribute_node
       
    34 from xml.dom.NodeFilter import NodeFilter
       
    35 
       
    36 from xml.dom.minicompat import *
       
    37 
       
    38 TEXT_NODE = Node.TEXT_NODE
       
    39 CDATA_SECTION_NODE = Node.CDATA_SECTION_NODE
       
    40 DOCUMENT_NODE = Node.DOCUMENT_NODE
       
    41 
       
    42 FILTER_ACCEPT = xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT
       
    43 FILTER_REJECT = xmlbuilder.DOMBuilderFilter.FILTER_REJECT
       
    44 FILTER_SKIP = xmlbuilder.DOMBuilderFilter.FILTER_SKIP
       
    45 FILTER_INTERRUPT = xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT
       
    46 
       
    47 theDOMImplementation = minidom.getDOMImplementation()
       
    48 
       
    49 # Expat typename -> TypeInfo
       
    50 _typeinfo_map = {
       
    51     "CDATA":    minidom.TypeInfo(None, "cdata"),
       
    52     "ENUM":     minidom.TypeInfo(None, "enumeration"),
       
    53     "ENTITY":   minidom.TypeInfo(None, "entity"),
       
    54     "ENTITIES": minidom.TypeInfo(None, "entities"),
       
    55     "ID":       minidom.TypeInfo(None, "id"),
       
    56     "IDREF":    minidom.TypeInfo(None, "idref"),
       
    57     "IDREFS":   minidom.TypeInfo(None, "idrefs"),
       
    58     "NMTOKEN":  minidom.TypeInfo(None, "nmtoken"),
       
    59     "NMTOKENS": minidom.TypeInfo(None, "nmtokens"),
       
    60     }
       
    61 
       
    62 class ElementInfo(object):
       
    63     __slots__ = '_attr_info', '_model', 'tagName'
       
    64 
       
    65     def __init__(self, tagName, model=None):
       
    66         self.tagName = tagName
       
    67         self._attr_info = []
       
    68         self._model = model
       
    69 
       
    70     def __getstate__(self):
       
    71         return self._attr_info, self._model, self.tagName
       
    72 
       
    73     def __setstate__(self, state):
       
    74         self._attr_info, self._model, self.tagName = state
       
    75 
       
    76     def getAttributeType(self, aname):
       
    77         for info in self._attr_info:
       
    78             if info[1] == aname:
       
    79                 t = info[-2]
       
    80                 if t[0] == "(":
       
    81                     return _typeinfo_map["ENUM"]
       
    82                 else:
       
    83                     return _typeinfo_map[info[-2]]
       
    84         return minidom._no_type
       
    85 
       
    86     def getAttributeTypeNS(self, namespaceURI, localName):
       
    87         return minidom._no_type
       
    88 
       
    89     def isElementContent(self):
       
    90         if self._model:
       
    91             type = self._model[0]
       
    92             return type not in (expat.model.XML_CTYPE_ANY,
       
    93                                 expat.model.XML_CTYPE_MIXED)
       
    94         else:
       
    95             return False
       
    96 
       
    97     def isEmpty(self):
       
    98         if self._model:
       
    99             return self._model[0] == expat.model.XML_CTYPE_EMPTY
       
   100         else:
       
   101             return False
       
   102 
       
   103     def isId(self, aname):
       
   104         for info in self._attr_info:
       
   105             if info[1] == aname:
       
   106                 return info[-2] == "ID"
       
   107         return False
       
   108 
       
   109     def isIdNS(self, euri, ename, auri, aname):
       
   110         # not sure this is meaningful
       
   111         return self.isId((auri, aname))
       
   112 
       
   113 def _intern(builder, s):
       
   114     return builder._intern_setdefault(s, s)
       
   115 
       
   116 def _parse_ns_name(builder, name):
       
   117     assert ' ' in name
       
   118     parts = name.split(' ')
       
   119     intern = builder._intern_setdefault
       
   120     if len(parts) == 3:
       
   121         uri, localname, prefix = parts
       
   122         prefix = intern(prefix, prefix)
       
   123         qname = "%s:%s" % (prefix, localname)
       
   124         qname = intern(qname, qname)
       
   125         localname = intern(localname, localname)
       
   126     else:
       
   127         uri, localname = parts
       
   128         prefix = EMPTY_PREFIX
       
   129         qname = localname = intern(localname, localname)
       
   130     return intern(uri, uri), localname, prefix, qname
       
   131 
       
   132 
       
   133 class ExpatBuilder:
       
   134     """Document builder that uses Expat to build a ParsedXML.DOM document
       
   135     instance."""
       
   136 
       
   137     def __init__(self, options=None):
       
   138         if options is None:
       
   139             options = xmlbuilder.Options()
       
   140         self._options = options
       
   141         if self._options.filter is not None:
       
   142             self._filter = FilterVisibilityController(self._options.filter)
       
   143         else:
       
   144             self._filter = None
       
   145             # This *really* doesn't do anything in this case, so
       
   146             # override it with something fast & minimal.
       
   147             self._finish_start_element = id
       
   148         self._parser = None
       
   149         self.reset()
       
   150 
       
   151     def createParser(self):
       
   152         """Create a new parser object."""
       
   153         return expat.ParserCreate()
       
   154 
       
   155     def getParser(self):
       
   156         """Return the parser object, creating a new one if needed."""
       
   157         if not self._parser:
       
   158             self._parser = self.createParser()
       
   159             self._intern_setdefault = self._parser.intern.setdefault
       
   160             self._parser.buffer_text = True
       
   161             self._parser.ordered_attributes = True
       
   162             self._parser.specified_attributes = True
       
   163             self.install(self._parser)
       
   164         return self._parser
       
   165 
       
   166     def reset(self):
       
   167         """Free all data structures used during DOM construction."""
       
   168         self.document = theDOMImplementation.createDocument(
       
   169             EMPTY_NAMESPACE, None, None)
       
   170         self.curNode = self.document
       
   171         self._elem_info = self.document._elem_info
       
   172         self._cdata = False
       
   173 
       
   174     def install(self, parser):
       
   175         """Install the callbacks needed to build the DOM into the parser."""
       
   176         # This creates circular references!
       
   177         parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
       
   178         parser.StartElementHandler = self.first_element_handler
       
   179         parser.EndElementHandler = self.end_element_handler
       
   180         parser.ProcessingInstructionHandler = self.pi_handler
       
   181         if self._options.entities:
       
   182             parser.EntityDeclHandler = self.entity_decl_handler
       
   183         parser.NotationDeclHandler = self.notation_decl_handler
       
   184         if self._options.comments:
       
   185             parser.CommentHandler = self.comment_handler
       
   186         if self._options.cdata_sections:
       
   187             parser.StartCdataSectionHandler = self.start_cdata_section_handler
       
   188             parser.EndCdataSectionHandler = self.end_cdata_section_handler
       
   189             parser.CharacterDataHandler = self.character_data_handler_cdata
       
   190         else:
       
   191             parser.CharacterDataHandler = self.character_data_handler
       
   192         parser.ExternalEntityRefHandler = self.external_entity_ref_handler
       
   193         parser.XmlDeclHandler = self.xml_decl_handler
       
   194         parser.ElementDeclHandler = self.element_decl_handler
       
   195         parser.AttlistDeclHandler = self.attlist_decl_handler
       
   196 
       
   197     def parseFile(self, file):
       
   198         """Parse a document from a file object, returning the document
       
   199         node."""
       
   200         parser = self.getParser()
       
   201         first_buffer = True
       
   202         try:
       
   203             while 1:
       
   204                 buffer = file.read(16*1024)
       
   205                 if not buffer:
       
   206                     break
       
   207                 parser.Parse(buffer, 0)
       
   208                 if first_buffer and self.document.documentElement:
       
   209                     self._setup_subset(buffer)
       
   210                 first_buffer = False
       
   211             parser.Parse("", True)
       
   212         except ParseEscape:
       
   213             pass
       
   214         doc = self.document
       
   215         self.reset()
       
   216         self._parser = None
       
   217         return doc
       
   218 
       
   219     def parseString(self, string):
       
   220         """Parse a document from a string, returning the document node."""
       
   221         parser = self.getParser()
       
   222         try:
       
   223             parser.Parse(string, True)
       
   224             self._setup_subset(string)
       
   225         except ParseEscape:
       
   226             pass
       
   227         doc = self.document
       
   228         self.reset()
       
   229         self._parser = None
       
   230         return doc
       
   231 
       
   232     def _setup_subset(self, buffer):
       
   233         """Load the internal subset if there might be one."""
       
   234         if self.document.doctype:
       
   235             extractor = InternalSubsetExtractor()
       
   236             extractor.parseString(buffer)
       
   237             subset = extractor.getSubset()
       
   238             self.document.doctype.internalSubset = subset
       
   239 
       
   240     def start_doctype_decl_handler(self, doctypeName, systemId, publicId,
       
   241                                    has_internal_subset):
       
   242         doctype = self.document.implementation.createDocumentType(
       
   243             doctypeName, publicId, systemId)
       
   244         doctype.ownerDocument = self.document
       
   245         self.document.childNodes.append(doctype)
       
   246         self.document.doctype = doctype
       
   247         if self._filter and self._filter.acceptNode(doctype) == FILTER_REJECT:
       
   248             self.document.doctype = None
       
   249             del self.document.childNodes[-1]
       
   250             doctype = None
       
   251             self._parser.EntityDeclHandler = None
       
   252             self._parser.NotationDeclHandler = None
       
   253         if has_internal_subset:
       
   254             if doctype is not None:
       
   255                 doctype.entities._seq = []
       
   256                 doctype.notations._seq = []
       
   257             self._parser.CommentHandler = None
       
   258             self._parser.ProcessingInstructionHandler = None
       
   259             self._parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler
       
   260 
       
   261     def end_doctype_decl_handler(self):
       
   262         if self._options.comments:
       
   263             self._parser.CommentHandler = self.comment_handler
       
   264         self._parser.ProcessingInstructionHandler = self.pi_handler
       
   265         if not (self._elem_info or self._filter):
       
   266             self._finish_end_element = id
       
   267 
       
   268     def pi_handler(self, target, data):
       
   269         node = self.document.createProcessingInstruction(target, data)
       
   270         _append_child(self.curNode, node)
       
   271         if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
       
   272             self.curNode.removeChild(node)
       
   273 
       
   274     def character_data_handler_cdata(self, data):
       
   275         childNodes = self.curNode.childNodes
       
   276         if self._cdata:
       
   277             if (  self._cdata_continue
       
   278                   and childNodes[-1].nodeType == CDATA_SECTION_NODE):
       
   279                 childNodes[-1].appendData(data)
       
   280                 return
       
   281             node = self.document.createCDATASection(data)
       
   282             self._cdata_continue = True
       
   283         elif childNodes and childNodes[-1].nodeType == TEXT_NODE:
       
   284             node = childNodes[-1]
       
   285             value = node.data + data
       
   286             d = node.__dict__
       
   287             d['data'] = d['nodeValue'] = value
       
   288             return
       
   289         else:
       
   290             node = minidom.Text()
       
   291             d = node.__dict__
       
   292             d['data'] = d['nodeValue'] = data
       
   293             d['ownerDocument'] = self.document
       
   294         _append_child(self.curNode, node)
       
   295 
       
   296     def character_data_handler(self, data):
       
   297         childNodes = self.curNode.childNodes
       
   298         if childNodes and childNodes[-1].nodeType == TEXT_NODE:
       
   299             node = childNodes[-1]
       
   300             d = node.__dict__
       
   301             d['data'] = d['nodeValue'] = node.data + data
       
   302             return
       
   303         node = minidom.Text()
       
   304         d = node.__dict__
       
   305         d['data'] = d['nodeValue'] = node.data + data
       
   306         d['ownerDocument'] = self.document
       
   307         _append_child(self.curNode, node)
       
   308 
       
   309     def entity_decl_handler(self, entityName, is_parameter_entity, value,
       
   310                             base, systemId, publicId, notationName):
       
   311         if is_parameter_entity:
       
   312             # we don't care about parameter entities for the DOM
       
   313             return
       
   314         if not self._options.entities:
       
   315             return
       
   316         node = self.document._create_entity(entityName, publicId,
       
   317                                             systemId, notationName)
       
   318         if value is not None:
       
   319             # internal entity
       
   320             # node *should* be readonly, but we'll cheat
       
   321             child = self.document.createTextNode(value)
       
   322             node.childNodes.append(child)
       
   323         self.document.doctype.entities._seq.append(node)
       
   324         if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
       
   325             del self.document.doctype.entities._seq[-1]
       
   326 
       
   327     def notation_decl_handler(self, notationName, base, systemId, publicId):
       
   328         node = self.document._create_notation(notationName, publicId, systemId)
       
   329         self.document.doctype.notations._seq.append(node)
       
   330         if self._filter and self._filter.acceptNode(node) == FILTER_ACCEPT:
       
   331             del self.document.doctype.notations._seq[-1]
       
   332 
       
   333     def comment_handler(self, data):
       
   334         node = self.document.createComment(data)
       
   335         _append_child(self.curNode, node)
       
   336         if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
       
   337             self.curNode.removeChild(node)
       
   338 
       
   339     def start_cdata_section_handler(self):
       
   340         self._cdata = True
       
   341         self._cdata_continue = False
       
   342 
       
   343     def end_cdata_section_handler(self):
       
   344         self._cdata = False
       
   345         self._cdata_continue = False
       
   346 
       
   347     def external_entity_ref_handler(self, context, base, systemId, publicId):
       
   348         return 1
       
   349 
       
   350     def first_element_handler(self, name, attributes):
       
   351         if self._filter is None and not self._elem_info:
       
   352             self._finish_end_element = id
       
   353         self.getParser().StartElementHandler = self.start_element_handler
       
   354         self.start_element_handler(name, attributes)
       
   355 
       
   356     def start_element_handler(self, name, attributes):
       
   357         node = self.document.createElement(name)
       
   358         _append_child(self.curNode, node)
       
   359         self.curNode = node
       
   360 
       
   361         if attributes:
       
   362             for i in range(0, len(attributes), 2):
       
   363                 a = minidom.Attr(attributes[i], EMPTY_NAMESPACE,
       
   364                                  None, EMPTY_PREFIX)
       
   365                 value = attributes[i+1]
       
   366                 d = a.childNodes[0].__dict__
       
   367                 d['data'] = d['nodeValue'] = value
       
   368                 d = a.__dict__
       
   369                 d['value'] = d['nodeValue'] = value
       
   370                 d['ownerDocument'] = self.document
       
   371                 _set_attribute_node(node, a)
       
   372 
       
   373         if node is not self.document.documentElement:
       
   374             self._finish_start_element(node)
       
   375 
       
   376     def _finish_start_element(self, node):
       
   377         if self._filter:
       
   378             # To be general, we'd have to call isSameNode(), but this
       
   379             # is sufficient for minidom:
       
   380             if node is self.document.documentElement:
       
   381                 return
       
   382             filt = self._filter.startContainer(node)
       
   383             if filt == FILTER_REJECT:
       
   384                 # ignore this node & all descendents
       
   385                 Rejecter(self)
       
   386             elif filt == FILTER_SKIP:
       
   387                 # ignore this node, but make it's children become
       
   388                 # children of the parent node
       
   389                 Skipper(self)
       
   390             else:
       
   391                 return
       
   392             self.curNode = node.parentNode
       
   393             node.parentNode.removeChild(node)
       
   394             node.unlink()
       
   395 
       
   396     # If this ever changes, Namespaces.end_element_handler() needs to
       
   397     # be changed to match.
       
   398     #
       
   399     def end_element_handler(self, name):
       
   400         curNode = self.curNode
       
   401         self.curNode = curNode.parentNode
       
   402         self._finish_end_element(curNode)
       
   403 
       
   404     def _finish_end_element(self, curNode):
       
   405         info = self._elem_info.get(curNode.tagName)
       
   406         if info:
       
   407             self._handle_white_text_nodes(curNode, info)
       
   408         if self._filter:
       
   409             if curNode is self.document.documentElement:
       
   410                 return
       
   411             if self._filter.acceptNode(curNode) == FILTER_REJECT:
       
   412                 self.curNode.removeChild(curNode)
       
   413                 curNode.unlink()
       
   414 
       
   415     def _handle_white_text_nodes(self, node, info):
       
   416         if (self._options.whitespace_in_element_content
       
   417             or not info.isElementContent()):
       
   418             return
       
   419 
       
   420         # We have element type information and should remove ignorable
       
   421         # whitespace; identify for text nodes which contain only
       
   422         # whitespace.
       
   423         L = []
       
   424         for child in node.childNodes:
       
   425             if child.nodeType == TEXT_NODE and not child.data.strip():
       
   426                 L.append(child)
       
   427 
       
   428         # Remove ignorable whitespace from the tree.
       
   429         for child in L:
       
   430             node.removeChild(child)
       
   431 
       
   432     def element_decl_handler(self, name, model):
       
   433         info = self._elem_info.get(name)
       
   434         if info is None:
       
   435             self._elem_info[name] = ElementInfo(name, model)
       
   436         else:
       
   437             assert info._model is None
       
   438             info._model = model
       
   439 
       
   440     def attlist_decl_handler(self, elem, name, type, default, required):
       
   441         info = self._elem_info.get(elem)
       
   442         if info is None:
       
   443             info = ElementInfo(elem)
       
   444             self._elem_info[elem] = info
       
   445         info._attr_info.append(
       
   446             [None, name, None, None, default, 0, type, required])
       
   447 
       
   448     def xml_decl_handler(self, version, encoding, standalone):
       
   449         self.document.version = version
       
   450         self.document.encoding = encoding
       
   451         # This is still a little ugly, thanks to the pyexpat API. ;-(
       
   452         if standalone >= 0:
       
   453             if standalone:
       
   454                 self.document.standalone = True
       
   455             else:
       
   456                 self.document.standalone = False
       
   457 
       
   458 
       
   459 # Don't include FILTER_INTERRUPT, since that's checked separately
       
   460 # where allowed.
       
   461 _ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)
       
   462 
       
   463 class FilterVisibilityController(object):
       
   464     """Wrapper around a DOMBuilderFilter which implements the checks
       
   465     to make the whatToShow filter attribute work."""
       
   466 
       
   467     __slots__ = 'filter',
       
   468 
       
   469     def __init__(self, filter):
       
   470         self.filter = filter
       
   471 
       
   472     def startContainer(self, node):
       
   473         mask = self._nodetype_mask[node.nodeType]
       
   474         if self.filter.whatToShow & mask:
       
   475             val = self.filter.startContainer(node)
       
   476             if val == FILTER_INTERRUPT:
       
   477                 raise ParseEscape
       
   478             if val not in _ALLOWED_FILTER_RETURNS:
       
   479                 raise ValueError, \
       
   480                       "startContainer() returned illegal value: " + repr(val)
       
   481             return val
       
   482         else:
       
   483             return FILTER_ACCEPT
       
   484 
       
   485     def acceptNode(self, node):
       
   486         mask = self._nodetype_mask[node.nodeType]
       
   487         if self.filter.whatToShow & mask:
       
   488             val = self.filter.acceptNode(node)
       
   489             if val == FILTER_INTERRUPT:
       
   490                 raise ParseEscape
       
   491             if val == FILTER_SKIP:
       
   492                 # move all child nodes to the parent, and remove this node
       
   493                 parent = node.parentNode
       
   494                 for child in node.childNodes[:]:
       
   495                     parent.appendChild(child)
       
   496                 # node is handled by the caller
       
   497                 return FILTER_REJECT
       
   498             if val not in _ALLOWED_FILTER_RETURNS:
       
   499                 raise ValueError, \
       
   500                       "acceptNode() returned illegal value: " + repr(val)
       
   501             return val
       
   502         else:
       
   503             return FILTER_ACCEPT
       
   504 
       
   505     _nodetype_mask = {
       
   506         Node.ELEMENT_NODE:                NodeFilter.SHOW_ELEMENT,
       
   507         Node.ATTRIBUTE_NODE:              NodeFilter.SHOW_ATTRIBUTE,
       
   508         Node.TEXT_NODE:                   NodeFilter.SHOW_TEXT,
       
   509         Node.CDATA_SECTION_NODE:          NodeFilter.SHOW_CDATA_SECTION,
       
   510         Node.ENTITY_REFERENCE_NODE:       NodeFilter.SHOW_ENTITY_REFERENCE,
       
   511         Node.ENTITY_NODE:                 NodeFilter.SHOW_ENTITY,
       
   512         Node.PROCESSING_INSTRUCTION_NODE: NodeFilter.SHOW_PROCESSING_INSTRUCTION,
       
   513         Node.COMMENT_NODE:                NodeFilter.SHOW_COMMENT,
       
   514         Node.DOCUMENT_NODE:               NodeFilter.SHOW_DOCUMENT,
       
   515         Node.DOCUMENT_TYPE_NODE:          NodeFilter.SHOW_DOCUMENT_TYPE,
       
   516         Node.DOCUMENT_FRAGMENT_NODE:      NodeFilter.SHOW_DOCUMENT_FRAGMENT,
       
   517         Node.NOTATION_NODE:               NodeFilter.SHOW_NOTATION,
       
   518         }
       
   519 
       
   520 
       
   521 class FilterCrutch(object):
       
   522     __slots__ = '_builder', '_level', '_old_start', '_old_end'
       
   523 
       
   524     def __init__(self, builder):
       
   525         self._level = 0
       
   526         self._builder = builder
       
   527         parser = builder._parser
       
   528         self._old_start = parser.StartElementHandler
       
   529         self._old_end = parser.EndElementHandler
       
   530         parser.StartElementHandler = self.start_element_handler
       
   531         parser.EndElementHandler = self.end_element_handler
       
   532 
       
   533 class Rejecter(FilterCrutch):
       
   534     __slots__ = ()
       
   535 
       
   536     def __init__(self, builder):
       
   537         FilterCrutch.__init__(self, builder)
       
   538         parser = builder._parser
       
   539         for name in ("ProcessingInstructionHandler",
       
   540                      "CommentHandler",
       
   541                      "CharacterDataHandler",
       
   542                      "StartCdataSectionHandler",
       
   543                      "EndCdataSectionHandler",
       
   544                      "ExternalEntityRefHandler",
       
   545                      ):
       
   546             setattr(parser, name, None)
       
   547 
       
   548     def start_element_handler(self, *args):
       
   549         self._level = self._level + 1
       
   550 
       
   551     def end_element_handler(self, *args):
       
   552         if self._level == 0:
       
   553             # restore the old handlers
       
   554             parser = self._builder._parser
       
   555             self._builder.install(parser)
       
   556             parser.StartElementHandler = self._old_start
       
   557             parser.EndElementHandler = self._old_end
       
   558         else:
       
   559             self._level = self._level - 1
       
   560 
       
   561 class Skipper(FilterCrutch):
       
   562     __slots__ = ()
       
   563 
       
   564     def start_element_handler(self, *args):
       
   565         node = self._builder.curNode
       
   566         self._old_start(*args)
       
   567         if self._builder.curNode is not node:
       
   568             self._level = self._level + 1
       
   569 
       
   570     def end_element_handler(self, *args):
       
   571         if self._level == 0:
       
   572             # We're popping back out of the node we're skipping, so we
       
   573             # shouldn't need to do anything but reset the handlers.
       
   574             self._builder._parser.StartElementHandler = self._old_start
       
   575             self._builder._parser.EndElementHandler = self._old_end
       
   576             self._builder = None
       
   577         else:
       
   578             self._level = self._level - 1
       
   579             self._old_end(*args)
       
   580 
       
   581 
       
   582 # framework document used by the fragment builder.
       
   583 # Takes a string for the doctype, subset string, and namespace attrs string.
       
   584 
       
   585 _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = \
       
   586     "http://xml.python.org/entities/fragment-builder/internal"
       
   587 
       
   588 _FRAGMENT_BUILDER_TEMPLATE = (
       
   589     '''\
       
   590 <!DOCTYPE wrapper
       
   591   %%s [
       
   592   <!ENTITY fragment-builder-internal
       
   593     SYSTEM "%s">
       
   594 %%s
       
   595 ]>
       
   596 <wrapper %%s
       
   597 >&fragment-builder-internal;</wrapper>'''
       
   598     % _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID)
       
   599 
       
   600 
       
   601 class FragmentBuilder(ExpatBuilder):
       
   602     """Builder which constructs document fragments given XML source
       
   603     text and a context node.
       
   604 
       
   605     The context node is expected to provide information about the
       
   606     namespace declarations which are in scope at the start of the
       
   607     fragment.
       
   608     """
       
   609 
       
   610     def __init__(self, context, options=None):
       
   611         if context.nodeType == DOCUMENT_NODE:
       
   612             self.originalDocument = context
       
   613             self.context = context
       
   614         else:
       
   615             self.originalDocument = context.ownerDocument
       
   616             self.context = context
       
   617         ExpatBuilder.__init__(self, options)
       
   618 
       
   619     def reset(self):
       
   620         ExpatBuilder.reset(self)
       
   621         self.fragment = None
       
   622 
       
   623     def parseFile(self, file):
       
   624         """Parse a document fragment from a file object, returning the
       
   625         fragment node."""
       
   626         return self.parseString(file.read())
       
   627 
       
   628     def parseString(self, string):
       
   629         """Parse a document fragment from a string, returning the
       
   630         fragment node."""
       
   631         self._source = string
       
   632         parser = self.getParser()
       
   633         doctype = self.originalDocument.doctype
       
   634         ident = ""
       
   635         if doctype:
       
   636             subset = doctype.internalSubset or self._getDeclarations()
       
   637             if doctype.publicId:
       
   638                 ident = ('PUBLIC "%s" "%s"'
       
   639                          % (doctype.publicId, doctype.systemId))
       
   640             elif doctype.systemId:
       
   641                 ident = 'SYSTEM "%s"' % doctype.systemId
       
   642         else:
       
   643             subset = ""
       
   644         nsattrs = self._getNSattrs() # get ns decls from node's ancestors
       
   645         document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs)
       
   646         try:
       
   647             parser.Parse(document, 1)
       
   648         except:
       
   649             self.reset()
       
   650             raise
       
   651         fragment = self.fragment
       
   652         self.reset()
       
   653 ##         self._parser = None
       
   654         return fragment
       
   655 
       
   656     def _getDeclarations(self):
       
   657         """Re-create the internal subset from the DocumentType node.
       
   658 
       
   659         This is only needed if we don't already have the
       
   660         internalSubset as a string.
       
   661         """
       
   662         doctype = self.context.ownerDocument.doctype
       
   663         s = ""
       
   664         if doctype:
       
   665             for i in range(doctype.notations.length):
       
   666                 notation = doctype.notations.item(i)
       
   667                 if s:
       
   668                     s = s + "\n  "
       
   669                 s = "%s<!NOTATION %s" % (s, notation.nodeName)
       
   670                 if notation.publicId:
       
   671                     s = '%s PUBLIC "%s"\n             "%s">' \
       
   672                         % (s, notation.publicId, notation.systemId)
       
   673                 else:
       
   674                     s = '%s SYSTEM "%s">' % (s, notation.systemId)
       
   675             for i in range(doctype.entities.length):
       
   676                 entity = doctype.entities.item(i)
       
   677                 if s:
       
   678                     s = s + "\n  "
       
   679                 s = "%s<!ENTITY %s" % (s, entity.nodeName)
       
   680                 if entity.publicId:
       
   681                     s = '%s PUBLIC "%s"\n             "%s"' \
       
   682                         % (s, entity.publicId, entity.systemId)
       
   683                 elif entity.systemId:
       
   684                     s = '%s SYSTEM "%s"' % (s, entity.systemId)
       
   685                 else:
       
   686                     s = '%s "%s"' % (s, entity.firstChild.data)
       
   687                 if entity.notationName:
       
   688                     s = "%s NOTATION %s" % (s, entity.notationName)
       
   689                 s = s + ">"
       
   690         return s
       
   691 
       
   692     def _getNSattrs(self):
       
   693         return ""
       
   694 
       
   695     def external_entity_ref_handler(self, context, base, systemId, publicId):
       
   696         if systemId == _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID:
       
   697             # this entref is the one that we made to put the subtree
       
   698             # in; all of our given input is parsed in here.
       
   699             old_document = self.document
       
   700             old_cur_node = self.curNode
       
   701             parser = self._parser.ExternalEntityParserCreate(context)
       
   702             # put the real document back, parse into the fragment to return
       
   703             self.document = self.originalDocument
       
   704             self.fragment = self.document.createDocumentFragment()
       
   705             self.curNode = self.fragment
       
   706             try:
       
   707                 parser.Parse(self._source, 1)
       
   708             finally:
       
   709                 self.curNode = old_cur_node
       
   710                 self.document = old_document
       
   711                 self._source = None
       
   712             return -1
       
   713         else:
       
   714             return ExpatBuilder.external_entity_ref_handler(
       
   715                 self, context, base, systemId, publicId)
       
   716 
       
   717 
       
   718 class Namespaces:
       
   719     """Mix-in class for builders; adds support for namespaces."""
       
   720 
       
   721     def _initNamespaces(self):
       
   722         # list of (prefix, uri) ns declarations.  Namespace attrs are
       
   723         # constructed from this and added to the element's attrs.
       
   724         self._ns_ordered_prefixes = []
       
   725 
       
   726     def createParser(self):
       
   727         """Create a new namespace-handling parser."""
       
   728         parser = expat.ParserCreate(namespace_separator=" ")
       
   729         parser.namespace_prefixes = True
       
   730         return parser
       
   731 
       
   732     def install(self, parser):
       
   733         """Insert the namespace-handlers onto the parser."""
       
   734         ExpatBuilder.install(self, parser)
       
   735         if self._options.namespace_declarations:
       
   736             parser.StartNamespaceDeclHandler = (
       
   737                 self.start_namespace_decl_handler)
       
   738 
       
   739     def start_namespace_decl_handler(self, prefix, uri):
       
   740         """Push this namespace declaration on our storage."""
       
   741         self._ns_ordered_prefixes.append((prefix, uri))
       
   742 
       
   743     def start_element_handler(self, name, attributes):
       
   744         if ' ' in name:
       
   745             uri, localname, prefix, qname = _parse_ns_name(self, name)
       
   746         else:
       
   747             uri = EMPTY_NAMESPACE
       
   748             qname = name
       
   749             localname = None
       
   750             prefix = EMPTY_PREFIX
       
   751         node = minidom.Element(qname, uri, prefix, localname)
       
   752         node.ownerDocument = self.document
       
   753         _append_child(self.curNode, node)
       
   754         self.curNode = node
       
   755 
       
   756         if self._ns_ordered_prefixes:
       
   757             for prefix, uri in self._ns_ordered_prefixes:
       
   758                 if prefix:
       
   759                     a = minidom.Attr(_intern(self, 'xmlns:' + prefix),
       
   760                                      XMLNS_NAMESPACE, prefix, "xmlns")
       
   761                 else:
       
   762                     a = minidom.Attr("xmlns", XMLNS_NAMESPACE,
       
   763                                      "xmlns", EMPTY_PREFIX)
       
   764                 d = a.childNodes[0].__dict__
       
   765                 d['data'] = d['nodeValue'] = uri
       
   766                 d = a.__dict__
       
   767                 d['value'] = d['nodeValue'] = uri
       
   768                 d['ownerDocument'] = self.document
       
   769                 _set_attribute_node(node, a)
       
   770             del self._ns_ordered_prefixes[:]
       
   771 
       
   772         if attributes:
       
   773             _attrs = node._attrs
       
   774             _attrsNS = node._attrsNS
       
   775             for i in range(0, len(attributes), 2):
       
   776                 aname = attributes[i]
       
   777                 value = attributes[i+1]
       
   778                 if ' ' in aname:
       
   779                     uri, localname, prefix, qname = _parse_ns_name(self, aname)
       
   780                     a = minidom.Attr(qname, uri, localname, prefix)
       
   781                     _attrs[qname] = a
       
   782                     _attrsNS[(uri, localname)] = a
       
   783                 else:
       
   784                     a = minidom.Attr(aname, EMPTY_NAMESPACE,
       
   785                                      aname, EMPTY_PREFIX)
       
   786                     _attrs[aname] = a
       
   787                     _attrsNS[(EMPTY_NAMESPACE, aname)] = a
       
   788                 d = a.childNodes[0].__dict__
       
   789                 d['data'] = d['nodeValue'] = value
       
   790                 d = a.__dict__
       
   791                 d['ownerDocument'] = self.document
       
   792                 d['value'] = d['nodeValue'] = value
       
   793                 d['ownerElement'] = node
       
   794 
       
   795     if __debug__:
       
   796         # This only adds some asserts to the original
       
   797         # end_element_handler(), so we only define this when -O is not
       
   798         # used.  If changing one, be sure to check the other to see if
       
   799         # it needs to be changed as well.
       
   800         #
       
   801         def end_element_handler(self, name):
       
   802             curNode = self.curNode
       
   803             if ' ' in name:
       
   804                 uri, localname, prefix, qname = _parse_ns_name(self, name)
       
   805                 assert (curNode.namespaceURI == uri
       
   806                         and curNode.localName == localname
       
   807                         and curNode.prefix == prefix), \
       
   808                         "element stack messed up! (namespace)"
       
   809             else:
       
   810                 assert curNode.nodeName == name, \
       
   811                        "element stack messed up - bad nodeName"
       
   812                 assert curNode.namespaceURI == EMPTY_NAMESPACE, \
       
   813                        "element stack messed up - bad namespaceURI"
       
   814             self.curNode = curNode.parentNode
       
   815             self._finish_end_element(curNode)
       
   816 
       
   817 
       
   818 class ExpatBuilderNS(Namespaces, ExpatBuilder):
       
   819     """Document builder that supports namespaces."""
       
   820 
       
   821     def reset(self):
       
   822         ExpatBuilder.reset(self)
       
   823         self._initNamespaces()
       
   824 
       
   825 
       
   826 class FragmentBuilderNS(Namespaces, FragmentBuilder):
       
   827     """Fragment builder that supports namespaces."""
       
   828 
       
   829     def reset(self):
       
   830         FragmentBuilder.reset(self)
       
   831         self._initNamespaces()
       
   832 
       
   833     def _getNSattrs(self):
       
   834         """Return string of namespace attributes from this element and
       
   835         ancestors."""
       
   836         # XXX This needs to be re-written to walk the ancestors of the
       
   837         # context to build up the namespace information from
       
   838         # declarations, elements, and attributes found in context.
       
   839         # Otherwise we have to store a bunch more data on the DOM
       
   840         # (though that *might* be more reliable -- not clear).
       
   841         attrs = ""
       
   842         context = self.context
       
   843         L = []
       
   844         while context:
       
   845             if hasattr(context, '_ns_prefix_uri'):
       
   846                 for prefix, uri in context._ns_prefix_uri.items():
       
   847                     # add every new NS decl from context to L and attrs string
       
   848                     if prefix in L:
       
   849                         continue
       
   850                     L.append(prefix)
       
   851                     if prefix:
       
   852                         declname = "xmlns:" + prefix
       
   853                     else:
       
   854                         declname = "xmlns"
       
   855                     if attrs:
       
   856                         attrs = "%s\n    %s='%s'" % (attrs, declname, uri)
       
   857                     else:
       
   858                         attrs = " %s='%s'" % (declname, uri)
       
   859             context = context.parentNode
       
   860         return attrs
       
   861 
       
   862 
       
   863 class ParseEscape(Exception):
       
   864     """Exception raised to short-circuit parsing in InternalSubsetExtractor."""
       
   865     pass
       
   866 
       
   867 class InternalSubsetExtractor(ExpatBuilder):
       
   868     """XML processor which can rip out the internal document type subset."""
       
   869 
       
   870     subset = None
       
   871 
       
   872     def getSubset(self):
       
   873         """Return the internal subset as a string."""
       
   874         return self.subset
       
   875 
       
   876     def parseFile(self, file):
       
   877         try:
       
   878             ExpatBuilder.parseFile(self, file)
       
   879         except ParseEscape:
       
   880             pass
       
   881 
       
   882     def parseString(self, string):
       
   883         try:
       
   884             ExpatBuilder.parseString(self, string)
       
   885         except ParseEscape:
       
   886             pass
       
   887 
       
   888     def install(self, parser):
       
   889         parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
       
   890         parser.StartElementHandler = self.start_element_handler
       
   891 
       
   892     def start_doctype_decl_handler(self, name, publicId, systemId,
       
   893                                    has_internal_subset):
       
   894         if has_internal_subset:
       
   895             parser = self.getParser()
       
   896             self.subset = []
       
   897             parser.DefaultHandler = self.subset.append
       
   898             parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler
       
   899         else:
       
   900             raise ParseEscape()
       
   901 
       
   902     def end_doctype_decl_handler(self):
       
   903         s = ''.join(self.subset).replace('\r\n', '\n').replace('\r', '\n')
       
   904         self.subset = s
       
   905         raise ParseEscape()
       
   906 
       
   907     def start_element_handler(self, name, attrs):
       
   908         raise ParseEscape()
       
   909 
       
   910 
       
   911 def parse(file, namespaces=True):
       
   912     """Parse a document, returning the resulting Document node.
       
   913 
       
   914     'file' may be either a file name or an open file object.
       
   915     """
       
   916     if namespaces:
       
   917         builder = ExpatBuilderNS()
       
   918     else:
       
   919         builder = ExpatBuilder()
       
   920 
       
   921     if isinstance(file, StringTypes):
       
   922         fp = open(file, 'rb')
       
   923         try:
       
   924             result = builder.parseFile(fp)
       
   925         finally:
       
   926             fp.close()
       
   927     else:
       
   928         result = builder.parseFile(file)
       
   929     return result
       
   930 
       
   931 
       
   932 def parseString(string, namespaces=True):
       
   933     """Parse a document from a string, returning the resulting
       
   934     Document node.
       
   935     """
       
   936     if namespaces:
       
   937         builder = ExpatBuilderNS()
       
   938     else:
       
   939         builder = ExpatBuilder()
       
   940     return builder.parseString(string)
       
   941 
       
   942 
       
   943 def parseFragment(file, context, namespaces=True):
       
   944     """Parse a fragment of a document, given the context from which it
       
   945     was originally extracted.  context should be the parent of the
       
   946     node(s) which are in the fragment.
       
   947 
       
   948     'file' may be either a file name or an open file object.
       
   949     """
       
   950     if namespaces:
       
   951         builder = FragmentBuilderNS(context)
       
   952     else:
       
   953         builder = FragmentBuilder(context)
       
   954 
       
   955     if isinstance(file, StringTypes):
       
   956         fp = open(file, 'rb')
       
   957         try:
       
   958             result = builder.parseFile(fp)
       
   959         finally:
       
   960             fp.close()
       
   961     else:
       
   962         result = builder.parseFile(file)
       
   963     return result
       
   964 
       
   965 
       
   966 def parseFragmentString(string, context, namespaces=True):
       
   967     """Parse a fragment of a document from a string, given the context
       
   968     from which it was originally extracted.  context should be the
       
   969     parent of the node(s) which are in the fragment.
       
   970     """
       
   971     if namespaces:
       
   972         builder = FragmentBuilderNS(context)
       
   973     else:
       
   974         builder = FragmentBuilder(context)
       
   975     return builder.parseString(string)
       
   976 
       
   977 
       
   978 def makeBuilder(options):
       
   979     """Create a builder based on an Options object."""
       
   980     if options.namespaces:
       
   981         return ExpatBuilderNS(options)
       
   982     else:
       
   983         return ExpatBuilder(options)