symbian-qemu-0.9.1-12/python-2.6.1/Lib/xmllib.py
changeset 1 2fb8b9db1c86
equal deleted inserted replaced
0:ffa851df0825 1:2fb8b9db1c86
       
     1 """A parser for XML, using the derived class as static DTD."""
       
     2 
       
     3 # Author: Sjoerd Mullender.
       
     4 
       
     5 import re
       
     6 import string
       
     7 
       
     8 import warnings
       
     9 warnings.warn("The xmllib module is obsolete.  Use xml.sax instead.", DeprecationWarning)
       
    10 del warnings
       
    11 
       
    12 version = '0.3'
       
    13 
       
    14 class Error(RuntimeError):
       
    15     pass
       
    16 
       
    17 # Regular expressions used for parsing
       
    18 
       
# Building blocks shared by the patterns below.  They are plain strings
# (not compiled) so that they can be concatenated into larger patterns.
_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
# Anything other than tab, CR, LF, space..\176 and \240..\377.
illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
# The characters at which goahead() stops scanning plain data: ], & and <.
interesting = re.compile('[]&<]')

amp = re.compile('&')
# An entity or character reference followed by exactly one character that
# cannot be part of a name (normally the terminating ';').
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
# Note that the 'char' group includes the character that ended the digits.
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
space = re.compile(_S + '$')            # white space up to the end
newline = re.compile('\n')

# One attribute: leading white space, a name and an optional value that is
# either a quoted string or a run of the unquoted characters listed.
attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>'+_QStr+'|[-a-zA-Z0-9.:+*%?!\(\)_#=~]+))?')
starttagopen = re.compile('<' + _Name)
starttagend = re.compile(_opS + '(?P<slash>/?)>')
# A complete start tag: name, any number of attributes, optional '/' and '>'.
starttagmatch = re.compile('<(?P<tagname>'+_Name+')'
                      '(?P<attrs>(?:'+attrfind.pattern+')*)'+
                      starttagend.pattern)
endtagopen = re.compile('</')
endbracket = re.compile(_opS + '>')
# Everything up to and including the next '>' that is not inside quotes.
endbracketfind = re.compile('(?:[^>\'"]|'+_QStr+')*>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
# this matches one of the following:
# SYSTEM SystemLiteral
# PUBLIC PubidLiteral SystemLiteral
# (the %s in the two templates below receives the group name)
_SystemLiteral = '(?P<%s>'+_QStr+')'
_PublicLiteral = '(?P<%s>"[-\'\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                        "'[-\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|' \
                 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
              ')'+_S+_SystemLiteral%'syslit'
# The start of a DOCTYPE declaration, up to (not including) an internal
# subset or the closing '>'; groups: name, pubid, syslit.
doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
                     '(?:'+_S+_ExternalId+')?'+_opS)
# The <?xml ...?> declaration; groups: version, encoding, standalone
# (each still including its quotes).
xmldecl = re.compile('<\?xml'+_S+
                     'version'+_opS+'='+_opS+'(?P<version>'+_QStr+')'+
                     '(?:'+_S+'encoding'+_opS+'='+_opS+
                        "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                        '"[A-Za-z][-A-Za-z0-9._]*"))?'
                     '(?:'+_S+'standalone'+_opS+'='+_opS+
                        '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
                     _opS+'\?>')
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
# Translation table mapping space, CR, LF and tab to a space; applied to
# attribute values by parse_attributes().
attrtrans = string.maketrans(' \r\n\t', '    ')

# definitions for XML namespaces
_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"
ncname = re.compile(_NCName + '$')
qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix
                   '(?P<local>' + _NCName + ')$')

# An attribute name that declares a namespace: 'xmlns' or 'xmlns:prefix'.
xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$')
       
    81 
       
    82 # XML parser base class -- find tags and call handler functions.
       
    83 # Usage: p = XMLParser(); p.feed(data); ...; p.close().
       
    84 # The dtd is defined by deriving a class which defines methods with
       
    85 # special names to handle tags: start_foo and end_foo to handle <foo>
       
    86 # and </foo>, respectively.  The data between tags is passed to the
       
    87 # parser by calling self.handle_data() with some data as argument (the
       
    88 # data may be split up in arbitrary chunks).
       
    89 
       
class XMLParser:
    # Class-level defaults.  reset() compares self.elements against
    # XMLParser.elements to decide whether the element table still has to
    # be built from the start_*/end_* methods.
    attributes = {}                     # default, to be overridden
    elements = {}                       # default, to be overridden

    # parsing options, settable using keyword args in __init__
    # (the keyword is the attribute name without the leading underscores)
    __accept_unquoted_attributes = 0
    __accept_missing_endtag_name = 0
    __map_case = 0
    __accept_utf8 = 0
    __translate_attribute_references = 1
       
   100 
       
   101     # Interface -- initialize and reset this instance
       
   102     def __init__(self, **kw):
       
   103         self.__fixed = 0
       
   104         if 'accept_unquoted_attributes' in kw:
       
   105             self.__accept_unquoted_attributes = kw['accept_unquoted_attributes']
       
   106         if 'accept_missing_endtag_name' in kw:
       
   107             self.__accept_missing_endtag_name = kw['accept_missing_endtag_name']
       
   108         if 'map_case' in kw:
       
   109             self.__map_case = kw['map_case']
       
   110         if 'accept_utf8' in kw:
       
   111             self.__accept_utf8 = kw['accept_utf8']
       
   112         if 'translate_attribute_references' in kw:
       
   113             self.__translate_attribute_references = kw['translate_attribute_references']
       
   114         self.reset()
       
   115 
       
   116     def __fixelements(self):
       
   117         self.__fixed = 1
       
   118         self.elements = {}
       
   119         self.__fixdict(self.__dict__)
       
   120         self.__fixclass(self.__class__)
       
   121 
       
   122     def __fixclass(self, kl):
       
   123         self.__fixdict(kl.__dict__)
       
   124         for k in kl.__bases__:
       
   125             self.__fixclass(k)
       
   126 
       
   127     def __fixdict(self, dict):
       
   128         for key in dict.keys():
       
   129             if key[:6] == 'start_':
       
   130                 tag = key[6:]
       
   131                 start, end = self.elements.get(tag, (None, None))
       
   132                 if start is None:
       
   133                     self.elements[tag] = getattr(self, key), end
       
   134             elif key[:4] == 'end_':
       
   135                 tag = key[4:]
       
   136                 start, end = self.elements.get(tag, (None, None))
       
   137                 if end is None:
       
   138                     self.elements[tag] = start, getattr(self, key)
       
   139 
       
   140     # Interface -- reset this instance.  Loses all unprocessed data
       
   141     def reset(self):
       
   142         self.rawdata = ''
       
   143         self.stack = []
       
   144         self.nomoretags = 0
       
   145         self.literal = 0
       
   146         self.lineno = 1
       
   147         self.__at_start = 1
       
   148         self.__seen_doctype = None
       
   149         self.__seen_starttag = 0
       
   150         self.__use_namespaces = 0
       
   151         self.__namespaces = {'xml':None}   # xml is implicitly declared
       
   152         # backward compatibility hack: if elements not overridden,
       
   153         # fill it in ourselves
       
   154         if self.elements is XMLParser.elements:
       
   155             self.__fixelements()
       
   156 
       
   157     # For derived classes only -- enter literal mode (CDATA) till EOF
       
   158     def setnomoretags(self):
       
   159         self.nomoretags = self.literal = 1
       
   160 
       
   161     # For derived classes only -- enter literal mode (CDATA)
       
   162     def setliteral(self, *args):
       
   163         self.literal = 1
       
   164 
       
   165     # Interface -- feed some data to the parser.  Call this as
       
   166     # often as you want, with as little or as much text as you
       
   167     # want (may include '\n').  (This just saves the text, all the
       
   168     # processing is done by goahead().)
       
   169     def feed(self, data):
       
   170         self.rawdata = self.rawdata + data
       
   171         self.goahead(0)
       
   172 
       
   173     # Interface -- handle the remaining data
       
   174     def close(self):
       
   175         self.goahead(1)
       
   176         if self.__fixed:
       
   177             self.__fixed = 0
       
   178             # remove self.elements so that we don't leak
       
   179             del self.elements
       
   180 
       
   181     # Interface -- translate references
       
   182     def translate_references(self, data, all = 1):
       
   183         if not self.__translate_attribute_references:
       
   184             return data
       
   185         i = 0
       
   186         while 1:
       
   187             res = amp.search(data, i)
       
   188             if res is None:
       
   189                 return data
       
   190             s = res.start(0)
       
   191             res = ref.match(data, s)
       
   192             if res is None:
       
   193                 self.syntax_error("bogus `&'")
       
   194                 i = s+1
       
   195                 continue
       
   196             i = res.end(0)
       
   197             str = res.group(1)
       
   198             rescan = 0
       
   199             if str[0] == '#':
       
   200                 if str[1] == 'x':
       
   201                     str = chr(int(str[2:], 16))
       
   202                 else:
       
   203                     str = chr(int(str[1:]))
       
   204                 if data[i - 1] != ';':
       
   205                     self.syntax_error("`;' missing after char reference")
       
   206                     i = i-1
       
   207             elif all:
       
   208                 if str in self.entitydefs:
       
   209                     str = self.entitydefs[str]
       
   210                     rescan = 1
       
   211                 elif data[i - 1] != ';':
       
   212                     self.syntax_error("bogus `&'")
       
   213                     i = s + 1 # just past the &
       
   214                     continue
       
   215                 else:
       
   216                     self.syntax_error("reference to unknown entity `&%s;'" % str)
       
   217                     str = '&' + str + ';'
       
   218             elif data[i - 1] != ';':
       
   219                 self.syntax_error("bogus `&'")
       
   220                 i = s + 1 # just past the &
       
   221                 continue
       
   222 
       
   223             # when we get here, str contains the translated text and i points
       
   224             # to the end of the string that is to be replaced
       
   225             data = data[:s] + str + data[i:]
       
   226             if rescan:
       
   227                 i = s
       
   228             else:
       
   229                 i = s + len(str)
       
   230 
       
   231     # Interface - return a dictionary of all namespaces currently valid
       
   232     def getnamespace(self):
       
   233         nsdict = {}
       
   234         for t, d, nst in self.stack:
       
   235             nsdict.update(d)
       
   236         return nsdict
       
   237 
       
   238     # Internal -- handle data as far as reasonable.  May leave state
       
   239     # and data to be processed by a subsequent call.  If 'end' is
       
   240     # true, force handling all data as if followed by EOF marker.
       
   241     def goahead(self, end):
       
   242         rawdata = self.rawdata
       
   243         i = 0
       
   244         n = len(rawdata)
       
   245         while i < n:
       
   246             if i > 0:
       
   247                 self.__at_start = 0
       
   248             if self.nomoretags:
       
   249                 data = rawdata[i:n]
       
   250                 self.handle_data(data)
       
   251                 self.lineno = self.lineno + data.count('\n')
       
   252                 i = n
       
   253                 break
       
   254             res = interesting.search(rawdata, i)
       
   255             if res:
       
   256                 j = res.start(0)
       
   257             else:
       
   258                 j = n
       
   259             if i < j:
       
   260                 data = rawdata[i:j]
       
   261                 if self.__at_start and space.match(data) is None:
       
   262                     self.syntax_error('illegal data at start of file')
       
   263                 self.__at_start = 0
       
   264                 if not self.stack and space.match(data) is None:
       
   265                     self.syntax_error('data not in content')
       
   266                 if not self.__accept_utf8 and illegal.search(data):
       
   267                     self.syntax_error('illegal character in content')
       
   268                 self.handle_data(data)
       
   269                 self.lineno = self.lineno + data.count('\n')
       
   270             i = j
       
   271             if i == n: break
       
   272             if rawdata[i] == '<':
       
   273                 if starttagopen.match(rawdata, i):
       
   274                     if self.literal:
       
   275                         data = rawdata[i]
       
   276                         self.handle_data(data)
       
   277                         self.lineno = self.lineno + data.count('\n')
       
   278                         i = i+1
       
   279                         continue
       
   280                     k = self.parse_starttag(i)
       
   281                     if k < 0: break
       
   282                     self.__seen_starttag = 1
       
   283                     self.lineno = self.lineno + rawdata[i:k].count('\n')
       
   284                     i = k
       
   285                     continue
       
   286                 if endtagopen.match(rawdata, i):
       
   287                     k = self.parse_endtag(i)
       
   288                     if k < 0: break
       
   289                     self.lineno = self.lineno + rawdata[i:k].count('\n')
       
   290                     i =  k
       
   291                     continue
       
   292                 if commentopen.match(rawdata, i):
       
   293                     if self.literal:
       
   294                         data = rawdata[i]
       
   295                         self.handle_data(data)
       
   296                         self.lineno = self.lineno + data.count('\n')
       
   297                         i = i+1
       
   298                         continue
       
   299                     k = self.parse_comment(i)
       
   300                     if k < 0: break
       
   301                     self.lineno = self.lineno + rawdata[i:k].count('\n')
       
   302                     i = k
       
   303                     continue
       
   304                 if cdataopen.match(rawdata, i):
       
   305                     k = self.parse_cdata(i)
       
   306                     if k < 0: break
       
   307                     self.lineno = self.lineno + rawdata[i:k].count('\n')
       
   308                     i = k
       
   309                     continue
       
   310                 res = xmldecl.match(rawdata, i)
       
   311                 if res:
       
   312                     if not self.__at_start:
       
   313                         self.syntax_error("<?xml?> declaration not at start of document")
       
   314                     version, encoding, standalone = res.group('version',
       
   315                                                               'encoding',
       
   316                                                               'standalone')
       
   317                     if version[1:-1] != '1.0':
       
   318                         raise Error('only XML version 1.0 supported')
       
   319                     if encoding: encoding = encoding[1:-1]
       
   320                     if standalone: standalone = standalone[1:-1]
       
   321                     self.handle_xml(encoding, standalone)
       
   322                     i = res.end(0)
       
   323                     continue
       
   324                 res = procopen.match(rawdata, i)
       
   325                 if res:
       
   326                     k = self.parse_proc(i)
       
   327                     if k < 0: break
       
   328                     self.lineno = self.lineno + rawdata[i:k].count('\n')
       
   329                     i = k
       
   330                     continue
       
   331                 res = doctype.match(rawdata, i)
       
   332                 if res:
       
   333                     if self.literal:
       
   334                         data = rawdata[i]
       
   335                         self.handle_data(data)
       
   336                         self.lineno = self.lineno + data.count('\n')
       
   337                         i = i+1
       
   338                         continue
       
   339                     if self.__seen_doctype:
       
   340                         self.syntax_error('multiple DOCTYPE elements')
       
   341                     if self.__seen_starttag:
       
   342                         self.syntax_error('DOCTYPE not at beginning of document')
       
   343                     k = self.parse_doctype(res)
       
   344                     if k < 0: break
       
   345                     self.__seen_doctype = res.group('name')
       
   346                     if self.__map_case:
       
   347                         self.__seen_doctype = self.__seen_doctype.lower()
       
   348                     self.lineno = self.lineno + rawdata[i:k].count('\n')
       
   349                     i = k
       
   350                     continue
       
   351             elif rawdata[i] == '&':
       
   352                 if self.literal:
       
   353                     data = rawdata[i]
       
   354                     self.handle_data(data)
       
   355                     i = i+1
       
   356                     continue
       
   357                 res = charref.match(rawdata, i)
       
   358                 if res is not None:
       
   359                     i = res.end(0)
       
   360                     if rawdata[i-1] != ';':
       
   361                         self.syntax_error("`;' missing in charref")
       
   362                         i = i-1
       
   363                     if not self.stack:
       
   364                         self.syntax_error('data not in content')
       
   365                     self.handle_charref(res.group('char')[:-1])
       
   366                     self.lineno = self.lineno + res.group(0).count('\n')
       
   367                     continue
       
   368                 res = entityref.match(rawdata, i)
       
   369                 if res is not None:
       
   370                     i = res.end(0)
       
   371                     if rawdata[i-1] != ';':
       
   372                         self.syntax_error("`;' missing in entityref")
       
   373                         i = i-1
       
   374                     name = res.group('name')
       
   375                     if self.__map_case:
       
   376                         name = name.lower()
       
   377                     if name in self.entitydefs:
       
   378                         self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
       
   379                         n = len(rawdata)
       
   380                         i = res.start(0)
       
   381                     else:
       
   382                         self.unknown_entityref(name)
       
   383                     self.lineno = self.lineno + res.group(0).count('\n')
       
   384                     continue
       
   385             elif rawdata[i] == ']':
       
   386                 if self.literal:
       
   387                     data = rawdata[i]
       
   388                     self.handle_data(data)
       
   389                     i = i+1
       
   390                     continue
       
   391                 if n-i < 3:
       
   392                     break
       
   393                 if cdataclose.match(rawdata, i):
       
   394                     self.syntax_error("bogus `]]>'")
       
   395                 self.handle_data(rawdata[i])
       
   396                 i = i+1
       
   397                 continue
       
   398             else:
       
   399                 raise Error('neither < nor & ??')
       
   400             # We get here only if incomplete matches but
       
   401             # nothing else
       
   402             break
       
   403         # end while
       
   404         if i > 0:
       
   405             self.__at_start = 0
       
   406         if end and i < n:
       
   407             data = rawdata[i]
       
   408             self.syntax_error("bogus `%s'" % data)
       
   409             if not self.__accept_utf8 and illegal.search(data):
       
   410                 self.syntax_error('illegal character in content')
       
   411             self.handle_data(data)
       
   412             self.lineno = self.lineno + data.count('\n')
       
   413             self.rawdata = rawdata[i+1:]
       
   414             return self.goahead(end)
       
   415         self.rawdata = rawdata[i:]
       
   416         if end:
       
   417             if not self.__seen_starttag:
       
   418                 self.syntax_error('no elements in file')
       
   419             if self.stack:
       
   420                 self.syntax_error('missing end tags')
       
   421                 while self.stack:
       
   422                     self.finish_endtag(self.stack[-1][0])
       
   423 
       
   424     # Internal -- parse comment, return length or -1 if not terminated
       
   425     def parse_comment(self, i):
       
   426         rawdata = self.rawdata
       
   427         if rawdata[i:i+4] != '<!--':
       
   428             raise Error('unexpected call to handle_comment')
       
   429         res = commentclose.search(rawdata, i+4)
       
   430         if res is None:
       
   431             return -1
       
   432         if doubledash.search(rawdata, i+4, res.start(0)):
       
   433             self.syntax_error("`--' inside comment")
       
   434         if rawdata[res.start(0)-1] == '-':
       
   435             self.syntax_error('comment cannot end in three dashes')
       
   436         if not self.__accept_utf8 and \
       
   437            illegal.search(rawdata, i+4, res.start(0)):
       
   438             self.syntax_error('illegal character in comment')
       
   439         self.handle_comment(rawdata[i+4: res.start(0)])
       
   440         return res.end(0)
       
   441 
       
   442     # Internal -- handle DOCTYPE tag, return length or -1 if not terminated
       
   443     def parse_doctype(self, res):
       
   444         rawdata = self.rawdata
       
   445         n = len(rawdata)
       
   446         name = res.group('name')
       
   447         if self.__map_case:
       
   448             name = name.lower()
       
   449         pubid, syslit = res.group('pubid', 'syslit')
       
   450         if pubid is not None:
       
   451             pubid = pubid[1:-1]         # remove quotes
       
   452             pubid = ' '.join(pubid.split()) # normalize
       
   453         if syslit is not None: syslit = syslit[1:-1] # remove quotes
       
   454         j = k = res.end(0)
       
   455         if k >= n:
       
   456             return -1
       
   457         if rawdata[k] == '[':
       
   458             level = 0
       
   459             k = k+1
       
   460             dq = sq = 0
       
   461             while k < n:
       
   462                 c = rawdata[k]
       
   463                 if not sq and c == '"':
       
   464                     dq = not dq
       
   465                 elif not dq and c == "'":
       
   466                     sq = not sq
       
   467                 elif sq or dq:
       
   468                     pass
       
   469                 elif level <= 0 and c == ']':
       
   470                     res = endbracket.match(rawdata, k+1)
       
   471                     if res is None:
       
   472                         return -1
       
   473                     self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
       
   474                     return res.end(0)
       
   475                 elif c == '<':
       
   476                     level = level + 1
       
   477                 elif c == '>':
       
   478                     level = level - 1
       
   479                     if level < 0:
       
   480                         self.syntax_error("bogus `>' in DOCTYPE")
       
   481                 k = k+1
       
   482         res = endbracketfind.match(rawdata, k)
       
   483         if res is None:
       
   484             return -1
       
   485         if endbracket.match(rawdata, k) is None:
       
   486             self.syntax_error('garbage in DOCTYPE')
       
   487         self.handle_doctype(name, pubid, syslit, None)
       
   488         return res.end(0)
       
   489 
       
   490     # Internal -- handle CDATA tag, return length or -1 if not terminated
       
   491     def parse_cdata(self, i):
       
   492         rawdata = self.rawdata
       
   493         if rawdata[i:i+9] != '<![CDATA[':
       
   494             raise Error('unexpected call to parse_cdata')
       
   495         res = cdataclose.search(rawdata, i+9)
       
   496         if res is None:
       
   497             return -1
       
   498         if not self.__accept_utf8 and \
       
   499            illegal.search(rawdata, i+9, res.start(0)):
       
   500             self.syntax_error('illegal character in CDATA')
       
   501         if not self.stack:
       
   502             self.syntax_error('CDATA not in content')
       
   503         self.handle_cdata(rawdata[i+9:res.start(0)])
       
   504         return res.end(0)
       
   505 
       
    # The attribute names parse_proc() accepts in an old-fashioned
    # <?xml:namespace ...?> declaration; only the keys are consulted.
    __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None}
       
   507     # Internal -- handle a processing instruction tag
       
   508     def parse_proc(self, i):
       
   509         rawdata = self.rawdata
       
   510         end = procclose.search(rawdata, i)
       
   511         if end is None:
       
   512             return -1
       
   513         j = end.start(0)
       
   514         if not self.__accept_utf8 and illegal.search(rawdata, i+2, j):
       
   515             self.syntax_error('illegal character in processing instruction')
       
   516         res = tagfind.match(rawdata, i+2)
       
   517         if res is None:
       
   518             raise Error('unexpected call to parse_proc')
       
   519         k = res.end(0)
       
   520         name = res.group(0)
       
   521         if self.__map_case:
       
   522             name = name.lower()
       
   523         if name == 'xml:namespace':
       
   524             self.syntax_error('old-fashioned namespace declaration')
       
   525             self.__use_namespaces = -1
       
   526             # namespace declaration
       
   527             # this must come after the <?xml?> declaration (if any)
       
   528             # and before the <!DOCTYPE> (if any).
       
   529             if self.__seen_doctype or self.__seen_starttag:
       
   530                 self.syntax_error('xml:namespace declaration too late in document')
       
   531             attrdict, namespace, k = self.parse_attributes(name, k, j)
       
   532             if namespace:
       
   533                 self.syntax_error('namespace declaration inside namespace declaration')
       
   534             for attrname in attrdict.keys():
       
   535                 if not attrname in self.__xml_namespace_attributes:
       
   536                     self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname)
       
   537             if not 'ns' in attrdict or not 'prefix' in attrdict:
       
   538                 self.syntax_error('xml:namespace without required attributes')
       
   539             prefix = attrdict.get('prefix')
       
   540             if ncname.match(prefix) is None:
       
   541                 self.syntax_error('xml:namespace illegal prefix value')
       
   542                 return end.end(0)
       
   543             if prefix in self.__namespaces:
       
   544                 self.syntax_error('xml:namespace prefix not unique')
       
   545             self.__namespaces[prefix] = attrdict['ns']
       
   546         else:
       
   547             if name.lower() == 'xml':
       
   548                 self.syntax_error('illegal processing instruction target name')
       
   549             self.handle_proc(name, rawdata[k:j])
       
   550         return end.end(0)
       
   551 
       
   552     # Internal -- parse attributes between i and j
       
   553     def parse_attributes(self, tag, i, j):
       
   554         rawdata = self.rawdata
       
   555         attrdict = {}
       
   556         namespace = {}
       
   557         while i < j:
       
   558             res = attrfind.match(rawdata, i)
       
   559             if res is None:
       
   560                 break
       
   561             attrname, attrvalue = res.group('name', 'value')
       
   562             if self.__map_case:
       
   563                 attrname = attrname.lower()
       
   564             i = res.end(0)
       
   565             if attrvalue is None:
       
   566                 self.syntax_error("no value specified for attribute `%s'" % attrname)
       
   567                 attrvalue = attrname
       
   568             elif attrvalue[:1] == "'" == attrvalue[-1:] or \
       
   569                  attrvalue[:1] == '"' == attrvalue[-1:]:
       
   570                 attrvalue = attrvalue[1:-1]
       
   571             elif not self.__accept_unquoted_attributes:
       
   572                 self.syntax_error("attribute `%s' value not quoted" % attrname)
       
   573             res = xmlns.match(attrname)
       
   574             if res is not None:
       
   575                 # namespace declaration
       
   576                 ncname = res.group('ncname')
       
   577                 namespace[ncname or ''] = attrvalue or None
       
   578                 if not self.__use_namespaces:
       
   579                     self.__use_namespaces = len(self.stack)+1
       
   580                 continue
       
   581             if '<' in attrvalue:
       
   582                 self.syntax_error("`<' illegal in attribute value")
       
   583             if attrname in attrdict:
       
   584                 self.syntax_error("attribute `%s' specified twice" % attrname)
       
   585             attrvalue = attrvalue.translate(attrtrans)
       
   586             attrdict[attrname] = self.translate_references(attrvalue)
       
   587         return attrdict, namespace, i
       
   588 
       
   589     # Internal -- handle starttag, return length or -1 if not terminated
       
   590     def parse_starttag(self, i):
       
   591         rawdata = self.rawdata
       
   592         # i points to start of tag
       
   593         end = endbracketfind.match(rawdata, i+1)
       
   594         if end is None:
       
   595             return -1
       
   596         tag = starttagmatch.match(rawdata, i)
       
   597         if tag is None or tag.end(0) != end.end(0):
       
   598             self.syntax_error('garbage in starttag')
       
   599             return end.end(0)
       
   600         nstag = tagname = tag.group('tagname')
       
   601         if self.__map_case:
       
   602             nstag = tagname = nstag.lower()
       
   603         if not self.__seen_starttag and self.__seen_doctype and \
       
   604            tagname != self.__seen_doctype:
       
   605             self.syntax_error('starttag does not match DOCTYPE')
       
   606         if self.__seen_starttag and not self.stack:
       
   607             self.syntax_error('multiple elements on top level')
       
   608         k, j = tag.span('attrs')
       
   609         attrdict, nsdict, k = self.parse_attributes(tagname, k, j)
       
   610         self.stack.append((tagname, nsdict, nstag))
       
   611         if self.__use_namespaces:
       
   612             res = qname.match(tagname)
       
   613         else:
       
   614             res = None
       
   615         if res is not None:
       
   616             prefix, nstag = res.group('prefix', 'local')
       
   617             if prefix is None:
       
   618                 prefix = ''
       
   619             ns = None
       
   620             for t, d, nst in self.stack:
       
   621                 if prefix in d:
       
   622                     ns = d[prefix]
       
   623             if ns is None and prefix != '':
       
   624                 ns = self.__namespaces.get(prefix)
       
   625             if ns is not None:
       
   626                 nstag = ns + ' ' + nstag
       
   627             elif prefix != '':
       
   628                 nstag = prefix + ':' + nstag # undo split
       
   629             self.stack[-1] = tagname, nsdict, nstag
       
   630         # translate namespace of attributes
       
   631         attrnamemap = {} # map from new name to old name (used for error reporting)
       
   632         for key in attrdict.keys():
       
   633             attrnamemap[key] = key
       
   634         if self.__use_namespaces:
       
   635             nattrdict = {}
       
   636             for key, val in attrdict.items():
       
   637                 okey = key
       
   638                 res = qname.match(key)
       
   639                 if res is not None:
       
   640                     aprefix, key = res.group('prefix', 'local')
       
   641                     if self.__map_case:
       
   642                         key = key.lower()
       
   643                     if aprefix is not None:
       
   644                         ans = None
       
   645                         for t, d, nst in self.stack:
       
   646                             if aprefix in d:
       
   647                                 ans = d[aprefix]
       
   648                         if ans is None:
       
   649                             ans = self.__namespaces.get(aprefix)
       
   650                         if ans is not None:
       
   651                             key = ans + ' ' + key
       
   652                         else:
       
   653                             key = aprefix + ':' + key
       
   654                 nattrdict[key] = val
       
   655                 attrnamemap[key] = okey
       
   656             attrdict = nattrdict
       
   657         attributes = self.attributes.get(nstag)
       
   658         if attributes is not None:
       
   659             for key in attrdict.keys():
       
   660                 if not key in attributes:
       
   661                     self.syntax_error("unknown attribute `%s' in tag `%s'" % (attrnamemap[key], tagname))
       
   662             for key, val in attributes.items():
       
   663                 if val is not None and not key in attrdict:
       
   664                     attrdict[key] = val
       
   665         method = self.elements.get(nstag, (None, None))[0]
       
   666         self.finish_starttag(nstag, attrdict, method)
       
   667         if tag.group('slash') == '/':
       
   668             self.finish_endtag(tagname)
       
   669         return tag.end(0)
       
   670 
       
   671     # Internal -- parse endtag
       
   672     def parse_endtag(self, i):
       
   673         rawdata = self.rawdata
       
   674         end = endbracketfind.match(rawdata, i+1)
       
   675         if end is None:
       
   676             return -1
       
   677         res = tagfind.match(rawdata, i+2)
       
   678         if res is None:
       
   679             if self.literal:
       
   680                 self.handle_data(rawdata[i])
       
   681                 return i+1
       
   682             if not self.__accept_missing_endtag_name:
       
   683                 self.syntax_error('no name specified in end tag')
       
   684             tag = self.stack[-1][0]
       
   685             k = i+2
       
   686         else:
       
   687             tag = res.group(0)
       
   688             if self.__map_case:
       
   689                 tag = tag.lower()
       
   690             if self.literal:
       
   691                 if not self.stack or tag != self.stack[-1][0]:
       
   692                     self.handle_data(rawdata[i])
       
   693                     return i+1
       
   694             k = res.end(0)
       
   695         if endbracket.match(rawdata, k) is None:
       
   696             self.syntax_error('garbage in end tag')
       
   697         self.finish_endtag(tag)
       
   698         return end.end(0)
       
   699 
       
   700     # Internal -- finish processing of start tag
       
   701     def finish_starttag(self, tagname, attrdict, method):
       
   702         if method is not None:
       
   703             self.handle_starttag(tagname, method, attrdict)
       
   704         else:
       
   705             self.unknown_starttag(tagname, attrdict)
       
   706 
       
   707     # Internal -- finish processing of end tag
       
   708     def finish_endtag(self, tag):
       
   709         self.literal = 0
       
   710         if not tag:
       
   711             self.syntax_error('name-less end tag')
       
   712             found = len(self.stack) - 1
       
   713             if found < 0:
       
   714                 self.unknown_endtag(tag)
       
   715                 return
       
   716         else:
       
   717             found = -1
       
   718             for i in range(len(self.stack)):
       
   719                 if tag == self.stack[i][0]:
       
   720                     found = i
       
   721             if found == -1:
       
   722                 self.syntax_error('unopened end tag')
       
   723                 return
       
   724         while len(self.stack) > found:
       
   725             if found < len(self.stack) - 1:
       
   726                 self.syntax_error('missing close tag for %s' % self.stack[-1][2])
       
   727             nstag = self.stack[-1][2]
       
   728             method = self.elements.get(nstag, (None, None))[1]
       
   729             if method is not None:
       
   730                 self.handle_endtag(nstag, method)
       
   731             else:
       
   732                 self.unknown_endtag(nstag)
       
   733             if self.__use_namespaces == len(self.stack):
       
   734                 self.__use_namespaces = 0
       
   735             del self.stack[-1]
       
   736 
       
   737     # Overridable -- handle xml processing instruction
       
   738     def handle_xml(self, encoding, standalone):
       
   739         pass
       
   740 
       
   741     # Overridable -- handle DOCTYPE
       
   742     def handle_doctype(self, tag, pubid, syslit, data):
       
   743         pass
       
   744 
       
   745     # Overridable -- handle start tag
       
   746     def handle_starttag(self, tag, method, attrs):
       
   747         method(attrs)
       
   748 
       
   749     # Overridable -- handle end tag
       
   750     def handle_endtag(self, tag, method):
       
   751         method()
       
   752 
       
   753     # Example -- handle character reference, no need to override
       
   754     def handle_charref(self, name):
       
   755         try:
       
   756             if name[0] == 'x':
       
   757                 n = int(name[1:], 16)
       
   758             else:
       
   759                 n = int(name)
       
   760         except ValueError:
       
   761             self.unknown_charref(name)
       
   762             return
       
   763         if not 0 <= n <= 255:
       
   764             self.unknown_charref(name)
       
   765             return
       
   766         self.handle_data(chr(n))
       
   767 
       
   768     # Definition of entities -- derived classes may override
       
    # Maps an entity name to its replacement text.  Every predefined entity
    # is defined as a decimal character reference rather than as the literal
    # character.
    entitydefs = {'lt': '&#60;',        # must use charref
                  'gt': '&#62;',
                  'amp': '&#38;',       # must use charref
                  'quot': '&#34;',
                  'apos': '&#39;',
                  }
       
   775 
       
   776     # Example -- handle data, should be overridden
       
   777     def handle_data(self, data):
       
   778         pass
       
   779 
       
   780     # Example -- handle cdata, could be overridden
       
   781     def handle_cdata(self, data):
       
   782         pass
       
   783 
       
   784     # Example -- handle comment, could be overridden
       
   785     def handle_comment(self, data):
       
   786         pass
       
   787 
       
   788     # Example -- handle processing instructions, could be overridden
       
   789     def handle_proc(self, name, data):
       
   790         pass
       
   791 
       
   792     # Example -- handle relatively harmless syntax errors, could be overridden
       
   793     def syntax_error(self, message):
       
   794         raise Error('Syntax error at line %d: %s' % (self.lineno, message))
       
   795 
       
   796     # To be overridden -- handlers for unknown objects
       
   797     def unknown_starttag(self, tag, attrs): pass
       
   798     def unknown_endtag(self, tag): pass
       
   799     def unknown_charref(self, ref): pass
       
   800     def unknown_entityref(self, name):
       
   801         self.syntax_error("reference to unknown entity `&%s;'" % name)
       
   802 
       
   803 
       
   804 class TestXMLParser(XMLParser):
       
   805 
       
   806     def __init__(self, **kw):
       
   807         self.testdata = ""
       
   808         XMLParser.__init__(self, **kw)
       
   809 
       
   810     def handle_xml(self, encoding, standalone):
       
   811         self.flush()
       
   812         print 'xml: encoding =',encoding,'standalone =',standalone
       
   813 
       
   814     def handle_doctype(self, tag, pubid, syslit, data):
       
   815         self.flush()
       
   816         print 'DOCTYPE:',tag, repr(data)
       
   817 
       
   818     def handle_data(self, data):
       
   819         self.testdata = self.testdata + data
       
   820         if len(repr(self.testdata)) >= 70:
       
   821             self.flush()
       
   822 
       
   823     def flush(self):
       
   824         data = self.testdata
       
   825         if data:
       
   826             self.testdata = ""
       
   827             print 'data:', repr(data)
       
   828 
       
   829     def handle_cdata(self, data):
       
   830         self.flush()
       
   831         print 'cdata:', repr(data)
       
   832 
       
   833     def handle_proc(self, name, data):
       
   834         self.flush()
       
   835         print 'processing:',name,repr(data)
       
   836 
       
   837     def handle_comment(self, data):
       
   838         self.flush()
       
   839         r = repr(data)
       
   840         if len(r) > 68:
       
   841             r = r[:32] + '...' + r[-32:]
       
   842         print 'comment:', r
       
   843 
       
   844     def syntax_error(self, message):
       
   845         print 'error at line %d:' % self.lineno, message
       
   846 
       
   847     def unknown_starttag(self, tag, attrs):
       
   848         self.flush()
       
   849         if not attrs:
       
   850             print 'start tag: <' + tag + '>'
       
   851         else:
       
   852             print 'start tag: <' + tag,
       
   853             for name, value in attrs.items():
       
   854                 print name + '=' + '"' + value + '"',
       
   855             print '>'
       
   856 
       
   857     def unknown_endtag(self, tag):
       
   858         self.flush()
       
   859         print 'end tag: </' + tag + '>'
       
   860 
       
   861     def unknown_entityref(self, ref):
       
   862         self.flush()
       
   863         print '*** unknown entity ref: &' + ref + ';'
       
   864 
       
   865     def unknown_charref(self, ref):
       
   866         self.flush()
       
   867         print '*** unknown char ref: &#' + ref + ';'
       
   868 
       
   869     def close(self):
       
   870         XMLParser.close(self)
       
   871         self.flush()
       
   872 
       
   873 def test(args = None):
       
   874     import sys, getopt
       
   875     from time import time
       
   876 
       
   877     if not args:
       
   878         args = sys.argv[1:]
       
   879 
       
   880     opts, args = getopt.getopt(args, 'st')
       
   881     klass = TestXMLParser
       
   882     do_time = 0
       
   883     for o, a in opts:
       
   884         if o == '-s':
       
   885             klass = XMLParser
       
   886         elif o == '-t':
       
   887             do_time = 1
       
   888 
       
   889     if args:
       
   890         file = args[0]
       
   891     else:
       
   892         file = 'test.xml'
       
   893 
       
   894     if file == '-':
       
   895         f = sys.stdin
       
   896     else:
       
   897         try:
       
   898             f = open(file, 'r')
       
   899         except IOError, msg:
       
   900             print file, ":", msg
       
   901             sys.exit(1)
       
   902 
       
   903     data = f.read()
       
   904     if f is not sys.stdin:
       
   905         f.close()
       
   906 
       
   907     x = klass()
       
   908     t0 = time()
       
   909     try:
       
   910         if do_time:
       
   911             x.feed(data)
       
   912             x.close()
       
   913         else:
       
   914             for c in data:
       
   915                 x.feed(c)
       
   916             x.close()
       
   917     except Error, msg:
       
   918         t1 = time()
       
   919         print msg
       
   920         if do_time:
       
   921             print 'total time: %g' % (t1-t0)
       
   922         sys.exit(1)
       
   923     t1 = time()
       
   924     if do_time:
       
   925         print 'total time: %g' % (t1-t0)
       
   926 
       
   927 
       
# When run as a script, run the command-line test driver; test() takes its
# arguments from sys.argv when called without any.
if __name__ == '__main__':
    test()