symbian-qemu-0.9.1-12/python-win32-2.6.1/lib/sgmllib.py
changeset 1 2fb8b9db1c86
equal deleted inserted replaced
0:ffa851df0825 1:2fb8b9db1c86
       
     1 """A parser for SGML, using the derived class as a static DTD."""
       
     2 
       
     3 # XXX This only supports those SGML features used by HTML.
       
     4 
       
     5 # XXX There should be a way to distinguish between PCDATA (parsed
       
     6 # character data -- the normal case), RCDATA (replaceable character
       
     7 # data -- only char and entity references and end tags are special)
       
     8 # and CDATA (character data -- only end tags are special).  RCDATA is
       
     9 # not supported at all.
       
    10 
       
    11 
       
    12 from warnings import warnpy3k
       
    13 warnpy3k("the sgmllib module has been removed in Python 3.0",
       
    14          stacklevel=2)
       
    15 del warnpy3k
       
    16 
       
    17 import markupbase
       
    18 import re
       
    19 
       
    20 __all__ = ["SGMLParser", "SGMLParseError"]
       
    21 
       
    22 # Regular expressions used for parsing
       
    23 
       
    24 interesting = re.compile('[&<]')
       
    25 incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
       
    26                            '<([a-zA-Z][^<>]*|'
       
    27                               '/([a-zA-Z][^<>]*)?|'
       
    28                               '![^<>]*)?')
       
    29 
       
    30 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
       
    31 charref = re.compile('&#([0-9]+)[^0-9]')
       
    32 
       
    33 starttagopen = re.compile('<[>a-zA-Z]')
       
    34 shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
       
    35 shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
       
    36 piclose = re.compile('>')
       
    37 endbracket = re.compile('[<>]')
       
    38 tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
       
    39 attrfind = re.compile(
       
    40     r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
       
    41     r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
       
    42 
       
    43 
       
    44 class SGMLParseError(RuntimeError):
       
    45     """Exception raised for all parse errors."""
       
    46     pass
       
    47 
       
    48 
       
    49 # SGML parser base class -- find tags and call handler functions.
       
    50 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
       
    51 # The dtd is defined by deriving a class which defines methods
       
    52 # with special names to handle tags: start_foo and end_foo to handle
       
    53 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
       
    54 # (Tags are converted to lower case for this purpose.)  The data
       
    55 # between tags is passed to the parser by calling self.handle_data()
       
    56 # with some data as argument (the data may be split up in arbitrary
       
    57 # chunks).  Entity references are passed by calling
       
    58 # self.handle_entityref() with the entity reference as argument.
       
    59 
       
    60 class SGMLParser(markupbase.ParserBase):
       
    61     # Definition of entities -- derived classes may override
       
    62     entity_or_charref = re.compile('&(?:'
       
    63       '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
       
    64       ')(;?)')
       
    65 
       
    66     def __init__(self, verbose=0):
       
    67         """Initialize and reset this instance."""
       
    68         self.verbose = verbose
       
    69         self.reset()
       
    70 
       
    71     def reset(self):
       
    72         """Reset this instance. Loses all unprocessed data."""
       
    73         self.__starttag_text = None
       
    74         self.rawdata = ''
       
    75         self.stack = []
       
    76         self.lasttag = '???'
       
    77         self.nomoretags = 0
       
    78         self.literal = 0
       
    79         markupbase.ParserBase.reset(self)
       
    80 
       
    81     def setnomoretags(self):
       
    82         """Enter literal mode (CDATA) till EOF.
       
    83 
       
    84         Intended for derived classes only.
       
    85         """
       
    86         self.nomoretags = self.literal = 1
       
    87 
       
    88     def setliteral(self, *args):
       
    89         """Enter literal mode (CDATA).
       
    90 
       
    91         Intended for derived classes only.
       
    92         """
       
    93         self.literal = 1
       
    94 
       
    95     def feed(self, data):
       
    96         """Feed some data to the parser.
       
    97 
       
    98         Call this as often as you want, with as little or as much text
       
    99         as you want (may include '\n').  (This just saves the text,
       
   100         all the processing is done by goahead().)
       
   101         """
       
   102 
       
   103         self.rawdata = self.rawdata + data
       
   104         self.goahead(0)
       
   105 
       
   106     def close(self):
       
   107         """Handle the remaining data."""
       
   108         self.goahead(1)
       
   109 
       
   110     def error(self, message):
       
   111         raise SGMLParseError(message)
       
   112 
       
   113     # Internal -- handle data as far as reasonable.  May leave state
       
   114     # and data to be processed by a subsequent call.  If 'end' is
       
   115     # true, force handling all data as if followed by EOF marker.
       
   116     def goahead(self, end):
       
   117         rawdata = self.rawdata
       
   118         i = 0
       
   119         n = len(rawdata)
       
   120         while i < n:
       
   121             if self.nomoretags:
       
   122                 self.handle_data(rawdata[i:n])
       
   123                 i = n
       
   124                 break
       
   125             match = interesting.search(rawdata, i)
       
   126             if match: j = match.start()
       
   127             else: j = n
       
   128             if i < j:
       
   129                 self.handle_data(rawdata[i:j])
       
   130             i = j
       
   131             if i == n: break
       
   132             if rawdata[i] == '<':
       
   133                 if starttagopen.match(rawdata, i):
       
   134                     if self.literal:
       
   135                         self.handle_data(rawdata[i])
       
   136                         i = i+1
       
   137                         continue
       
   138                     k = self.parse_starttag(i)
       
   139                     if k < 0: break
       
   140                     i = k
       
   141                     continue
       
   142                 if rawdata.startswith("</", i):
       
   143                     k = self.parse_endtag(i)
       
   144                     if k < 0: break
       
   145                     i = k
       
   146                     self.literal = 0
       
   147                     continue
       
   148                 if self.literal:
       
   149                     if n > (i + 1):
       
   150                         self.handle_data("<")
       
   151                         i = i+1
       
   152                     else:
       
   153                         # incomplete
       
   154                         break
       
   155                     continue
       
   156                 if rawdata.startswith("<!--", i):
       
   157                         # Strictly speaking, a comment is --.*--
       
   158                         # within a declaration tag <!...>.
       
   159                         # This should be removed,
       
   160                         # and comments handled only in parse_declaration.
       
   161                     k = self.parse_comment(i)
       
   162                     if k < 0: break
       
   163                     i = k
       
   164                     continue
       
   165                 if rawdata.startswith("<?", i):
       
   166                     k = self.parse_pi(i)
       
   167                     if k < 0: break
       
   168                     i = i+k
       
   169                     continue
       
   170                 if rawdata.startswith("<!", i):
       
   171                     # This is some sort of declaration; in "HTML as
       
   172                     # deployed," this should only be the document type
       
   173                     # declaration ("<!DOCTYPE html...>").
       
   174                     k = self.parse_declaration(i)
       
   175                     if k < 0: break
       
   176                     i = k
       
   177                     continue
       
   178             elif rawdata[i] == '&':
       
   179                 if self.literal:
       
   180                     self.handle_data(rawdata[i])
       
   181                     i = i+1
       
   182                     continue
       
   183                 match = charref.match(rawdata, i)
       
   184                 if match:
       
   185                     name = match.group(1)
       
   186                     self.handle_charref(name)
       
   187                     i = match.end(0)
       
   188                     if rawdata[i-1] != ';': i = i-1
       
   189                     continue
       
   190                 match = entityref.match(rawdata, i)
       
   191                 if match:
       
   192                     name = match.group(1)
       
   193                     self.handle_entityref(name)
       
   194                     i = match.end(0)
       
   195                     if rawdata[i-1] != ';': i = i-1
       
   196                     continue
       
   197             else:
       
   198                 self.error('neither < nor & ??')
       
   199             # We get here only if incomplete matches but
       
   200             # nothing else
       
   201             match = incomplete.match(rawdata, i)
       
   202             if not match:
       
   203                 self.handle_data(rawdata[i])
       
   204                 i = i+1
       
   205                 continue
       
   206             j = match.end(0)
       
   207             if j == n:
       
   208                 break # Really incomplete
       
   209             self.handle_data(rawdata[i:j])
       
   210             i = j
       
   211         # end while
       
   212         if end and i < n:
       
   213             self.handle_data(rawdata[i:n])
       
   214             i = n
       
   215         self.rawdata = rawdata[i:]
       
   216         # XXX if end: check for empty stack
       
   217 
       
   218     # Extensions for the DOCTYPE scanner:
       
   219     _decl_otherchars = '='
       
   220 
       
   221     # Internal -- parse processing instr, return length or -1 if not terminated
       
   222     def parse_pi(self, i):
       
   223         rawdata = self.rawdata
       
   224         if rawdata[i:i+2] != '<?':
       
   225             self.error('unexpected call to parse_pi()')
       
   226         match = piclose.search(rawdata, i+2)
       
   227         if not match:
       
   228             return -1
       
   229         j = match.start(0)
       
   230         self.handle_pi(rawdata[i+2: j])
       
   231         j = match.end(0)
       
   232         return j-i
       
   233 
       
   234     def get_starttag_text(self):
       
   235         return self.__starttag_text
       
   236 
       
   237     # Internal -- handle starttag, return length or -1 if not terminated
       
   238     def parse_starttag(self, i):
       
   239         self.__starttag_text = None
       
   240         start_pos = i
       
   241         rawdata = self.rawdata
       
   242         if shorttagopen.match(rawdata, i):
       
   243             # SGML shorthand: <tag/data/ == <tag>data</tag>
       
   244             # XXX Can data contain &... (entity or char refs)?
       
   245             # XXX Can data contain < or > (tag characters)?
       
   246             # XXX Can there be whitespace before the first /?
       
   247             match = shorttag.match(rawdata, i)
       
   248             if not match:
       
   249                 return -1
       
   250             tag, data = match.group(1, 2)
       
   251             self.__starttag_text = '<%s/' % tag
       
   252             tag = tag.lower()
       
   253             k = match.end(0)
       
   254             self.finish_shorttag(tag, data)
       
   255             self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
       
   256             return k
       
   257         # XXX The following should skip matching quotes (' or ")
       
   258         # As a shortcut way to exit, this isn't so bad, but shouldn't
       
   259         # be used to locate the actual end of the start tag since the
       
   260         # < or > characters may be embedded in an attribute value.
       
   261         match = endbracket.search(rawdata, i+1)
       
   262         if not match:
       
   263             return -1
       
   264         j = match.start(0)
       
   265         # Now parse the data between i+1 and j into a tag and attrs
       
   266         attrs = []
       
   267         if rawdata[i:i+2] == '<>':
       
   268             # SGML shorthand: <> == <last open tag seen>
       
   269             k = j
       
   270             tag = self.lasttag
       
   271         else:
       
   272             match = tagfind.match(rawdata, i+1)
       
   273             if not match:
       
   274                 self.error('unexpected call to parse_starttag')
       
   275             k = match.end(0)
       
   276             tag = rawdata[i+1:k].lower()
       
   277             self.lasttag = tag
       
   278         while k < j:
       
   279             match = attrfind.match(rawdata, k)
       
   280             if not match: break
       
   281             attrname, rest, attrvalue = match.group(1, 2, 3)
       
   282             if not rest:
       
   283                 attrvalue = attrname
       
   284             else:
       
   285                 if (attrvalue[:1] == "'" == attrvalue[-1:] or
       
   286                     attrvalue[:1] == '"' == attrvalue[-1:]):
       
   287                     # strip quotes
       
   288                     attrvalue = attrvalue[1:-1]
       
   289                 attrvalue = self.entity_or_charref.sub(
       
   290                     self._convert_ref, attrvalue)
       
   291             attrs.append((attrname.lower(), attrvalue))
       
   292             k = match.end(0)
       
   293         if rawdata[j] == '>':
       
   294             j = j+1
       
   295         self.__starttag_text = rawdata[start_pos:j]
       
   296         self.finish_starttag(tag, attrs)
       
   297         return j
       
   298 
       
   299     # Internal -- convert entity or character reference
       
   300     def _convert_ref(self, match):
       
   301         if match.group(2):
       
   302             return self.convert_charref(match.group(2)) or \
       
   303                 '&#%s%s' % match.groups()[1:]
       
   304         elif match.group(3):
       
   305             return self.convert_entityref(match.group(1)) or \
       
   306                 '&%s;' % match.group(1)
       
   307         else:
       
   308             return '&%s' % match.group(1)
       
   309 
       
   310     # Internal -- parse endtag
       
   311     def parse_endtag(self, i):
       
   312         rawdata = self.rawdata
       
   313         match = endbracket.search(rawdata, i+1)
       
   314         if not match:
       
   315             return -1
       
   316         j = match.start(0)
       
   317         tag = rawdata[i+2:j].strip().lower()
       
   318         if rawdata[j] == '>':
       
   319             j = j+1
       
   320         self.finish_endtag(tag)
       
   321         return j
       
   322 
       
   323     # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
       
   324     def finish_shorttag(self, tag, data):
       
   325         self.finish_starttag(tag, [])
       
   326         self.handle_data(data)
       
   327         self.finish_endtag(tag)
       
   328 
       
   329     # Internal -- finish processing of start tag
       
   330     # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
       
   331     def finish_starttag(self, tag, attrs):
       
   332         try:
       
   333             method = getattr(self, 'start_' + tag)
       
   334         except AttributeError:
       
   335             try:
       
   336                 method = getattr(self, 'do_' + tag)
       
   337             except AttributeError:
       
   338                 self.unknown_starttag(tag, attrs)
       
   339                 return -1
       
   340             else:
       
   341                 self.handle_starttag(tag, method, attrs)
       
   342                 return 0
       
   343         else:
       
   344             self.stack.append(tag)
       
   345             self.handle_starttag(tag, method, attrs)
       
   346             return 1
       
   347 
       
   348     # Internal -- finish processing of end tag
       
   349     def finish_endtag(self, tag):
       
   350         if not tag:
       
   351             found = len(self.stack) - 1
       
   352             if found < 0:
       
   353                 self.unknown_endtag(tag)
       
   354                 return
       
   355         else:
       
   356             if tag not in self.stack:
       
   357                 try:
       
   358                     method = getattr(self, 'end_' + tag)
       
   359                 except AttributeError:
       
   360                     self.unknown_endtag(tag)
       
   361                 else:
       
   362                     self.report_unbalanced(tag)
       
   363                 return
       
   364             found = len(self.stack)
       
   365             for i in range(found):
       
   366                 if self.stack[i] == tag: found = i
       
   367         while len(self.stack) > found:
       
   368             tag = self.stack[-1]
       
   369             try:
       
   370                 method = getattr(self, 'end_' + tag)
       
   371             except AttributeError:
       
   372                 method = None
       
   373             if method:
       
   374                 self.handle_endtag(tag, method)
       
   375             else:
       
   376                 self.unknown_endtag(tag)
       
   377             del self.stack[-1]
       
   378 
       
   379     # Overridable -- handle start tag
       
   380     def handle_starttag(self, tag, method, attrs):
       
   381         method(attrs)
       
   382 
       
   383     # Overridable -- handle end tag
       
   384     def handle_endtag(self, tag, method):
       
   385         method()
       
   386 
       
   387     # Example -- report an unbalanced </...> tag.
       
   388     def report_unbalanced(self, tag):
       
   389         if self.verbose:
       
   390             print '*** Unbalanced </' + tag + '>'
       
   391             print '*** Stack:', self.stack
       
   392 
       
   393     def convert_charref(self, name):
       
   394         """Convert character reference, may be overridden."""
       
   395         try:
       
   396             n = int(name)
       
   397         except ValueError:
       
   398             return
       
   399         if not 0 <= n <= 255:
       
   400             return
       
   401         return self.convert_codepoint(n)
       
   402 
       
   403     def convert_codepoint(self, codepoint):
       
   404         return chr(codepoint)
       
   405 
       
   406     def handle_charref(self, name):
       
   407         """Handle character reference, no need to override."""
       
   408         replacement = self.convert_charref(name)
       
   409         if replacement is None:
       
   410             self.unknown_charref(name)
       
   411         else:
       
   412             self.handle_data(replacement)
       
   413 
       
   414     # Definition of entities -- derived classes may override
       
   415     entitydefs = \
       
   416             {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
       
   417 
       
   418     def convert_entityref(self, name):
       
   419         """Convert entity references.
       
   420 
       
   421         As an alternative to overriding this method; one can tailor the
       
   422         results by setting up the self.entitydefs mapping appropriately.
       
   423         """
       
   424         table = self.entitydefs
       
   425         if name in table:
       
   426             return table[name]
       
   427         else:
       
   428             return
       
   429 
       
   430     def handle_entityref(self, name):
       
   431         """Handle entity references, no need to override."""
       
   432         replacement = self.convert_entityref(name)
       
   433         if replacement is None:
       
   434             self.unknown_entityref(name)
       
   435         else:
       
   436             self.handle_data(replacement)
       
   437 
       
   438     # Example -- handle data, should be overridden
       
   439     def handle_data(self, data):
       
   440         pass
       
   441 
       
   442     # Example -- handle comment, could be overridden
       
   443     def handle_comment(self, data):
       
   444         pass
       
   445 
       
   446     # Example -- handle declaration, could be overridden
       
   447     def handle_decl(self, decl):
       
   448         pass
       
   449 
       
   450     # Example -- handle processing instruction, could be overridden
       
   451     def handle_pi(self, data):
       
   452         pass
       
   453 
       
   454     # To be overridden -- handlers for unknown objects
       
   455     def unknown_starttag(self, tag, attrs): pass
       
   456     def unknown_endtag(self, tag): pass
       
   457     def unknown_charref(self, ref): pass
       
   458     def unknown_entityref(self, ref): pass
       
   459 
       
   460 
       
   461 class TestSGMLParser(SGMLParser):
       
   462 
       
   463     def __init__(self, verbose=0):
       
   464         self.testdata = ""
       
   465         SGMLParser.__init__(self, verbose)
       
   466 
       
   467     def handle_data(self, data):
       
   468         self.testdata = self.testdata + data
       
   469         if len(repr(self.testdata)) >= 70:
       
   470             self.flush()
       
   471 
       
   472     def flush(self):
       
   473         data = self.testdata
       
   474         if data:
       
   475             self.testdata = ""
       
   476             print 'data:', repr(data)
       
   477 
       
   478     def handle_comment(self, data):
       
   479         self.flush()
       
   480         r = repr(data)
       
   481         if len(r) > 68:
       
   482             r = r[:32] + '...' + r[-32:]
       
   483         print 'comment:', r
       
   484 
       
   485     def unknown_starttag(self, tag, attrs):
       
   486         self.flush()
       
   487         if not attrs:
       
   488             print 'start tag: <' + tag + '>'
       
   489         else:
       
   490             print 'start tag: <' + tag,
       
   491             for name, value in attrs:
       
   492                 print name + '=' + '"' + value + '"',
       
   493             print '>'
       
   494 
       
   495     def unknown_endtag(self, tag):
       
   496         self.flush()
       
   497         print 'end tag: </' + tag + '>'
       
   498 
       
   499     def unknown_entityref(self, ref):
       
   500         self.flush()
       
   501         print '*** unknown entity ref: &' + ref + ';'
       
   502 
       
   503     def unknown_charref(self, ref):
       
   504         self.flush()
       
   505         print '*** unknown char ref: &#' + ref + ';'
       
   506 
       
   507     def unknown_decl(self, data):
       
   508         self.flush()
       
   509         print '*** unknown decl: [' + data + ']'
       
   510 
       
   511     def close(self):
       
   512         SGMLParser.close(self)
       
   513         self.flush()
       
   514 
       
   515 
       
   516 def test(args = None):
       
   517     import sys
       
   518 
       
   519     if args is None:
       
   520         args = sys.argv[1:]
       
   521 
       
   522     if args and args[0] == '-s':
       
   523         args = args[1:]
       
   524         klass = SGMLParser
       
   525     else:
       
   526         klass = TestSGMLParser
       
   527 
       
   528     if args:
       
   529         file = args[0]
       
   530     else:
       
   531         file = 'test.html'
       
   532 
       
   533     if file == '-':
       
   534         f = sys.stdin
       
   535     else:
       
   536         try:
       
   537             f = open(file, 'r')
       
   538         except IOError, msg:
       
   539             print file, ":", msg
       
   540             sys.exit(1)
       
   541 
       
   542     data = f.read()
       
   543     if f is not sys.stdin:
       
   544         f.close()
       
   545 
       
   546     x = klass()
       
   547     for c in data:
       
   548         x.feed(c)
       
   549     x.close()
       
   550 
       
   551 
       
   552 if __name__ == '__main__':
       
   553     test()