python-2.5.2/win32/Lib/sgmllib.py
changeset 0 ae805ac0140d
equal deleted inserted replaced
-1:000000000000 0:ae805ac0140d
       
     1 """A parser for SGML, using the derived class as a static DTD."""
       
     2 
       
     3 # XXX This only supports those SGML features used by HTML.
       
     4 
       
     5 # XXX There should be a way to distinguish between PCDATA (parsed
       
     6 # character data -- the normal case), RCDATA (replaceable character
       
     7 # data -- only char and entity references and end tags are special)
       
     8 # and CDATA (character data -- only end tags are special).  RCDATA is
       
     9 # not supported at all.
       
    10 
       
    11 
       
    12 import markupbase
       
    13 import re
       
    14 
       
    15 __all__ = ["SGMLParser", "SGMLParseError"]
       
    16 
       
    17 # Regular expressions used for parsing
       
    18 
       
# Matches the next character that can start markup: '&' or '<'.
interesting = re.compile('[&<]')
# Matches a possibly unfinished construct at the current position: the
# beginning of an entity/character reference, or the opening of a start
# tag, end tag or declaration.  goahead() uses it to tell "incomplete,
# wait for more input" apart from text that is simply data.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                           '<([a-zA-Z][^<>]*|'
                              '/([a-zA-Z][^<>]*)?|'
                              '![^<>]*)?')

# '&name' plus the single character that terminates it; group 1 is the name.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# '&#digits' plus the single non-digit character that terminates it;
# group 1 is the decimal number.
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by a letter (ordinary start tag) or by '>' (the '<>' shorthand).
starttagopen = re.compile('<[>a-zA-Z]')
# Beginning of the '<tag/data/' shorthand: '<tag/'.
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# Complete '<tag/data/' shorthand; group 1 is the tag, group 2 the data.
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# Terminator of a processing instruction.
piclose = re.compile('>')
# The first '<' or '>' after a tag opening; used to find where a tag stops.
endbracket = re.compile('[<>]')
# A tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: a name, optionally followed by '=' and a single-quoted,
# double-quoted or unquoted value.  Group 1 is the name, group 2 the whole
# '= value' part (None when absent), group 3 the value with its quotes.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
       
    37 
       
    38 
       
    39 class SGMLParseError(RuntimeError):
       
    40     """Exception raised for all parse errors."""
       
    41     pass
       
    42 
       
    43 
       
    44 # SGML parser base class -- find tags and call handler functions.
       
    45 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
       
    46 # The dtd is defined by deriving a class which defines methods
       
    47 # with special names to handle tags: start_foo and end_foo to handle
       
    48 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
       
    49 # (Tags are converted to lower case for this purpose.)  The data
       
    50 # between tags is passed to the parser by calling self.handle_data()
       
    51 # with some data as argument (the data may be split up in arbitrary
       
    52 # chunks).  Entity references are passed by calling
       
    53 # self.handle_entityref() with the entity reference as argument.
       
    54 
       
    55 class SGMLParser(markupbase.ParserBase):
       
    # Pattern for an entity or character reference with an optional
    # trailing ';' -- derived classes may override.  parse_starttag()
    # applies it to attribute values and _convert_ref() reads its groups:
    # group 1 is the entity name, group 2 the decimal character number,
    # group 3 the ';' (empty string when missing).
    entity_or_charref = re.compile('&(?:'
      '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
      ')(;?)')
       
    60 
       
    61     def __init__(self, verbose=0):
       
    62         """Initialize and reset this instance."""
       
    63         self.verbose = verbose
       
    64         self.reset()
       
    65 
       
    def reset(self):
        """Return the parser to its initial state.

        Any input that was fed but not yet processed is discarded.
        """
        # Input buffer and tag bookkeeping.
        self.rawdata = ''
        self.__starttag_text = None
        self.lasttag = '???'
        self.stack = []
        # Mode flags: both off means ordinary markup parsing.
        self.nomoretags = 0
        self.literal = 0
        markupbase.ParserBase.reset(self)
       
    75 
       
    76     def setnomoretags(self):
       
    77         """Enter literal mode (CDATA) till EOF.
       
    78 
       
    79         Intended for derived classes only.
       
    80         """
       
    81         self.nomoretags = self.literal = 1
       
    82 
       
    83     def setliteral(self, *args):
       
    84         """Enter literal mode (CDATA).
       
    85 
       
    86         Intended for derived classes only.
       
    87         """
       
    88         self.literal = 1
       
    89 
       
    90     def feed(self, data):
       
    91         """Feed some data to the parser.
       
    92 
       
    93         Call this as often as you want, with as little or as much text
       
    94         as you want (may include '\n').  (This just saves the text,
       
    95         all the processing is done by goahead().)
       
    96         """
       
    97 
       
    98         self.rawdata = self.rawdata + data
       
    99         self.goahead(0)
       
   100 
       
   101     def close(self):
       
   102         """Handle the remaining data."""
       
   103         self.goahead(1)
       
   104 
       
   105     def error(self, message):
       
   106         raise SGMLParseError(message)
       
   107 
       
    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    #
    # 'i' is the scan position in the buffer: everything before it has
    # already been passed to a handler.  On exit the unconsumed tail
    # rawdata[i:] is stored back in self.rawdata for the next call.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # nomoretags is set (see setnomoretags()): the rest of
                # the buffer is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Everything up to the next '&' or '<' is plain data.
            match = interesting.search(rawdata, i)
            if match: j = match.start()
            else: j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n: break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # In literal mode a start tag is not markup;
                        # pass its '<' through as data.
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    # A negative result means the tag is not complete
                    # yet; stop and wait for more input.
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    # End tags are parsed even in literal mode, and
                    # parsing one switches literal mode off.
                    k = self.parse_endtag(i)
                    if k < 0: break
                    i = k
                    self.literal = 0
                    continue
                if self.literal:
                    # Some other '<' in literal mode: it is data, but only
                    # once at least one following character is available.
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    # NOTE(review): parse_comment is not defined in this
                    # file; presumably inherited from markupbase.ParserBase.
                    k = self.parse_comment(i)
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    k = self.parse_pi(i)
                    if k < 0: break
                    # parse_pi() returns a length, not a position.
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    # NOTE(review): parse_declaration is not defined in
                    # this file; presumably inherited from
                    # markupbase.ParserBase.
                    k = self.parse_declaration(i)
                    if k < 0: break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    # References are not expanded in literal mode.
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern also consumed the terminating character;
                    # unless it was ';' step back so it is scanned again.
                    if rawdata[i-1] != ';': i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    # Same terminator handling as for character references.
                    if rawdata[i-1] != ';': i = i-1
                    continue
            else:
                # Not expected: 'interesting' only stops at '<' or '&'.
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                # Not even the beginning of a construct: emit the single
                # character as data and move on.
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            # The partial construct is followed by other text, so it can
            # never be completed: treat it as data.
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            # At EOF whatever is left is flushed as data.
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack
       
   212 
       
    # Extensions for the DOCTYPE scanner:
    # NOTE(review): not referenced anywhere in this file; presumably read
    # by the declaration parser in markupbase.ParserBase -- confirm there.
    _decl_otherchars = '='
       
   215 
       
   216     # Internal -- parse processing instr, return length or -1 if not terminated
       
   217     def parse_pi(self, i):
       
   218         rawdata = self.rawdata
       
   219         if rawdata[i:i+2] != '<?':
       
   220             self.error('unexpected call to parse_pi()')
       
   221         match = piclose.search(rawdata, i+2)
       
   222         if not match:
       
   223             return -1
       
   224         j = match.start(0)
       
   225         self.handle_pi(rawdata[i+2: j])
       
   226         j = match.end(0)
       
   227         return j-i
       
   228 
       
   229     def get_starttag_text(self):
       
   230         return self.__starttag_text
       
   231 
       
    # Internal -- handle starttag beginning at index i.  Returns the index
    # in self.rawdata just past the tag (goahead() continues scanning
    # there), or -1 if the tag is not terminated yet.
    def parse_starttag(self, i):
        # Text of the tag being parsed, exposed through get_starttag_text().
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                # Closing '/' not seen yet.
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            # Assigned again after the handlers have run; this slice is
            # the same '<tag/' text that was set above.
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        # As a shortcut way to exit, this isn't so bad, but shouldn't
        # be used to locate the actual end of the start tag since the
        # < or > characters may be embedded in an attribute value.
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            # k == j, so the attribute loop below does not run.
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        # Collect (name, value) pairs until the end of the tag or until
        # the text no longer looks like an attribute.
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match: break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Attribute without '= value': the name doubles as value.
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                    attrvalue[:1] == '"' == attrvalue[-1:]):
                    # strip quotes
                    attrvalue = attrvalue[1:-1]
                # Replace entity and character references in the value.
                attrvalue = self.entity_or_charref.sub(
                    self._convert_ref, attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # Consume the closing '>'.  If the tag was cut short by a '<'
        # instead, leave that character to be scanned as the next tag.
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j
       
   293 
       
   294     # Internal -- convert entity or character reference
       
   295     def _convert_ref(self, match):
       
   296         if match.group(2):
       
   297             return self.convert_charref(match.group(2)) or \
       
   298                 '&#%s%s' % match.groups()[1:]
       
   299         elif match.group(3):
       
   300             return self.convert_entityref(match.group(1)) or \
       
   301                 '&%s;' % match.group(1)
       
   302         else:
       
   303             return '&%s' % match.group(1)
       
   304 
       
   305     # Internal -- parse endtag
       
   306     def parse_endtag(self, i):
       
   307         rawdata = self.rawdata
       
   308         match = endbracket.search(rawdata, i+1)
       
   309         if not match:
       
   310             return -1
       
   311         j = match.start(0)
       
   312         tag = rawdata[i+2:j].strip().lower()
       
   313         if rawdata[j] == '>':
       
   314             j = j+1
       
   315         self.finish_endtag(tag)
       
   316         return j
       
   317 
       
   318     # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
       
   319     def finish_shorttag(self, tag, data):
       
   320         self.finish_starttag(tag, [])
       
   321         self.handle_data(data)
       
   322         self.finish_endtag(tag)
       
   323 
       
   324     # Internal -- finish processing of start tag
       
   325     # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
       
   326     def finish_starttag(self, tag, attrs):
       
   327         try:
       
   328             method = getattr(self, 'start_' + tag)
       
   329         except AttributeError:
       
   330             try:
       
   331                 method = getattr(self, 'do_' + tag)
       
   332             except AttributeError:
       
   333                 self.unknown_starttag(tag, attrs)
       
   334                 return -1
       
   335             else:
       
   336                 self.handle_starttag(tag, method, attrs)
       
   337                 return 0
       
   338         else:
       
   339             self.stack.append(tag)
       
   340             self.handle_starttag(tag, method, attrs)
       
   341             return 1
       
   342 
       
   343     # Internal -- finish processing of end tag
       
   344     def finish_endtag(self, tag):
       
   345         if not tag:
       
   346             found = len(self.stack) - 1
       
   347             if found < 0:
       
   348                 self.unknown_endtag(tag)
       
   349                 return
       
   350         else:
       
   351             if tag not in self.stack:
       
   352                 try:
       
   353                     method = getattr(self, 'end_' + tag)
       
   354                 except AttributeError:
       
   355                     self.unknown_endtag(tag)
       
   356                 else:
       
   357                     self.report_unbalanced(tag)
       
   358                 return
       
   359             found = len(self.stack)
       
   360             for i in range(found):
       
   361                 if self.stack[i] == tag: found = i
       
   362         while len(self.stack) > found:
       
   363             tag = self.stack[-1]
       
   364             try:
       
   365                 method = getattr(self, 'end_' + tag)
       
   366             except AttributeError:
       
   367                 method = None
       
   368             if method:
       
   369                 self.handle_endtag(tag, method)
       
   370             else:
       
   371                 self.unknown_endtag(tag)
       
   372             del self.stack[-1]
       
   373 
       
   374     # Overridable -- handle start tag
       
   375     def handle_starttag(self, tag, method, attrs):
       
   376         method(attrs)
       
   377 
       
   378     # Overridable -- handle end tag
       
   379     def handle_endtag(self, tag, method):
       
   380         method()
       
   381 
       
   382     # Example -- report an unbalanced </...> tag.
       
   383     def report_unbalanced(self, tag):
       
   384         if self.verbose:
       
   385             print '*** Unbalanced </' + tag + '>'
       
   386             print '*** Stack:', self.stack
       
   387 
       
   388     def convert_charref(self, name):
       
   389         """Convert character reference, may be overridden."""
       
   390         try:
       
   391             n = int(name)
       
   392         except ValueError:
       
   393             return
       
   394         if not 0 <= n <= 255:
       
   395             return
       
   396         return self.convert_codepoint(n)
       
   397 
       
   398     def convert_codepoint(self, codepoint):
       
   399         return chr(codepoint)
       
   400 
       
   401     def handle_charref(self, name):
       
   402         """Handle character reference, no need to override."""
       
   403         replacement = self.convert_charref(name)
       
   404         if replacement is None:
       
   405             self.unknown_charref(name)
       
   406         else:
       
   407             self.handle_data(replacement)
       
   408 
       
    # Definition of entities -- derived classes may override.
    # Maps an entity name (without the '&' and ';') to its replacement
    # text; convert_entityref() looks names up in this table.
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
       
   412 
       
   413     def convert_entityref(self, name):
       
   414         """Convert entity references.
       
   415 
       
   416         As an alternative to overriding this method; one can tailor the
       
   417         results by setting up the self.entitydefs mapping appropriately.
       
   418         """
       
   419         table = self.entitydefs
       
   420         if name in table:
       
   421             return table[name]
       
   422         else:
       
   423             return
       
   424 
       
   425     def handle_entityref(self, name):
       
   426         """Handle entity references, no need to override."""
       
   427         replacement = self.convert_entityref(name)
       
   428         if replacement is None:
       
   429             self.unknown_entityref(name)
       
   430         else:
       
   431             self.handle_data(self.convert_entityref(name))
       
   432 
       
   433     # Example -- handle data, should be overridden
       
   434     def handle_data(self, data):
       
   435         pass
       
   436 
       
   437     # Example -- handle comment, could be overridden
       
   438     def handle_comment(self, data):
       
   439         pass
       
   440 
       
   441     # Example -- handle declaration, could be overridden
       
   442     def handle_decl(self, decl):
       
   443         pass
       
   444 
       
   445     # Example -- handle processing instruction, could be overridden
       
   446     def handle_pi(self, data):
       
   447         pass
       
   448 
       
   449     # To be overridden -- handlers for unknown objects
       
   450     def unknown_starttag(self, tag, attrs): pass
       
   451     def unknown_endtag(self, tag): pass
       
   452     def unknown_charref(self, ref): pass
       
   453     def unknown_entityref(self, ref): pass
       
   454 
       
   455 
       
   456 class TestSGMLParser(SGMLParser):
       
   457 
       
   458     def __init__(self, verbose=0):
       
   459         self.testdata = ""
       
   460         SGMLParser.__init__(self, verbose)
       
   461 
       
   462     def handle_data(self, data):
       
   463         self.testdata = self.testdata + data
       
   464         if len(repr(self.testdata)) >= 70:
       
   465             self.flush()
       
   466 
       
   467     def flush(self):
       
   468         data = self.testdata
       
   469         if data:
       
   470             self.testdata = ""
       
   471             print 'data:', repr(data)
       
   472 
       
   473     def handle_comment(self, data):
       
   474         self.flush()
       
   475         r = repr(data)
       
   476         if len(r) > 68:
       
   477             r = r[:32] + '...' + r[-32:]
       
   478         print 'comment:', r
       
   479 
       
   480     def unknown_starttag(self, tag, attrs):
       
   481         self.flush()
       
   482         if not attrs:
       
   483             print 'start tag: <' + tag + '>'
       
   484         else:
       
   485             print 'start tag: <' + tag,
       
   486             for name, value in attrs:
       
   487                 print name + '=' + '"' + value + '"',
       
   488             print '>'
       
   489 
       
   490     def unknown_endtag(self, tag):
       
   491         self.flush()
       
   492         print 'end tag: </' + tag + '>'
       
   493 
       
   494     def unknown_entityref(self, ref):
       
   495         self.flush()
       
   496         print '*** unknown entity ref: &' + ref + ';'
       
   497 
       
   498     def unknown_charref(self, ref):
       
   499         self.flush()
       
   500         print '*** unknown char ref: &#' + ref + ';'
       
   501 
       
   502     def unknown_decl(self, data):
       
   503         self.flush()
       
   504         print '*** unknown decl: [' + data + ']'
       
   505 
       
   506     def close(self):
       
   507         SGMLParser.close(self)
       
   508         self.flush()
       
   509 
       
   510 
       
   511 def test(args = None):
       
   512     import sys
       
   513 
       
   514     if args is None:
       
   515         args = sys.argv[1:]
       
   516 
       
   517     if args and args[0] == '-s':
       
   518         args = args[1:]
       
   519         klass = SGMLParser
       
   520     else:
       
   521         klass = TestSGMLParser
       
   522 
       
   523     if args:
       
   524         file = args[0]
       
   525     else:
       
   526         file = 'test.html'
       
   527 
       
   528     if file == '-':
       
   529         f = sys.stdin
       
   530     else:
       
   531         try:
       
   532             f = open(file, 'r')
       
   533         except IOError, msg:
       
   534             print file, ":", msg
       
   535             sys.exit(1)
       
   536 
       
   537     data = f.read()
       
   538     if f is not sys.stdin:
       
   539         f.close()
       
   540 
       
   541     x = klass()
       
   542     for c in data:
       
   543         x.feed(c)
       
   544     x.close()
       
   545 
       
   546 
       
# When run as a script, parse the file named on the command line (see test()).
if __name__ == '__main__':
    test()