symbian-qemu-0.9.1-12/python-win32-2.6.1/lib/HTMLParser.py
changeset 1 2fb8b9db1c86
equal deleted inserted replaced
0:ffa851df0825 1:2fb8b9db1c86
       
     1 """A parser for HTML and XHTML."""
       
     2 
       
     3 # This file is based on sgmllib.py, but the API is slightly different.
       
     4 
       
     5 # XXX There should be a way to distinguish between PCDATA (parsed
       
     6 # character data -- the normal case), RCDATA (replaceable character
       
     7 # data -- only char and entity references and end tags are special)
       
     8 # and CDATA (character data -- only end tags are special).
       
     9 
       
    10 
       
    11 import markupbase
       
    12 import re
       
    13 
       
    14 # Regular expressions used for parsing
       
    15 
       
    16 interesting_normal = re.compile('[&<]')
       
    17 interesting_cdata = re.compile(r'<(/|\Z)')
       
    18 incomplete = re.compile('&[a-zA-Z#]')
       
    19 
       
    20 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
       
    21 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
       
    22 
       
    23 starttagopen = re.compile('<[a-zA-Z]')
       
    24 piclose = re.compile('>')
       
    25 commentclose = re.compile(r'--\s*>')
       
    26 tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
       
    27 attrfind = re.compile(
       
    28     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
       
    29     r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
       
    30 
       
    31 locatestarttagend = re.compile(r"""
       
    32   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
       
    33   (?:\s+                             # whitespace before attribute name
       
    34     (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
       
    35       (?:\s*=\s*                     # value indicator
       
    36         (?:'[^']*'                   # LITA-enclosed value
       
    37           |\"[^\"]*\"                # LIT-enclosed value
       
    38           |[^'\">\s]+                # bare value
       
    39          )
       
    40        )?
       
    41      )
       
    42    )*
       
    43   \s*                                # trailing whitespace
       
    44 """, re.VERBOSE)
       
    45 endendtag = re.compile('>')
       
    46 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
       
    47 
       
    48 
       
    49 class HTMLParseError(Exception):
       
    50     """Exception raised for all parse errors."""
       
    51 
       
    52     def __init__(self, msg, position=(None, None)):
       
    53         assert msg
       
    54         self.msg = msg
       
    55         self.lineno = position[0]
       
    56         self.offset = position[1]
       
    57 
       
    58     def __str__(self):
       
    59         result = self.msg
       
    60         if self.lineno is not None:
       
    61             result = result + ", at line %d" % self.lineno
       
    62         if self.offset is not None:
       
    63             result = result + ", column %d" % (self.offset + 1)
       
    64         return result
       
    65 
       
    66 
       
    67 class HTMLParser(markupbase.ParserBase):
       
    68     """Find tags and other markup and call handler functions.
       
    69 
       
    70     Usage:
       
    71         p = HTMLParser()
       
    72         p.feed(data)
       
    73         ...
       
    74         p.close()
       
    75 
       
    76     Start tags are handled by calling self.handle_starttag() or
       
    77     self.handle_startendtag(); end tags by self.handle_endtag().  The
       
    78     data between tags is passed from the parser to the derived class
       
    79     by calling self.handle_data() with the data as argument (the data
       
    80     may be split up in arbitrary chunks).  Entity references are
       
    81     passed by calling self.handle_entityref() with the entity
       
    82     reference as the argument.  Numeric character references are
       
    83     passed to self.handle_charref() with the string containing the
       
    84     reference as the argument.
       
    85     """
       
    86 
       
    87     CDATA_CONTENT_ELEMENTS = ("script", "style")
       
    88 
       
    89 
       
    90     def __init__(self):
       
    91         """Initialize and reset this instance."""
       
    92         self.reset()
       
    93 
       
    def reset(self):
        """Return the parser to its initial state.

        Any buffered, not yet processed input is discarded.
        """
        self.interesting = interesting_normal   # scan for '&' and '<'
        self.lasttag = '???'
        self.rawdata = ''
        markupbase.ParserBase.reset(self)
       
   100 
       
   101     def feed(self, data):
       
   102         """Feed data to the parser.
       
   103 
       
   104         Call this as often as you want, with as little or as much text
       
   105         as you want (may include '\n').
       
   106         """
       
   107         self.rawdata = self.rawdata + data
       
   108         self.goahead(0)
       
   109 
       
   110     def close(self):
       
   111         """Handle any buffered data."""
       
   112         self.goahead(1)
       
   113 
       
   114     def error(self, message):
       
   115         raise HTMLParseError(message, self.getpos())
       
   116 
       
   117     __starttag_text = None
       
   118 
       
   119     def get_starttag_text(self):
       
   120         """Return full source of start tag: '<...>'."""
       
   121         return self.__starttag_text
       
   122 
       
   123     def set_cdata_mode(self):
       
   124         self.interesting = interesting_cdata
       
   125 
       
   126     def clear_cdata_mode(self):
       
   127         self.interesting = interesting_normal
       
   128 
       
   129     # Internal -- handle data as far as reasonable.  May leave state
       
   130     # and data to be processed by a subsequent call.  If 'end' is
       
   131     # true, force handling all data as if followed by EOF marker.
       
   132     def goahead(self, end):
       
   133         rawdata = self.rawdata
       
   134         i = 0
       
   135         n = len(rawdata)
       
   136         while i < n:
       
   137             match = self.interesting.search(rawdata, i) # < or &
       
   138             if match:
       
   139                 j = match.start()
       
   140             else:
       
   141                 j = n
       
   142             if i < j: self.handle_data(rawdata[i:j])
       
   143             i = self.updatepos(i, j)
       
   144             if i == n: break
       
   145             startswith = rawdata.startswith
       
   146             if startswith('<', i):
       
   147                 if starttagopen.match(rawdata, i): # < + letter
       
   148                     k = self.parse_starttag(i)
       
   149                 elif startswith("</", i):
       
   150                     k = self.parse_endtag(i)
       
   151                 elif startswith("<!--", i):
       
   152                     k = self.parse_comment(i)
       
   153                 elif startswith("<?", i):
       
   154                     k = self.parse_pi(i)
       
   155                 elif startswith("<!", i):
       
   156                     k = self.parse_declaration(i)
       
   157                 elif (i + 1) < n:
       
   158                     self.handle_data("<")
       
   159                     k = i + 1
       
   160                 else:
       
   161                     break
       
   162                 if k < 0:
       
   163                     if end:
       
   164                         self.error("EOF in middle of construct")
       
   165                     break
       
   166                 i = self.updatepos(i, k)
       
   167             elif startswith("&#", i):
       
   168                 match = charref.match(rawdata, i)
       
   169                 if match:
       
   170                     name = match.group()[2:-1]
       
   171                     self.handle_charref(name)
       
   172                     k = match.end()
       
   173                     if not startswith(';', k-1):
       
   174                         k = k - 1
       
   175                     i = self.updatepos(i, k)
       
   176                     continue
       
   177                 else:
       
   178                     break
       
   179             elif startswith('&', i):
       
   180                 match = entityref.match(rawdata, i)
       
   181                 if match:
       
   182                     name = match.group(1)
       
   183                     self.handle_entityref(name)
       
   184                     k = match.end()
       
   185                     if not startswith(';', k-1):
       
   186                         k = k - 1
       
   187                     i = self.updatepos(i, k)
       
   188                     continue
       
   189                 match = incomplete.match(rawdata, i)
       
   190                 if match:
       
   191                     # match.group() will contain at least 2 chars
       
   192                     if end and match.group() == rawdata[i:]:
       
   193                         self.error("EOF in middle of entity or char ref")
       
   194                     # incomplete
       
   195                     break
       
   196                 elif (i + 1) < n:
       
   197                     # not the end of the buffer, and can't be confused
       
   198                     # with some other construct
       
   199                     self.handle_data("&")
       
   200                     i = self.updatepos(i, i + 1)
       
   201                 else:
       
   202                     break
       
   203             else:
       
   204                 assert 0, "interesting.search() lied"
       
   205         # end while
       
   206         if end and i < n:
       
   207             self.handle_data(rawdata[i:n])
       
   208             i = self.updatepos(i, n)
       
   209         self.rawdata = rawdata[i:]
       
   210 
       
   211     # Internal -- parse processing instr, return end or -1 if not terminated
       
   212     def parse_pi(self, i):
       
   213         rawdata = self.rawdata
       
   214         assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
       
   215         match = piclose.search(rawdata, i+2) # >
       
   216         if not match:
       
   217             return -1
       
   218         j = match.start()
       
   219         self.handle_pi(rawdata[i+2: j])
       
   220         j = match.end()
       
   221         return j
       
   222 
       
   223     # Internal -- handle starttag, return end or -1 if not terminated
       
   224     def parse_starttag(self, i):
       
   225         self.__starttag_text = None
       
   226         endpos = self.check_for_whole_start_tag(i)
       
   227         if endpos < 0:
       
   228             return endpos
       
   229         rawdata = self.rawdata
       
   230         self.__starttag_text = rawdata[i:endpos]
       
   231 
       
   232         # Now parse the data between i+1 and j into a tag and attrs
       
   233         attrs = []
       
   234         match = tagfind.match(rawdata, i+1)
       
   235         assert match, 'unexpected call to parse_starttag()'
       
   236         k = match.end()
       
   237         self.lasttag = tag = rawdata[i+1:k].lower()
       
   238 
       
   239         while k < endpos:
       
   240             m = attrfind.match(rawdata, k)
       
   241             if not m:
       
   242                 break
       
   243             attrname, rest, attrvalue = m.group(1, 2, 3)
       
   244             if not rest:
       
   245                 attrvalue = None
       
   246             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
       
   247                  attrvalue[:1] == '"' == attrvalue[-1:]:
       
   248                 attrvalue = attrvalue[1:-1]
       
   249                 attrvalue = self.unescape(attrvalue)
       
   250             attrs.append((attrname.lower(), attrvalue))
       
   251             k = m.end()
       
   252 
       
   253         end = rawdata[k:endpos].strip()
       
   254         if end not in (">", "/>"):
       
   255             lineno, offset = self.getpos()
       
   256             if "\n" in self.__starttag_text:
       
   257                 lineno = lineno + self.__starttag_text.count("\n")
       
   258                 offset = len(self.__starttag_text) \
       
   259                          - self.__starttag_text.rfind("\n")
       
   260             else:
       
   261                 offset = offset + len(self.__starttag_text)
       
   262             self.error("junk characters in start tag: %r"
       
   263                        % (rawdata[k:endpos][:20],))
       
   264         if end.endswith('/>'):
       
   265             # XHTML-style empty tag: <span attr="value" />
       
   266             self.handle_startendtag(tag, attrs)
       
   267         else:
       
   268             self.handle_starttag(tag, attrs)
       
   269             if tag in self.CDATA_CONTENT_ELEMENTS:
       
   270                 self.set_cdata_mode()
       
   271         return endpos
       
   272 
       
   273     # Internal -- check to see if we have a complete starttag; return end
       
   274     # or -1 if incomplete.
       
   275     def check_for_whole_start_tag(self, i):
       
   276         rawdata = self.rawdata
       
   277         m = locatestarttagend.match(rawdata, i)
       
   278         if m:
       
   279             j = m.end()
       
   280             next = rawdata[j:j+1]
       
   281             if next == ">":
       
   282                 return j + 1
       
   283             if next == "/":
       
   284                 if rawdata.startswith("/>", j):
       
   285                     return j + 2
       
   286                 if rawdata.startswith("/", j):
       
   287                     # buffer boundary
       
   288                     return -1
       
   289                 # else bogus input
       
   290                 self.updatepos(i, j + 1)
       
   291                 self.error("malformed empty start tag")
       
   292             if next == "":
       
   293                 # end of input
       
   294                 return -1
       
   295             if next in ("abcdefghijklmnopqrstuvwxyz=/"
       
   296                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
       
   297                 # end of input in or before attribute value, or we have the
       
   298                 # '/' from a '/>' ending
       
   299                 return -1
       
   300             self.updatepos(i, j)
       
   301             self.error("malformed start tag")
       
   302         raise AssertionError("we should not get here!")
       
   303 
       
   304     # Internal -- parse endtag, return end or -1 if incomplete
       
   305     def parse_endtag(self, i):
       
   306         rawdata = self.rawdata
       
   307         assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
       
   308         match = endendtag.search(rawdata, i+1) # >
       
   309         if not match:
       
   310             return -1
       
   311         j = match.end()
       
   312         match = endtagfind.match(rawdata, i) # </ + tag + >
       
   313         if not match:
       
   314             self.error("bad end tag: %r" % (rawdata[i:j],))
       
   315         tag = match.group(1)
       
   316         self.handle_endtag(tag.lower())
       
   317         self.clear_cdata_mode()
       
   318         return j
       
   319 
       
   320     # Overridable -- finish processing of start+end tag: <tag.../>
       
   321     def handle_startendtag(self, tag, attrs):
       
   322         self.handle_starttag(tag, attrs)
       
   323         self.handle_endtag(tag)
       
   324 
       
   325     # Overridable -- handle start tag
       
   326     def handle_starttag(self, tag, attrs):
       
   327         pass
       
   328 
       
   329     # Overridable -- handle end tag
       
   330     def handle_endtag(self, tag):
       
   331         pass
       
   332 
       
   333     # Overridable -- handle character reference
       
   334     def handle_charref(self, name):
       
   335         pass
       
   336 
       
   337     # Overridable -- handle entity reference
       
   338     def handle_entityref(self, name):
       
   339         pass
       
   340 
       
   341     # Overridable -- handle data
       
   342     def handle_data(self, data):
       
   343         pass
       
   344 
       
   345     # Overridable -- handle comment
       
   346     def handle_comment(self, data):
       
   347         pass
       
   348 
       
   349     # Overridable -- handle declaration
       
   350     def handle_decl(self, decl):
       
   351         pass
       
   352 
       
   353     # Overridable -- handle processing instruction
       
   354     def handle_pi(self, data):
       
   355         pass
       
   356 
       
   357     def unknown_decl(self, data):
       
   358         self.error("unknown declaration: %r" % (data,))
       
   359 
       
   360     # Internal -- helper to remove special character quoting
       
   361     entitydefs = None
       
   362     def unescape(self, s):
       
   363         if '&' not in s:
       
   364             return s
       
   365         def replaceEntities(s):
       
   366             s = s.groups()[0]
       
   367             if s[0] == "#":
       
   368                 s = s[1:]
       
   369                 if s[0] in ['x','X']:
       
   370                     c = int(s[1:], 16)
       
   371                 else:
       
   372                     c = int(s)
       
   373                 return unichr(c)
       
   374             else:
       
   375                 # Cannot use name2codepoint directly, because HTMLParser supports apos,
       
   376                 # which is not part of HTML 4
       
   377                 import htmlentitydefs
       
   378                 if HTMLParser.entitydefs is None:
       
   379                     entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
       
   380                     for k, v in htmlentitydefs.name2codepoint.iteritems():
       
   381                         entitydefs[k] = unichr(v)
       
   382                 try:
       
   383                     return self.entitydefs[s]
       
   384                 except KeyError:
       
   385                     return '&'+s+';'
       
   386 
       
   387         return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)