downloadkit/BeautifulSoup.py
changeset 125 b60a149520e7
equal deleted inserted replaced
124:be0078a0f686 125:b60a149520e7
       
     1 """Beautiful Soup
       
     2 Elixir and Tonic
       
     3 "The Screen-Scraper's Friend"
       
     4 http://www.crummy.com/software/BeautifulSoup/
       
     5 
       
     6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
       
     7 tree representation. It provides methods and Pythonic idioms that make
       
     8 it easy to navigate, search, and modify the tree.
       
     9 
       
    10 A well-formed XML/HTML document yields a well-formed data
       
    11 structure. An ill-formed XML/HTML document yields a correspondingly
       
    12 ill-formed data structure. If your document is only locally
       
    13 well-formed, you can use this library to find and process the
       
    14 well-formed part of it.
       
    15 
       
    16 Beautiful Soup works with Python 2.2 and up. It has no external
       
    17 dependencies, but you'll have more success at converting data to UTF-8
       
    18 if you also install these three packages:
       
    19 
       
    20 * chardet, for auto-detecting character encodings
       
    21   http://chardet.feedparser.org/
       
    22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
       
    23   by stock Python.
       
    24   http://cjkpython.i18n.org/
       
    25 
       
    26 Beautiful Soup defines classes for two main parsing strategies:
       
    27 
       
    28  * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
       
    29    language that kind of looks like XML.
       
    30 
       
    31  * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
       
    32    or invalid. This class has web browser-like heuristics for
       
    33    obtaining a sensible parse tree in the face of common HTML errors.
       
    34 
       
    35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
       
    36 the encoding of an HTML or XML document, and converting it to
       
    37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
       
    38 
       
    39 For more than you ever wanted to know about Beautiful Soup, see the
       
    40 documentation:
       
    41 http://www.crummy.com/software/BeautifulSoup/documentation.html
       
    42 
       
    43 Here, have some legalese:
       
    44 
       
    45 Copyright (c) 2004-2009, Leonard Richardson
       
    46 
       
    47 All rights reserved.
       
    48 
       
    49 Redistribution and use in source and binary forms, with or without
       
    50 modification, are permitted provided that the following conditions are
       
    51 met:
       
    52 
       
    53   * Redistributions of source code must retain the above copyright
       
    54     notice, this list of conditions and the following disclaimer.
       
    55 
       
    56   * Redistributions in binary form must reproduce the above
       
    57     copyright notice, this list of conditions and the following
       
    58     disclaimer in the documentation and/or other materials provided
       
    59     with the distribution.
       
    60 
       
    61   * Neither the name of the the Beautiful Soup Consortium and All
       
    62     Night Kosher Bakery nor the names of its contributors may be
       
    63     used to endorse or promote products derived from this software
       
    64     without specific prior written permission.
       
    65 
       
    66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
       
    67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
       
    68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
       
    69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
       
    70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
       
    71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
       
    72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
       
    73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
       
    74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
       
    75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
       
    76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
       
    77 
       
    78 """
       
    79 from __future__ import generators
       
    80 
       
    81 __author__ = "Leonard Richardson (leonardr@segfault.org)"
       
    82 __version__ = "3.1.0.1"
       
    83 __copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
       
    84 __license__ = "New-style BSD"
       
    85 
       
    86 import codecs
       
    87 import markupbase
       
    88 import types
       
    89 import re
       
    90 from HTMLParser import HTMLParser, HTMLParseError
       
    91 try:
       
    92     from htmlentitydefs import name2codepoint
       
    93 except ImportError:
       
    94     name2codepoint = {}
       
    95 try:
       
    96     set
       
    97 except NameError:
       
    98     from sets import Set as set
       
    99 
       
   100 #These hacks make Beautiful Soup able to parse XML with namespaces
       
   101 markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
       
   102 
       
   103 DEFAULT_OUTPUT_ENCODING = "utf-8"
       
   104 
       
   105 # First, the classes that represent markup elements.
       
   106 
       
   107 def sob(unicode, encoding):
       
   108     """Returns either the given Unicode string or its encoding."""
       
   109     if encoding is None:
       
   110         return unicode
       
   111     else:
       
   112         return unicode.encode(encoding)
       
   113 
       
   114 class PageElement:
       
   115     """Contains the navigational information for some part of the page
       
   116     (either a tag or a piece of text)"""
       
   117 
       
   118     def setup(self, parent=None, previous=None):
       
   119         """Sets up the initial relations between this element and
       
   120         other elements."""
       
   121         self.parent = parent
       
   122         self.previous = previous
       
   123         self.next = None
       
   124         self.previousSibling = None
       
   125         self.nextSibling = None
       
   126         if self.parent and self.parent.contents:
       
   127             self.previousSibling = self.parent.contents[-1]
       
   128             self.previousSibling.nextSibling = self
       
   129 
       
   130     def replaceWith(self, replaceWith):
       
   131         oldParent = self.parent
       
   132         myIndex = self.parent.contents.index(self)
       
   133         if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
       
   134             # We're replacing this element with one of its siblings.
       
   135             index = self.parent.contents.index(replaceWith)
       
   136             if index and index < myIndex:
       
   137                 # Furthermore, it comes before this element. That
       
   138                 # means that when we extract it, the index of this
       
   139                 # element will change.
       
   140                 myIndex = myIndex - 1
       
   141         self.extract()
       
   142         oldParent.insert(myIndex, replaceWith)
       
   143 
       
   144     def extract(self):
       
   145         """Destructively rips this element out of the tree."""
       
   146         if self.parent:
       
   147             try:
       
   148                 self.parent.contents.remove(self)
       
   149             except ValueError:
       
   150                 pass
       
   151 
       
   152         #Find the two elements that would be next to each other if
       
   153         #this element (and any children) hadn't been parsed. Connect
       
   154         #the two.
       
   155         lastChild = self._lastRecursiveChild()
       
   156         nextElement = lastChild.next
       
   157 
       
   158         if self.previous:
       
   159             self.previous.next = nextElement
       
   160         if nextElement:
       
   161             nextElement.previous = self.previous
       
   162         self.previous = None
       
   163         lastChild.next = None
       
   164 
       
   165         self.parent = None
       
   166         if self.previousSibling:
       
   167             self.previousSibling.nextSibling = self.nextSibling
       
   168         if self.nextSibling:
       
   169             self.nextSibling.previousSibling = self.previousSibling
       
   170         self.previousSibling = self.nextSibling = None
       
   171         return self
       
   172 
       
   173     def _lastRecursiveChild(self):
       
   174         "Finds the last element beneath this object to be parsed."
       
   175         lastChild = self
       
   176         while hasattr(lastChild, 'contents') and lastChild.contents:
       
   177             lastChild = lastChild.contents[-1]
       
   178         return lastChild
       
   179 
       
   180     def insert(self, position, newChild):
       
   181         if (isinstance(newChild, basestring)
       
   182             or isinstance(newChild, unicode)) \
       
   183             and not isinstance(newChild, NavigableString):
       
   184             newChild = NavigableString(newChild)
       
   185 
       
   186         position =  min(position, len(self.contents))
       
   187         if hasattr(newChild, 'parent') and newChild.parent != None:
       
   188             # We're 'inserting' an element that's already one
       
   189             # of this object's children.
       
   190             if newChild.parent == self:
       
   191                 index = self.find(newChild)
       
   192                 if index and index < position:
       
   193                     # Furthermore we're moving it further down the
       
   194                     # list of this object's children. That means that
       
   195                     # when we extract this element, our target index
       
   196                     # will jump down one.
       
   197                     position = position - 1
       
   198             newChild.extract()
       
   199 
       
   200         newChild.parent = self
       
   201         previousChild = None
       
   202         if position == 0:
       
   203             newChild.previousSibling = None
       
   204             newChild.previous = self
       
   205         else:
       
   206             previousChild = self.contents[position-1]
       
   207             newChild.previousSibling = previousChild
       
   208             newChild.previousSibling.nextSibling = newChild
       
   209             newChild.previous = previousChild._lastRecursiveChild()
       
   210         if newChild.previous:
       
   211             newChild.previous.next = newChild
       
   212 
       
   213         newChildsLastElement = newChild._lastRecursiveChild()
       
   214 
       
   215         if position >= len(self.contents):
       
   216             newChild.nextSibling = None
       
   217 
       
   218             parent = self
       
   219             parentsNextSibling = None
       
   220             while not parentsNextSibling:
       
   221                 parentsNextSibling = parent.nextSibling
       
   222                 parent = parent.parent
       
   223                 if not parent: # This is the last element in the document.
       
   224                     break
       
   225             if parentsNextSibling:
       
   226                 newChildsLastElement.next = parentsNextSibling
       
   227             else:
       
   228                 newChildsLastElement.next = None
       
   229         else:
       
   230             nextChild = self.contents[position]
       
   231             newChild.nextSibling = nextChild
       
   232             if newChild.nextSibling:
       
   233                 newChild.nextSibling.previousSibling = newChild
       
   234             newChildsLastElement.next = nextChild
       
   235 
       
   236         if newChildsLastElement.next:
       
   237             newChildsLastElement.next.previous = newChildsLastElement
       
   238         self.contents.insert(position, newChild)
       
   239 
       
   240     def append(self, tag):
       
   241         """Appends the given tag to the contents of this tag."""
       
   242         self.insert(len(self.contents), tag)
       
   243 
       
   244     def findNext(self, name=None, attrs={}, text=None, **kwargs):
       
   245         """Returns the first item that matches the given criteria and
       
   246         appears after this Tag in the document."""
       
   247         return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
       
   248 
       
   249     def findAllNext(self, name=None, attrs={}, text=None, limit=None,
       
   250                     **kwargs):
       
   251         """Returns all items that match the given criteria and appear
       
   252         after this Tag in the document."""
       
   253         return self._findAll(name, attrs, text, limit, self.nextGenerator,
       
   254                              **kwargs)
       
   255 
       
   256     def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
       
   257         """Returns the closest sibling to this Tag that matches the
       
   258         given criteria and appears after this Tag in the document."""
       
   259         return self._findOne(self.findNextSiblings, name, attrs, text,
       
   260                              **kwargs)
       
   261 
       
   262     def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
       
   263                          **kwargs):
       
   264         """Returns the siblings of this Tag that match the given
       
   265         criteria and appear after this Tag in the document."""
       
   266         return self._findAll(name, attrs, text, limit,
       
   267                              self.nextSiblingGenerator, **kwargs)
       
   268     fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
       
   269 
       
   270     def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
       
   271         """Returns the first item that matches the given criteria and
       
   272         appears before this Tag in the document."""
       
   273         return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
       
   274 
       
   275     def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
       
   276                         **kwargs):
       
   277         """Returns all items that match the given criteria and appear
       
   278         before this Tag in the document."""
       
   279         return self._findAll(name, attrs, text, limit, self.previousGenerator,
       
   280                            **kwargs)
       
   281     fetchPrevious = findAllPrevious # Compatibility with pre-3.x
       
   282 
       
   283     def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
       
   284         """Returns the closest sibling to this Tag that matches the
       
   285         given criteria and appears before this Tag in the document."""
       
   286         return self._findOne(self.findPreviousSiblings, name, attrs, text,
       
   287                              **kwargs)
       
   288 
       
   289     def findPreviousSiblings(self, name=None, attrs={}, text=None,
       
   290                              limit=None, **kwargs):
       
   291         """Returns the siblings of this Tag that match the given
       
   292         criteria and appear before this Tag in the document."""
       
   293         return self._findAll(name, attrs, text, limit,
       
   294                              self.previousSiblingGenerator, **kwargs)
       
   295     fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
       
   296 
       
   297     def findParent(self, name=None, attrs={}, **kwargs):
       
   298         """Returns the closest parent of this Tag that matches the given
       
   299         criteria."""
       
   300         # NOTE: We can't use _findOne because findParents takes a different
       
   301         # set of arguments.
       
   302         r = None
       
   303         l = self.findParents(name, attrs, 1)
       
   304         if l:
       
   305             r = l[0]
       
   306         return r
       
   307 
       
   308     def findParents(self, name=None, attrs={}, limit=None, **kwargs):
       
   309         """Returns the parents of this Tag that match the given
       
   310         criteria."""
       
   311 
       
   312         return self._findAll(name, attrs, None, limit, self.parentGenerator,
       
   313                              **kwargs)
       
   314     fetchParents = findParents # Compatibility with pre-3.x
       
   315 
       
   316     #These methods do the real heavy lifting.
       
   317 
       
   318     def _findOne(self, method, name, attrs, text, **kwargs):
       
   319         r = None
       
   320         l = method(name, attrs, text, 1, **kwargs)
       
   321         if l:
       
   322             r = l[0]
       
   323         return r
       
   324 
       
   325     def _findAll(self, name, attrs, text, limit, generator, **kwargs):
       
   326         "Iterates over a generator looking for things that match."
       
   327 
       
   328         if isinstance(name, SoupStrainer):
       
   329             strainer = name
       
   330         else:
       
   331             # Build a SoupStrainer
       
   332             strainer = SoupStrainer(name, attrs, text, **kwargs)
       
   333         results = ResultSet(strainer)
       
   334         g = generator()
       
   335         while True:
       
   336             try:
       
   337                 i = g.next()
       
   338             except StopIteration:
       
   339                 break
       
   340             if i:
       
   341                 found = strainer.search(i)
       
   342                 if found:
       
   343                     results.append(found)
       
   344                     if limit and len(results) >= limit:
       
   345                         break
       
   346         return results
       
   347 
       
   348     #These Generators can be used to navigate starting from both
       
   349     #NavigableStrings and Tags.
       
   350     def nextGenerator(self):
       
   351         i = self
       
   352         while i:
       
   353             i = i.next
       
   354             yield i
       
   355 
       
   356     def nextSiblingGenerator(self):
       
   357         i = self
       
   358         while i:
       
   359             i = i.nextSibling
       
   360             yield i
       
   361 
       
   362     def previousGenerator(self):
       
   363         i = self
       
   364         while i:
       
   365             i = i.previous
       
   366             yield i
       
   367 
       
   368     def previousSiblingGenerator(self):
       
   369         i = self
       
   370         while i:
       
   371             i = i.previousSibling
       
   372             yield i
       
   373 
       
   374     def parentGenerator(self):
       
   375         i = self
       
   376         while i:
       
   377             i = i.parent
       
   378             yield i
       
   379 
       
   380     # Utility methods
       
   381     def substituteEncoding(self, str, encoding=None):
       
   382         encoding = encoding or "utf-8"
       
   383         return str.replace("%SOUP-ENCODING%", encoding)
       
   384 
       
   385     def toEncoding(self, s, encoding=None):
       
   386         """Encodes an object to a string in some encoding, or to Unicode.
       
   387         ."""
       
   388         if isinstance(s, unicode):
       
   389             if encoding:
       
   390                 s = s.encode(encoding)
       
   391         elif isinstance(s, str):
       
   392             if encoding:
       
   393                 s = s.encode(encoding)
       
   394             else:
       
   395                 s = unicode(s)
       
   396         else:
       
   397             if encoding:
       
   398                 s  = self.toEncoding(str(s), encoding)
       
   399             else:
       
   400                 s = unicode(s)
       
   401         return s
       
   402 
       
   403 class NavigableString(unicode, PageElement):
       
   404 
       
   405     def __new__(cls, value):
       
   406         """Create a new NavigableString.
       
   407 
       
   408         When unpickling a NavigableString, this method is called with
       
   409         the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
       
   410         passed in to the superclass's __new__ or the superclass won't know
       
   411         how to handle non-ASCII characters.
       
   412         """
       
   413         if isinstance(value, unicode):
       
   414             return unicode.__new__(cls, value)
       
   415         return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
       
   416 
       
   417     def __getnewargs__(self):
       
   418         return (unicode(self),)
       
   419 
       
   420     def __getattr__(self, attr):
       
   421         """text.string gives you text. This is for backwards
       
   422         compatibility for Navigable*String, but for CData* it lets you
       
   423         get the string without the CData wrapper."""
       
   424         if attr == 'string':
       
   425             return self
       
   426         else:
       
   427             raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
       
   428 
       
   429     def encode(self, encoding=DEFAULT_OUTPUT_ENCODING):
       
   430         return self.decode().encode(encoding)
       
   431 
       
   432     def decodeGivenEventualEncoding(self, eventualEncoding):
       
   433         return self
       
   434 
       
   435 class CData(NavigableString):
       
   436 
       
   437     def decodeGivenEventualEncoding(self, eventualEncoding):
       
   438         return u'<![CDATA[' + self + u']]>'
       
   439 
       
   440 class ProcessingInstruction(NavigableString):
       
   441 
       
   442     def decodeGivenEventualEncoding(self, eventualEncoding):
       
   443         output = self
       
   444         if u'%SOUP-ENCODING%' in output:
       
   445             output = self.substituteEncoding(output, eventualEncoding)
       
   446         return u'<?' + output + u'?>'
       
   447 
       
   448 class Comment(NavigableString):
       
   449     def decodeGivenEventualEncoding(self, eventualEncoding):
       
   450         return u'<!--' + self + u'-->'
       
   451 
       
   452 class Declaration(NavigableString):
       
   453     def decodeGivenEventualEncoding(self, eventualEncoding):
       
   454         return u'<!' + self + u'>'
       
   455 
       
   456 class Tag(PageElement):
       
   457 
       
   458     """Represents a found HTML tag with its attributes and contents."""
       
   459 
       
   460     def _invert(h):
       
   461         "Cheap function to invert a hash."
       
   462         i = {}
       
   463         for k,v in h.items():
       
   464             i[v] = k
       
   465         return i
       
   466 
       
   467     XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
       
   468                                       "quot" : '"',
       
   469                                       "amp" : "&",
       
   470                                       "lt" : "<",
       
   471                                       "gt" : ">" }
       
   472 
       
   473     XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
       
   474 
       
   475     def _convertEntities(self, match):
       
   476         """Used in a call to re.sub to replace HTML, XML, and numeric
       
   477         entities with the appropriate Unicode characters. If HTML
       
   478         entities are being converted, any unrecognized entities are
       
   479         escaped."""
       
   480         x = match.group(1)
       
   481         if self.convertHTMLEntities and x in name2codepoint:
       
   482             return unichr(name2codepoint[x])
       
   483         elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
       
   484             if self.convertXMLEntities:
       
   485                 return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
       
   486             else:
       
   487                 return u'&%s;' % x
       
   488         elif len(x) > 0 and x[0] == '#':
       
   489             # Handle numeric entities
       
   490             if len(x) > 1 and x[1] == 'x':
       
   491                 return unichr(int(x[2:], 16))
       
   492             else:
       
   493                 return unichr(int(x[1:]))
       
   494 
       
   495         elif self.escapeUnrecognizedEntities:
       
   496             return u'&amp;%s;' % x
       
   497         else:
       
   498             return u'&%s;' % x
       
   499 
       
   500     def __init__(self, parser, name, attrs=None, parent=None,
       
   501                  previous=None):
       
   502         "Basic constructor."
       
   503 
       
   504         # We don't actually store the parser object: that lets extracted
       
   505         # chunks be garbage-collected
       
   506         self.parserClass = parser.__class__
       
   507         self.isSelfClosing = parser.isSelfClosingTag(name)
       
   508         self.name = name
       
   509         if attrs == None:
       
   510             attrs = []
       
   511         self.attrs = attrs
       
   512         self.contents = []
       
   513         self.setup(parent, previous)
       
   514         self.hidden = False
       
   515         self.containsSubstitutions = False
       
   516         self.convertHTMLEntities = parser.convertHTMLEntities
       
   517         self.convertXMLEntities = parser.convertXMLEntities
       
   518         self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
       
   519 
       
   520         def convert(kval):
       
   521             "Converts HTML, XML and numeric entities in the attribute value."
       
   522             k, val = kval
       
   523             if val is None:
       
   524                 return kval
       
   525             return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
       
   526                               self._convertEntities, val))
       
   527         self.attrs = map(convert, self.attrs)
       
   528 
       
   529     def get(self, key, default=None):
       
   530         """Returns the value of the 'key' attribute for the tag, or
       
   531         the value given for 'default' if it doesn't have that
       
   532         attribute."""
       
   533         return self._getAttrMap().get(key, default)
       
   534 
       
   535     def has_key(self, key):
       
   536         return self._getAttrMap().has_key(key)
       
   537 
       
   538     def __getitem__(self, key):
       
   539         """tag[key] returns the value of the 'key' attribute for the tag,
       
   540         and throws an exception if it's not there."""
       
   541         return self._getAttrMap()[key]
       
   542 
       
   543     def __iter__(self):
       
   544         "Iterating over a tag iterates over its contents."
       
   545         return iter(self.contents)
       
   546 
       
   547     def __len__(self):
       
   548         "The length of a tag is the length of its list of contents."
       
   549         return len(self.contents)
       
   550 
       
   551     def __contains__(self, x):
       
   552         return x in self.contents
       
   553 
       
   554     def __nonzero__(self):
       
   555         "A tag is non-None even if it has no contents."
       
   556         return True
       
   557 
       
   558     def __setitem__(self, key, value):
       
   559         """Setting tag[key] sets the value of the 'key' attribute for the
       
   560         tag."""
       
   561         self._getAttrMap()
       
   562         self.attrMap[key] = value
       
   563         found = False
       
   564         for i in range(0, len(self.attrs)):
       
   565             if self.attrs[i][0] == key:
       
   566                 self.attrs[i] = (key, value)
       
   567                 found = True
       
   568         if not found:
       
   569             self.attrs.append((key, value))
       
   570         self._getAttrMap()[key] = value
       
   571 
       
   572     def __delitem__(self, key):
       
   573         "Deleting tag[key] deletes all 'key' attributes for the tag."
       
   574         for item in self.attrs:
       
   575             if item[0] == key:
       
   576                 self.attrs.remove(item)
       
   577                 #We don't break because bad HTML can define the same
       
   578                 #attribute multiple times.
       
   579             self._getAttrMap()
       
   580             if self.attrMap.has_key(key):
       
   581                 del self.attrMap[key]
       
   582 
       
   583     def __call__(self, *args, **kwargs):
       
   584         """Calling a tag like a function is the same as calling its
       
   585         findAll() method. Eg. tag('a') returns a list of all the A tags
       
   586         found within this tag."""
       
   587         return apply(self.findAll, args, kwargs)
       
   588 
       
   589     def __getattr__(self, tag):
       
   590         #print "Getattr %s.%s" % (self.__class__, tag)
       
   591         if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
       
   592             return self.find(tag[:-3])
       
   593         elif tag.find('__') != 0:
       
   594             return self.find(tag)
       
   595         raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
       
   596 
       
   597     def __eq__(self, other):
       
   598         """Returns true iff this tag has the same name, the same attributes,
       
   599         and the same contents (recursively) as the given tag.
       
   600 
       
   601         NOTE: right now this will return false if two tags have the
       
   602         same attributes in a different order. Should this be fixed?"""
       
   603         if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
       
   604             return False
       
   605         for i in range(0, len(self.contents)):
       
   606             if self.contents[i] != other.contents[i]:
       
   607                 return False
       
   608         return True
       
   609 
       
   610     def __ne__(self, other):
       
   611         """Returns true iff this tag is not identical to the other tag,
       
   612         as defined in __eq__."""
       
   613         return not self == other
       
   614 
       
   615     def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
       
   616         """Renders this tag as a string."""
       
   617         return self.decode(eventualEncoding=encoding)
       
   618 
       
   619     BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
       
   620                                            + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
       
   621                                            + ")")
       
   622 
       
   623     def _sub_entity(self, x):
       
   624         """Used with a regular expression to substitute the
       
   625         appropriate XML entity for an XML special character."""
       
   626         return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
       
   627 
       
   628     def __unicode__(self):
       
   629         return self.decode()
       
   630 
       
   631     def __str__(self):
       
   632         return self.encode()
       
   633 
       
   634     def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
       
   635                prettyPrint=False, indentLevel=0):
       
   636         return self.decode(prettyPrint, indentLevel, encoding).encode(encoding)
       
   637 
       
   638     def decode(self, prettyPrint=False, indentLevel=0,
       
   639                eventualEncoding=DEFAULT_OUTPUT_ENCODING):
       
   640         """Returns a string or Unicode representation of this tag and
       
   641         its contents. To get Unicode, pass None for encoding."""
       
   642 
       
   643         attrs = []
       
   644         if self.attrs:
       
   645             for key, val in self.attrs:
       
   646                 fmt = '%s="%s"'
       
   647                 if isString(val):
       
   648                     if (self.containsSubstitutions
       
   649                         and eventualEncoding is not None
       
   650                         and '%SOUP-ENCODING%' in val):
       
   651                         val = self.substituteEncoding(val, eventualEncoding)
       
   652 
       
   653                     # The attribute value either:
       
   654                     #
       
   655                     # * Contains no embedded double quotes or single quotes.
       
   656                     #   No problem: we enclose it in double quotes.
       
   657                     # * Contains embedded single quotes. No problem:
       
   658                     #   double quotes work here too.
       
   659                     # * Contains embedded double quotes. No problem:
       
   660                     #   we enclose it in single quotes.
       
   661                     # * Embeds both single _and_ double quotes. This
       
   662                     #   can't happen naturally, but it can happen if
       
   663                     #   you modify an attribute value after parsing
       
   664                     #   the document. Now we have a bit of a
       
   665                     #   problem. We solve it by enclosing the
       
   666                     #   attribute in single quotes, and escaping any
       
   667                     #   embedded single quotes to XML entities.
       
   668                     if '"' in val:
       
   669                         fmt = "%s='%s'"
       
   670                         if "'" in val:
       
   671                             # TODO: replace with apos when
       
   672                             # appropriate.
       
   673                             val = val.replace("'", "&squot;")
       
   674 
       
   675                     # Now we're okay w/r/t quotes. But the attribute
       
   676                     # value might also contain angle brackets, or
       
   677                     # ampersands that aren't part of entities. We need
       
   678                     # to escape those to XML entities too.
       
   679                     val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
       
   680                 if val is None:
       
   681                     # Handle boolean attributes.
       
   682                     decoded = key
       
   683                 else:
       
   684                     decoded = fmt % (key, val)
       
   685                 attrs.append(decoded)
       
   686         close = ''
       
   687         closeTag = ''
       
   688         if self.isSelfClosing:
       
   689             close = ' /'
       
   690         else:
       
   691             closeTag = '</%s>' % self.name
       
   692 
       
   693         indentTag, indentContents = 0, 0
       
   694         if prettyPrint:
       
   695             indentTag = indentLevel
       
   696             space = (' ' * (indentTag-1))
       
   697             indentContents = indentTag + 1
       
   698         contents = self.decodeContents(prettyPrint, indentContents,
       
   699                                        eventualEncoding)
       
   700         if self.hidden:
       
   701             s = contents
       
   702         else:
       
   703             s = []
       
   704             attributeString = ''
       
   705             if attrs:
       
   706                 attributeString = ' ' + ' '.join(attrs)
       
   707             if prettyPrint:
       
   708                 s.append(space)
       
   709             s.append('<%s%s%s>' % (self.name, attributeString, close))
       
   710             if prettyPrint:
       
   711                 s.append("\n")
       
   712             s.append(contents)
       
   713             if prettyPrint and contents and contents[-1] != "\n":
       
   714                 s.append("\n")
       
   715             if prettyPrint and closeTag:
       
   716                 s.append(space)
       
   717             s.append(closeTag)
       
   718             if prettyPrint and closeTag and self.nextSibling:
       
   719                 s.append("\n")
       
   720             s = ''.join(s)
       
   721         return s
       
   722 
       
   723     def decompose(self):
       
   724         """Recursively destroys the contents of this tree."""
       
   725         contents = [i for i in self.contents]
       
   726         for i in contents:
       
   727             if isinstance(i, Tag):
       
   728                 i.decompose()
       
   729             else:
       
   730                 i.extract()
       
   731         self.extract()
       
   732 
       
   733     def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
       
   734         return self.encode(encoding, True)
       
   735 
       
   736     def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
       
   737                        prettyPrint=False, indentLevel=0):
       
   738         return self.decodeContents(prettyPrint, indentLevel).encode(encoding)
       
   739 
       
   740     def decodeContents(self, prettyPrint=False, indentLevel=0,
       
   741                        eventualEncoding=DEFAULT_OUTPUT_ENCODING):
       
   742         """Renders the contents of this tag as a string in the given
       
   743         encoding. If encoding is None, returns a Unicode string.."""
       
   744         s=[]
       
   745         for c in self:
       
   746             text = None
       
   747             if isinstance(c, NavigableString):
       
   748                 text = c.decodeGivenEventualEncoding(eventualEncoding)
       
   749             elif isinstance(c, Tag):
       
   750                 s.append(c.decode(prettyPrint, indentLevel, eventualEncoding))
       
   751             if text and prettyPrint:
       
   752                 text = text.strip()
       
   753             if text:
       
   754                 if prettyPrint:
       
   755                     s.append(" " * (indentLevel-1))
       
   756                 s.append(text)
       
   757                 if prettyPrint:
       
   758                     s.append("\n")
       
   759         return ''.join(s)
       
   760 
       
   761     #Soup methods
       
   762 
       
   763     def find(self, name=None, attrs={}, recursive=True, text=None,
       
   764              **kwargs):
       
   765         """Return only the first child of this Tag matching the given
       
   766         criteria."""
       
   767         r = None
       
   768         l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
       
   769         if l:
       
   770             r = l[0]
       
   771         return r
       
   772     findChild = find
       
   773 
       
   774     def findAll(self, name=None, attrs={}, recursive=True, text=None,
       
   775                 limit=None, **kwargs):
       
   776         """Extracts a list of Tag objects that match the given
       
   777         criteria.  You can specify the name of the Tag and any
       
   778         attributes you want the Tag to have.
       
   779 
       
   780         The value of a key-value pair in the 'attrs' map can be a
       
   781         string, a list of strings, a regular expression object, or a
       
   782         callable that takes a string and returns whether or not the
       
   783         string matches for some custom definition of 'matches'. The
       
   784         same is true of the tag name."""
       
   785         generator = self.recursiveChildGenerator
       
   786         if not recursive:
       
   787             generator = self.childGenerator
       
   788         return self._findAll(name, attrs, text, limit, generator, **kwargs)
       
   789     findChildren = findAll
       
   790 
       
   791     # Pre-3.x compatibility methods. Will go away in 4.0.
       
   792     first = find
       
   793     fetch = findAll
       
   794 
       
   795     def fetchText(self, text=None, recursive=True, limit=None):
       
   796         return self.findAll(text=text, recursive=recursive, limit=limit)
       
   797 
       
   798     def firstText(self, text=None, recursive=True):
       
   799         return self.find(text=text, recursive=recursive)
       
   800 
       
   801     # 3.x compatibility methods. Will go away in 4.0.
       
   802     def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
       
   803                        prettyPrint=False, indentLevel=0):
       
   804         if encoding is None:
       
   805             return self.decodeContents(prettyPrint, indentLevel, encoding)
       
   806         else:
       
   807             return self.encodeContents(encoding, prettyPrint, indentLevel)
       
   808 
       
   809 
       
   810     #Private methods
       
   811 
       
   812     def _getAttrMap(self):
       
   813         """Initializes a map representation of this tag's attributes,
       
   814         if not already initialized."""
       
   815         if not getattr(self, 'attrMap'):
       
   816             self.attrMap = {}
       
   817             for (key, value) in self.attrs:
       
   818                 self.attrMap[key] = value
       
   819         return self.attrMap
       
   820 
       
   821     #Generator methods
       
   822     def recursiveChildGenerator(self):
       
   823         if not len(self.contents):
       
   824             raise StopIteration
       
   825         stopNode = self._lastRecursiveChild().next
       
   826         current = self.contents[0]
       
   827         while current is not stopNode:
       
   828             yield current
       
   829             current = current.next
       
   830 
       
   831     def childGenerator(self):
       
   832         if not len(self.contents):
       
   833             raise StopIteration
       
   834         current = self.contents[0]
       
   835         while current:
       
   836             yield current
       
   837             current = current.nextSibling
       
   838         raise StopIteration
       
   839 
       
   840 # Next, a couple classes to represent queries and their results.
       
   841 class SoupStrainer:
       
   842     """Encapsulates a number of ways of matching a markup element (tag or
       
   843     text)."""
       
   844 
       
   845     def __init__(self, name=None, attrs={}, text=None, **kwargs):
       
   846         self.name = name
       
   847         if isString(attrs):
       
   848             kwargs['class'] = attrs
       
   849             attrs = None
       
   850         if kwargs:
       
   851             if attrs:
       
   852                 attrs = attrs.copy()
       
   853                 attrs.update(kwargs)
       
   854             else:
       
   855                 attrs = kwargs
       
   856         self.attrs = attrs
       
   857         self.text = text
       
   858 
       
   859     def __str__(self):
       
   860         if self.text:
       
   861             return self.text
       
   862         else:
       
   863             return "%s|%s" % (self.name, self.attrs)
       
   864 
       
   865     def searchTag(self, markupName=None, markupAttrs={}):
       
   866         found = None
       
   867         markup = None
       
   868         if isinstance(markupName, Tag):
       
   869             markup = markupName
       
   870             markupAttrs = markup
       
   871         callFunctionWithTagData = callable(self.name) \
       
   872                                 and not isinstance(markupName, Tag)
       
   873 
       
   874         if (not self.name) \
       
   875                or callFunctionWithTagData \
       
   876                or (markup and self._matches(markup, self.name)) \
       
   877                or (not markup and self._matches(markupName, self.name)):
       
   878             if callFunctionWithTagData:
       
   879                 match = self.name(markupName, markupAttrs)
       
   880             else:
       
   881                 match = True
       
   882                 markupAttrMap = None
       
   883                 for attr, matchAgainst in self.attrs.items():
       
   884                     if not markupAttrMap:
       
   885                          if hasattr(markupAttrs, 'get'):
       
   886                             markupAttrMap = markupAttrs
       
   887                          else:
       
   888                             markupAttrMap = {}
       
   889                             for k,v in markupAttrs:
       
   890                                 markupAttrMap[k] = v
       
   891                     attrValue = markupAttrMap.get(attr)
       
   892                     if not self._matches(attrValue, matchAgainst):
       
   893                         match = False
       
   894                         break
       
   895             if match:
       
   896                 if markup:
       
   897                     found = markup
       
   898                 else:
       
   899                     found = markupName
       
   900         return found
       
   901 
       
   902     def search(self, markup):
       
   903         #print 'looking for %s in %s' % (self, markup)
       
   904         found = None
       
   905         # If given a list of items, scan it for a text element that
       
   906         # matches.
       
   907         if isList(markup) and not isinstance(markup, Tag):
       
   908             for element in markup:
       
   909                 if isinstance(element, NavigableString) \
       
   910                        and self.search(element):
       
   911                     found = element
       
   912                     break
       
   913         # If it's a Tag, make sure its name or attributes match.
       
   914         # Don't bother with Tags if we're searching for text.
       
   915         elif isinstance(markup, Tag):
       
   916             if not self.text:
       
   917                 found = self.searchTag(markup)
       
   918         # If it's text, make sure the text matches.
       
   919         elif isinstance(markup, NavigableString) or \
       
   920                  isString(markup):
       
   921             if self._matches(markup, self.text):
       
   922                 found = markup
       
   923         else:
       
   924             raise Exception, "I don't know how to match against a %s" \
       
   925                   % markup.__class__
       
   926         return found
       
   927 
       
   928     def _matches(self, markup, matchAgainst):
       
   929         #print "Matching %s against %s" % (markup, matchAgainst)
       
   930         result = False
       
   931         if matchAgainst == True and type(matchAgainst) == types.BooleanType:
       
   932             result = markup != None
       
   933         elif callable(matchAgainst):
       
   934             result = matchAgainst(markup)
       
   935         else:
       
   936             #Custom match methods take the tag as an argument, but all
       
   937             #other ways of matching match the tag name as a string.
       
   938             if isinstance(markup, Tag):
       
   939                 markup = markup.name
       
   940             if markup is not None and not isString(markup):
       
   941                 markup = unicode(markup)
       
   942             #Now we know that chunk is either a string, or None.
       
   943             if hasattr(matchAgainst, 'match'):
       
   944                 # It's a regexp object.
       
   945                 result = markup and matchAgainst.search(markup)
       
   946             elif (isList(matchAgainst)
       
   947                   and (markup is not None or not isString(matchAgainst))):
       
   948                 result = markup in matchAgainst
       
   949             elif hasattr(matchAgainst, 'items'):
       
   950                 result = markup.has_key(matchAgainst)
       
   951             elif matchAgainst and isString(markup):
       
   952                 if isinstance(markup, unicode):
       
   953                     matchAgainst = unicode(matchAgainst)
       
   954                 else:
       
   955                     matchAgainst = str(matchAgainst)
       
   956 
       
   957             if not result:
       
   958                 result = matchAgainst == markup
       
   959         return result
       
   960 
       
   961 class ResultSet(list):
       
   962     """A ResultSet is just a list that keeps track of the SoupStrainer
       
   963     that created it."""
       
   964     def __init__(self, source):
       
   965         list.__init__([])
       
   966         self.source = source
       
   967 
       
   968 # Now, some helper functions.
       
   969 
       
   970 def isList(l):
       
   971     """Convenience method that works with all 2.x versions of Python
       
   972     to determine whether or not something is listlike."""
       
   973     return ((hasattr(l, '__iter__') and not isString(l))
       
   974             or (type(l) in (types.ListType, types.TupleType)))
       
   975 
       
   976 def isString(s):
       
   977     """Convenience method that works with all 2.x versions of Python
       
   978     to determine whether or not something is stringlike."""
       
   979     try:
       
   980         return isinstance(s, unicode) or isinstance(s, basestring)
       
   981     except NameError:
       
   982         return isinstance(s, str)
       
   983 
       
   984 def buildTagMap(default, *args):
       
   985     """Turns a list of maps, lists, or scalars into a single map.
       
   986     Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
       
   987     NESTING_RESET_TAGS maps out of lists and partial maps."""
       
   988     built = {}
       
   989     for portion in args:
       
   990         if hasattr(portion, 'items'):
       
   991             #It's a map. Merge it.
       
   992             for k,v in portion.items():
       
   993                 built[k] = v
       
   994         elif isList(portion) and not isString(portion):
       
   995             #It's a list. Map each item to the default.
       
   996             for k in portion:
       
   997                 built[k] = default
       
   998         else:
       
   999             #It's a scalar. Map it to the default.
       
  1000             built[portion] = default
       
  1001     return built
       
  1002 
       
  1003 # Now, the parser classes.
       
  1004 
       
  1005 class HTMLParserBuilder(HTMLParser):
       
  1006 
       
  1007     def __init__(self, soup):
       
  1008         HTMLParser.__init__(self)
       
  1009         self.soup = soup
       
  1010 
       
  1011     # We inherit feed() and reset().
       
  1012 
       
  1013     def handle_starttag(self, name, attrs):
       
  1014         if name == 'meta':
       
  1015             self.soup.extractCharsetFromMeta(attrs)
       
  1016         else:
       
  1017             self.soup.unknown_starttag(name, attrs)
       
  1018 
       
  1019     def handle_endtag(self, name):
       
  1020         self.soup.unknown_endtag(name)
       
  1021 
       
  1022     def handle_data(self, content):
       
  1023         self.soup.handle_data(content)
       
  1024 
       
  1025     def _toStringSubclass(self, text, subclass):
       
  1026         """Adds a certain piece of text to the tree as a NavigableString
       
  1027         subclass."""
       
  1028         self.soup.endData()
       
  1029         self.handle_data(text)
       
  1030         self.soup.endData(subclass)
       
  1031 
       
  1032     def handle_pi(self, text):
       
  1033         """Handle a processing instruction as a ProcessingInstruction
       
  1034         object, possibly one with a %SOUP-ENCODING% slot into which an
       
  1035         encoding will be plugged later."""
       
  1036         if text[:3] == "xml":
       
  1037             text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
       
  1038         self._toStringSubclass(text, ProcessingInstruction)
       
  1039 
       
  1040     def handle_comment(self, text):
       
  1041         "Handle comments as Comment objects."
       
  1042         self._toStringSubclass(text, Comment)
       
  1043 
       
  1044     def handle_charref(self, ref):
       
  1045         "Handle character references as data."
       
  1046         if self.soup.convertEntities:
       
  1047             data = unichr(int(ref))
       
  1048         else:
       
  1049             data = '&#%s;' % ref
       
  1050         self.handle_data(data)
       
  1051 
       
  1052     def handle_entityref(self, ref):
       
  1053         """Handle entity references as data, possibly converting known
       
  1054         HTML and/or XML entity references to the corresponding Unicode
       
  1055         characters."""
       
  1056         data = None
       
  1057         if self.soup.convertHTMLEntities:
       
  1058             try:
       
  1059                 data = unichr(name2codepoint[ref])
       
  1060             except KeyError:
       
  1061                 pass
       
  1062 
       
  1063         if not data and self.soup.convertXMLEntities:
       
  1064                 data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
       
  1065 
       
  1066         if not data and self.soup.convertHTMLEntities and \
       
  1067             not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
       
  1068                 # TODO: We've got a problem here. We're told this is
       
  1069                 # an entity reference, but it's not an XML entity
       
  1070                 # reference or an HTML entity reference. Nonetheless,
       
  1071                 # the logical thing to do is to pass it through as an
       
  1072                 # unrecognized entity reference.
       
  1073                 #
       
  1074                 # Except: when the input is "&carol;" this function
       
  1075                 # will be called with input "carol". When the input is
       
  1076                 # "AT&T", this function will be called with input
       
  1077                 # "T". We have no way of knowing whether a semicolon
       
  1078                 # was present originally, so we don't know whether
       
  1079                 # this is an unknown entity or just a misplaced
       
  1080                 # ampersand.
       
  1081                 #
       
  1082                 # The more common case is a misplaced ampersand, so I
       
  1083                 # escape the ampersand and omit the trailing semicolon.
       
  1084                 data = "&amp;%s" % ref
       
  1085         if not data:
       
  1086             # This case is different from the one above, because we
       
  1087             # haven't already gone through a supposedly comprehensive
       
  1088             # mapping of entities to Unicode characters. We might not
       
  1089             # have gone through any mapping at all. So the chances are
       
  1090             # very high that this is a real entity, and not a
       
  1091             # misplaced ampersand.
       
  1092             data = "&%s;" % ref
       
  1093         self.handle_data(data)
       
  1094 
       
  1095     def handle_decl(self, data):
       
  1096         "Handle DOCTYPEs and the like as Declaration objects."
       
  1097         self._toStringSubclass(data, Declaration)
       
  1098 
       
  1099     def parse_declaration(self, i):
       
  1100         """Treat a bogus SGML declaration as raw data. Treat a CDATA
       
  1101         declaration as a CData object."""
       
  1102         j = None
       
  1103         if self.rawdata[i:i+9] == '<![CDATA[':
       
  1104              k = self.rawdata.find(']]>', i)
       
  1105              if k == -1:
       
  1106                  k = len(self.rawdata)
       
  1107              data = self.rawdata[i+9:k]
       
  1108              j = k+3
       
  1109              self._toStringSubclass(data, CData)
       
  1110         else:
       
  1111             try:
       
  1112                 j = HTMLParser.parse_declaration(self, i)
       
  1113             except HTMLParseError:
       
  1114                 toHandle = self.rawdata[i:]
       
  1115                 self.handle_data(toHandle)
       
  1116                 j = i + len(toHandle)
       
  1117         return j
       
  1118 
       
  1119 
       
  1120 class BeautifulStoneSoup(Tag):
       
  1121 
       
  1122     """This class contains the basic parser and search code. It defines
       
  1123     a parser that knows nothing about tag behavior except for the
       
  1124     following:
       
  1125 
       
  1126       You can't close a tag without closing all the tags it encloses.
       
  1127       That is, "<foo><bar></foo>" actually means
       
  1128       "<foo><bar></bar></foo>".
       
  1129 
       
  1130     [Another possible explanation is "<foo><bar /></foo>", but since
       
  1131     this class defines no SELF_CLOSING_TAGS, it will never use that
       
  1132     explanation.]
       
  1133 
       
  1134     This class is useful for parsing XML or made-up markup languages,
       
  1135     or when BeautifulSoup makes an assumption counter to what you were
       
  1136     expecting."""
       
  1137 
       
  1138     SELF_CLOSING_TAGS = {}
       
  1139     NESTABLE_TAGS = {}
       
  1140     RESET_NESTING_TAGS = {}
       
  1141     QUOTE_TAGS = {}
       
  1142     PRESERVE_WHITESPACE_TAGS = []
       
  1143 
       
  1144     MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
       
  1145                        lambda x: x.group(1) + ' />'),
       
  1146                       (re.compile('<!\s+([^<>]*)>'),
       
  1147                        lambda x: '<!' + x.group(1) + '>')
       
  1148                       ]
       
  1149 
       
  1150     ROOT_TAG_NAME = u'[document]'
       
  1151 
       
  1152     HTML_ENTITIES = "html"
       
  1153     XML_ENTITIES = "xml"
       
  1154     XHTML_ENTITIES = "xhtml"
       
  1155     # TODO: This only exists for backwards-compatibility
       
  1156     ALL_ENTITIES = XHTML_ENTITIES
       
  1157 
       
  1158     # Used when determining whether a text node is all whitespace and
       
  1159     # can be replaced with a single space. A text node that contains
       
  1160     # fancy Unicode spaces (usually non-breaking) should be left
       
  1161     # alone.
       
  1162     STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
       
  1163 
       
  1164     def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
       
  1165                  markupMassage=True, smartQuotesTo=XML_ENTITIES,
       
  1166                  convertEntities=None, selfClosingTags=None, isHTML=False,
       
  1167                  builder=HTMLParserBuilder):
       
  1168         """The Soup object is initialized as the 'root tag', and the
       
  1169         provided markup (which can be a string or a file-like object)
       
  1170         is fed into the underlying parser.
       
  1171 
       
  1172         HTMLParser will process most bad HTML, and the BeautifulSoup
       
  1173         class has some tricks for dealing with some HTML that kills
       
  1174         HTMLParser, but Beautiful Soup can nonetheless choke or lose data
       
  1175         if your data uses self-closing tags or declarations
       
  1176         incorrectly.
       
  1177 
       
  1178         By default, Beautiful Soup uses regexes to sanitize input,
       
  1179         avoiding the vast majority of these problems. If the problems
       
  1180         don't apply to you, pass in False for markupMassage, and
       
  1181         you'll get better performance.
       
  1182 
       
  1183         The default parser massage techniques fix the two most common
       
  1184         instances of invalid HTML that choke HTMLParser:
       
  1185 
       
  1186          <br/> (No space between name of closing tag and tag close)
       
  1187          <! --Comment--> (Extraneous whitespace in declaration)
       
  1188 
       
  1189         You can pass in a custom list of (RE object, replace method)
       
  1190         tuples to get Beautiful Soup to scrub your input the way you
       
  1191         want."""
       
  1192 
       
  1193         self.parseOnlyThese = parseOnlyThese
       
  1194         self.fromEncoding = fromEncoding
       
  1195         self.smartQuotesTo = smartQuotesTo
       
  1196         self.convertEntities = convertEntities
       
  1197         # Set the rules for how we'll deal with the entities we
       
  1198         # encounter
       
  1199         if self.convertEntities:
       
  1200             # It doesn't make sense to convert encoded characters to
       
  1201             # entities even while you're converting entities to Unicode.
       
  1202             # Just convert it all to Unicode.
       
  1203             self.smartQuotesTo = None
       
  1204             if convertEntities == self.HTML_ENTITIES:
       
  1205                 self.convertXMLEntities = False
       
  1206                 self.convertHTMLEntities = True
       
  1207                 self.escapeUnrecognizedEntities = True
       
  1208             elif convertEntities == self.XHTML_ENTITIES:
       
  1209                 self.convertXMLEntities = True
       
  1210                 self.convertHTMLEntities = True
       
  1211                 self.escapeUnrecognizedEntities = False
       
  1212             elif convertEntities == self.XML_ENTITIES:
       
  1213                 self.convertXMLEntities = True
       
  1214                 self.convertHTMLEntities = False
       
  1215                 self.escapeUnrecognizedEntities = False
       
  1216         else:
       
  1217             self.convertXMLEntities = False
       
  1218             self.convertHTMLEntities = False
       
  1219             self.escapeUnrecognizedEntities = False
       
  1220 
       
  1221         self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
       
  1222         self.builder = builder(self)
       
  1223         self.reset()
       
  1224 
       
  1225         if hasattr(markup, 'read'):        # It's a file-type object.
       
  1226             markup = markup.read()
       
  1227         self.markup = markup
       
  1228         self.markupMassage = markupMassage
       
  1229         try:
       
  1230             self._feed(isHTML=isHTML)
       
  1231         except StopParsing:
       
  1232             pass
       
  1233         self.markup = None                 # The markup can now be GCed.
       
  1234         self.builder = None                # So can the builder.
       
  1235 
       
  1236     def _feed(self, inDocumentEncoding=None, isHTML=False):
       
  1237         # Convert the document to Unicode.
       
  1238         markup = self.markup
       
  1239         if isinstance(markup, unicode):
       
  1240             if not hasattr(self, 'originalEncoding'):
       
  1241                 self.originalEncoding = None
       
  1242         else:
       
  1243             dammit = UnicodeDammit\
       
  1244                      (markup, [self.fromEncoding, inDocumentEncoding],
       
  1245                       smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
       
  1246             markup = dammit.unicode
       
  1247             self.originalEncoding = dammit.originalEncoding
       
  1248             self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
       
  1249         if markup:
       
  1250             if self.markupMassage:
       
  1251                 if not isList(self.markupMassage):
       
  1252                     self.markupMassage = self.MARKUP_MASSAGE
       
  1253                 for fix, m in self.markupMassage:
       
  1254                     markup = fix.sub(m, markup)
       
  1255                 # TODO: We get rid of markupMassage so that the
       
  1256                 # soup object can be deepcopied later on. Some
       
  1257                 # Python installations can't copy regexes. If anyone
       
  1258                 # was relying on the existence of markupMassage, this
       
  1259                 # might cause problems.
       
  1260                 del(self.markupMassage)
       
  1261         self.builder.reset()
       
  1262 
       
  1263         self.builder.feed(markup)
       
  1264         # Close out any unfinished strings and close all the open tags.
       
  1265         self.endData()
       
  1266         while self.currentTag.name != self.ROOT_TAG_NAME:
       
  1267             self.popTag()
       
  1268 
       
  1269     def isSelfClosingTag(self, name):
       
  1270         """Returns true iff the given string is the name of a
       
  1271         self-closing tag according to this parser."""
       
  1272         return self.SELF_CLOSING_TAGS.has_key(name) \
       
  1273                or self.instanceSelfClosingTags.has_key(name)
       
  1274 
       
  1275     def reset(self):
       
  1276         Tag.__init__(self, self, self.ROOT_TAG_NAME)
       
  1277         self.hidden = 1
       
  1278         self.builder.reset()
       
  1279         self.currentData = []
       
  1280         self.currentTag = None
       
  1281         self.tagStack = []
       
  1282         self.quoteStack = []
       
  1283         self.pushTag(self)
       
  1284 
       
  1285     def popTag(self):
       
  1286         tag = self.tagStack.pop()
       
  1287         # Tags with just one string-owning child get the child as a
       
  1288         # 'string' property, so that soup.tag.string is shorthand for
       
  1289         # soup.tag.contents[0]
       
  1290         if len(self.currentTag.contents) == 1 and \
       
  1291            isinstance(self.currentTag.contents[0], NavigableString):
       
  1292             self.currentTag.string = self.currentTag.contents[0]
       
  1293 
       
  1294         #print "Pop", tag.name
       
  1295         if self.tagStack:
       
  1296             self.currentTag = self.tagStack[-1]
       
  1297         return self.currentTag
       
  1298 
       
  1299     def pushTag(self, tag):
       
  1300         #print "Push", tag.name
       
  1301         if self.currentTag:
       
  1302             self.currentTag.contents.append(tag)
       
  1303         self.tagStack.append(tag)
       
  1304         self.currentTag = self.tagStack[-1]
       
  1305 
       
  1306     def endData(self, containerClass=NavigableString):
       
  1307         if self.currentData:
       
  1308             currentData = u''.join(self.currentData)
       
  1309             if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
       
  1310                 not set([tag.name for tag in self.tagStack]).intersection(
       
  1311                     self.PRESERVE_WHITESPACE_TAGS)):
       
  1312                 if '\n' in currentData:
       
  1313                     currentData = '\n'
       
  1314                 else:
       
  1315                     currentData = ' '
       
  1316             self.currentData = []
       
  1317             if self.parseOnlyThese and len(self.tagStack) <= 1 and \
       
  1318                    (not self.parseOnlyThese.text or \
       
  1319                     not self.parseOnlyThese.search(currentData)):
       
  1320                 return
       
  1321             o = containerClass(currentData)
       
  1322             o.setup(self.currentTag, self.previous)
       
  1323             if self.previous:
       
  1324                 self.previous.next = o
       
  1325             self.previous = o
       
  1326             self.currentTag.contents.append(o)
       
  1327 
       
  1328 
       
  1329     def _popToTag(self, name, inclusivePop=True):
       
  1330         """Pops the tag stack up to and including the most recent
       
  1331         instance of the given tag. If inclusivePop is false, pops the tag
       
  1332         stack up to but *not* including the most recent instqance of
       
  1333         the given tag."""
       
  1334         #print "Popping to %s" % name
       
  1335         if name == self.ROOT_TAG_NAME:
       
  1336             return
       
  1337 
       
  1338         numPops = 0
       
  1339         mostRecentTag = None
       
  1340         for i in range(len(self.tagStack)-1, 0, -1):
       
  1341             if name == self.tagStack[i].name:
       
  1342                 numPops = len(self.tagStack)-i
       
  1343                 break
       
  1344         if not inclusivePop:
       
  1345             numPops = numPops - 1
       
  1346 
       
  1347         for i in range(0, numPops):
       
  1348             mostRecentTag = self.popTag()
       
  1349         return mostRecentTag
       
  1350 
       
  1351     def _smartPop(self, name):
       
  1352 
       
  1353         """We need to pop up to the previous tag of this type, unless
       
  1354         one of this tag's nesting reset triggers comes between this
       
  1355         tag and the previous tag of this type, OR unless this tag is a
       
  1356         generic nesting trigger and another generic nesting trigger
       
  1357         comes between this tag and the previous tag of this type.
       
  1358 
       
  1359         Examples:
       
  1360          <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
       
  1361          <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
       
  1362          <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
       
  1363 
       
  1364          <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
       
  1365          <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
       
  1366          <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
       
  1367         """
       
  1368 
       
  1369         nestingResetTriggers = self.NESTABLE_TAGS.get(name)
       
  1370         isNestable = nestingResetTriggers != None
       
  1371         isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
       
  1372         popTo = None
       
  1373         inclusive = True
       
  1374         for i in range(len(self.tagStack)-1, 0, -1):
       
  1375             p = self.tagStack[i]
       
  1376             if (not p or p.name == name) and not isNestable:
       
  1377                 #Non-nestable tags get popped to the top or to their
       
  1378                 #last occurance.
       
  1379                 popTo = name
       
  1380                 break
       
  1381             if (nestingResetTriggers != None
       
  1382                 and p.name in nestingResetTriggers) \
       
  1383                 or (nestingResetTriggers == None and isResetNesting
       
  1384                     and self.RESET_NESTING_TAGS.has_key(p.name)):
       
  1385 
       
  1386                 #If we encounter one of the nesting reset triggers
       
  1387                 #peculiar to this tag, or we encounter another tag
       
  1388                 #that causes nesting to reset, pop up to but not
       
  1389                 #including that tag.
       
  1390                 popTo = p.name
       
  1391                 inclusive = False
       
  1392                 break
       
  1393             p = p.parent
       
  1394         if popTo:
       
  1395             self._popToTag(popTo, inclusive)
       
  1396 
       
  1397     def unknown_starttag(self, name, attrs, selfClosing=0):
       
  1398         #print "Start tag %s: %s" % (name, attrs)
       
  1399         if self.quoteStack:
       
  1400             #This is not a real tag.
       
  1401             #print "<%s> is not real!" % name
       
  1402             attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
       
  1403             self.handle_data('<%s%s>' % (name, attrs))
       
  1404             return
       
  1405         self.endData()
       
  1406 
       
  1407         if not self.isSelfClosingTag(name) and not selfClosing:
       
  1408             self._smartPop(name)
       
  1409 
       
  1410         if self.parseOnlyThese and len(self.tagStack) <= 1 \
       
  1411                and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
       
  1412             return
       
  1413 
       
  1414         tag = Tag(self, name, attrs, self.currentTag, self.previous)
       
  1415         if self.previous:
       
  1416             self.previous.next = tag
       
  1417         self.previous = tag
       
  1418         self.pushTag(tag)
       
  1419         if selfClosing or self.isSelfClosingTag(name):
       
  1420             self.popTag()
       
  1421         if name in self.QUOTE_TAGS:
       
  1422             #print "Beginning quote (%s)" % name
       
  1423             self.quoteStack.append(name)
       
  1424             self.literal = 1
       
  1425         return tag
       
  1426 
       
  1427     def unknown_endtag(self, name):
       
  1428         #print "End tag %s" % name
       
  1429         if self.quoteStack and self.quoteStack[-1] != name:
       
  1430             #This is not a real end tag.
       
  1431             #print "</%s> is not real!" % name
       
  1432             self.handle_data('</%s>' % name)
       
  1433             return
       
  1434         self.endData()
       
  1435         self._popToTag(name)
       
  1436         if self.quoteStack and self.quoteStack[-1] == name:
       
  1437             self.quoteStack.pop()
       
  1438             self.literal = (len(self.quoteStack) > 0)
       
  1439 
       
  1440     def handle_data(self, data):
       
  1441         self.currentData.append(data)
       
  1442 
       
  1443     def extractCharsetFromMeta(self, attrs):
       
  1444         self.unknown_starttag('meta', attrs)
       
  1445 
       
  1446 
       
  1447 class BeautifulSoup(BeautifulStoneSoup):
       
  1448 
       
  1449     """This parser knows the following facts about HTML:
       
  1450 
       
  1451     * Some tags have no closing tag and should be interpreted as being
       
  1452       closed as soon as they are encountered.
       
  1453 
       
  1454     * The text inside some tags (ie. 'script') may contain tags which
       
  1455       are not really part of the document and which should be parsed
       
  1456       as text, not tags. If you want to parse the text as tags, you can
       
  1457       always fetch it and parse it explicitly.
       
  1458 
       
  1459     * Tag nesting rules:
       
  1460 
       
  1461       Most tags can't be nested at all. For instance, the occurance of
       
  1462       a <p> tag should implicitly close the previous <p> tag.
       
  1463 
       
  1464        <p>Para1<p>Para2
       
  1465         should be transformed into:
       
  1466        <p>Para1</p><p>Para2
       
  1467 
       
  1468       Some tags can be nested arbitrarily. For instance, the occurance
       
  1469       of a <blockquote> tag should _not_ implicitly close the previous
       
  1470       <blockquote> tag.
       
  1471 
       
  1472        Alice said: <blockquote>Bob said: <blockquote>Blah
       
  1473         should NOT be transformed into:
       
  1474        Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
       
  1475 
       
  1476       Some tags can be nested, but the nesting is reset by the
       
  1477       interposition of other tags. For instance, a <tr> tag should
       
  1478       implicitly close the previous <tr> tag within the same <table>,
       
  1479       but not close a <tr> tag in another table.
       
  1480 
       
  1481        <table><tr>Blah<tr>Blah
       
  1482         should be transformed into:
       
  1483        <table><tr>Blah</tr><tr>Blah
       
  1484         but,
       
  1485        <tr>Blah<table><tr>Blah
       
  1486         should NOT be transformed into
       
  1487        <tr>Blah<table></tr><tr>Blah
       
  1488 
       
  1489     Differing assumptions about tag nesting rules are a major source
       
  1490     of problems with the BeautifulSoup class. If BeautifulSoup is not
       
  1491     treating as nestable a tag your page author treats as nestable,
       
  1492     try ICantBelieveItsBeautifulSoup, MinimalSoup, or
       
  1493     BeautifulStoneSoup before writing your own subclass."""
       
  1494 
       
  1495     def __init__(self, *args, **kwargs):
       
  1496         if not kwargs.has_key('smartQuotesTo'):
       
  1497             kwargs['smartQuotesTo'] = self.HTML_ENTITIES
       
  1498         kwargs['isHTML'] = True
       
  1499         BeautifulStoneSoup.__init__(self, *args, **kwargs)
       
  1500 
       
  1501     SELF_CLOSING_TAGS = buildTagMap(None,
       
  1502                                     ['br' , 'hr', 'input', 'img', 'meta',
       
  1503                                     'spacer', 'link', 'frame', 'base'])
       
  1504 
       
  1505     PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
       
  1506 
       
  1507     QUOTE_TAGS = {'script' : None, 'textarea' : None}
       
  1508 
       
  1509     #According to the HTML standard, each of these inline tags can
       
  1510     #contain another tag of the same type. Furthermore, it's common
       
  1511     #to actually use these tags this way.
       
  1512     NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
       
  1513                             'center']
       
  1514 
       
  1515     #According to the HTML standard, these block tags can contain
       
  1516     #another tag of the same type. Furthermore, it's common
       
  1517     #to actually use these tags this way.
       
  1518     NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
       
  1519 
       
  1520     #Lists can contain other lists, but there are restrictions.
       
  1521     NESTABLE_LIST_TAGS = { 'ol' : [],
       
  1522                            'ul' : [],
       
  1523                            'li' : ['ul', 'ol'],
       
  1524                            'dl' : [],
       
  1525                            'dd' : ['dl'],
       
  1526                            'dt' : ['dl'] }
       
  1527 
       
  1528     #Tables can contain other tables, but there are restrictions.
       
  1529     NESTABLE_TABLE_TAGS = {'table' : [],
       
  1530                            'tr' : ['table', 'tbody', 'tfoot', 'thead'],
       
  1531                            'td' : ['tr'],
       
  1532                            'th' : ['tr'],
       
  1533                            'thead' : ['table'],
       
  1534                            'tbody' : ['table'],
       
  1535                            'tfoot' : ['table'],
       
  1536                            }
       
  1537 
       
  1538     NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
       
  1539 
       
  1540     #If one of these tags is encountered, all tags up to the next tag of
       
  1541     #this type are popped.
       
  1542     RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
       
  1543                                      NON_NESTABLE_BLOCK_TAGS,
       
  1544                                      NESTABLE_LIST_TAGS,
       
  1545                                      NESTABLE_TABLE_TAGS)
       
  1546 
       
  1547     NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
       
  1548                                 NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
       
  1549 
       
  1550     # Used to detect the charset in a META tag; see start_meta
       
  1551     CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
       
  1552 
       
  1553     def extractCharsetFromMeta(self, attrs):
       
  1554         """Beautiful Soup can detect a charset included in a META tag,
       
  1555         try to convert the document to that charset, and re-parse the
       
  1556         document from the beginning."""
       
  1557         httpEquiv = None
       
  1558         contentType = None
       
  1559         contentTypeIndex = None
       
  1560         tagNeedsEncodingSubstitution = False
       
  1561 
       
  1562         for i in range(0, len(attrs)):
       
  1563             key, value = attrs[i]
       
  1564             key = key.lower()
       
  1565             if key == 'http-equiv':
       
  1566                 httpEquiv = value
       
  1567             elif key == 'content':
       
  1568                 contentType = value
       
  1569                 contentTypeIndex = i
       
  1570 
       
  1571         if httpEquiv and contentType: # It's an interesting meta tag.
       
  1572             match = self.CHARSET_RE.search(contentType)
       
  1573             if match:
       
  1574                 if (self.declaredHTMLEncoding is not None or
       
  1575                     self.originalEncoding == self.fromEncoding):
       
  1576                     # An HTML encoding was sniffed while converting
       
  1577                     # the document to Unicode, or an HTML encoding was
       
  1578                     # sniffed during a previous pass through the
       
  1579                     # document, or an encoding was specified
       
  1580                     # explicitly and it worked. Rewrite the meta tag.
       
  1581                     def rewrite(match):
       
  1582                         return match.group(1) + "%SOUP-ENCODING%"
       
  1583                     newAttr = self.CHARSET_RE.sub(rewrite, contentType)
       
  1584                     attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
       
  1585                                                newAttr)
       
  1586                     tagNeedsEncodingSubstitution = True
       
  1587                 else:
       
  1588                     # This is our first pass through the document.
       
  1589                     # Go through it again with the encoding information.
       
  1590                     newCharset = match.group(3)
       
  1591                     if newCharset and newCharset != self.originalEncoding:
       
  1592                         self.declaredHTMLEncoding = newCharset
       
  1593                         self._feed(self.declaredHTMLEncoding)
       
  1594                         raise StopParsing
       
  1595                     pass
       
  1596         tag = self.unknown_starttag("meta", attrs)
       
  1597         if tag and tagNeedsEncodingSubstitution:
       
  1598             tag.containsSubstitutions = True
       
  1599 
       
  1600 
       
  1601 class StopParsing(Exception):
       
  1602     pass
       
  1603 
       
  1604 class ICantBelieveItsBeautifulSoup(BeautifulSoup):
       
  1605 
       
  1606     """The BeautifulSoup class is oriented towards skipping over
       
  1607     common HTML errors like unclosed tags. However, sometimes it makes
       
  1608     errors of its own. For instance, consider this fragment:
       
  1609 
       
  1610      <b>Foo<b>Bar</b></b>
       
  1611 
       
  1612     This is perfectly valid (if bizarre) HTML. However, the
       
  1613     BeautifulSoup class will implicitly close the first b tag when it
       
  1614     encounters the second 'b'. It will think the author wrote
       
  1615     "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
       
  1616     there's no real-world reason to bold something that's already
       
  1617     bold. When it encounters '</b></b>' it will close two more 'b'
       
  1618     tags, for a grand total of three tags closed instead of two. This
       
  1619     can throw off the rest of your document structure. The same is
       
  1620     true of a number of other tags, listed below.
       
  1621 
       
  1622     It's much more common for someone to forget to close a 'b' tag
       
  1623     than to actually use nested 'b' tags, and the BeautifulSoup class
       
  1624     handles the common case. This class handles the not-co-common
       
  1625     case: where you can't believe someone wrote what they did, but
       
  1626     it's valid HTML and BeautifulSoup screwed up by assuming it
       
  1627     wouldn't be."""
       
  1628 
       
  1629     I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
       
  1630      ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
       
  1631       'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
       
  1632       'big']
       
  1633 
       
  1634     I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
       
  1635 
       
  1636     NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
       
  1637                                 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
       
  1638                                 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
       
  1639 
       
  1640 class MinimalSoup(BeautifulSoup):
       
  1641     """The MinimalSoup class is for parsing HTML that contains
       
  1642     pathologically bad markup. It makes no assumptions about tag
       
  1643     nesting, but it does know which tags are self-closing, that
       
  1644     <script> tags contain Javascript and should not be parsed, that
       
  1645     META tags may contain encoding information, and so on.
       
  1646 
       
  1647     This also makes it better for subclassing than BeautifulStoneSoup
       
  1648     or BeautifulSoup."""
       
  1649 
       
  1650     RESET_NESTING_TAGS = buildTagMap('noscript')
       
  1651     NESTABLE_TAGS = {}
       
  1652 
       
  1653 class BeautifulSOAP(BeautifulStoneSoup):
       
  1654     """This class will push a tag with only a single string child into
       
  1655     the tag's parent as an attribute. The attribute's name is the tag
       
  1656     name, and the value is the string child. An example should give
       
  1657     the flavor of the change:
       
  1658 
       
  1659     <foo><bar>baz</bar></foo>
       
  1660      =>
       
  1661     <foo bar="baz"><bar>baz</bar></foo>
       
  1662 
       
  1663     You can then access fooTag['bar'] instead of fooTag.barTag.string.
       
  1664 
       
  1665     This is, of course, useful for scraping structures that tend to
       
  1666     use subelements instead of attributes, such as SOAP messages. Note
       
  1667     that it modifies its input, so don't print the modified version
       
  1668     out.
       
  1669 
       
  1670     I'm not sure how many people really want to use this class; let me
       
  1671     know if you do. Mainly I like the name."""
       
  1672 
       
  1673     def popTag(self):
       
  1674         if len(self.tagStack) > 1:
       
  1675             tag = self.tagStack[-1]
       
  1676             parent = self.tagStack[-2]
       
  1677             parent._getAttrMap()
       
  1678             if (isinstance(tag, Tag) and len(tag.contents) == 1 and
       
  1679                 isinstance(tag.contents[0], NavigableString) and
       
  1680                 not parent.attrMap.has_key(tag.name)):
       
  1681                 parent[tag.name] = tag.contents[0]
       
  1682         BeautifulStoneSoup.popTag(self)
       
  1683 
       
  1684 #Enterprise class names! It has come to our attention that some people
       
  1685 #think the names of the Beautiful Soup parser classes are too silly
       
  1686 #and "unprofessional" for use in enterprise screen-scraping. We feel
       
  1687 #your pain! For such-minded folk, the Beautiful Soup Consortium And
       
  1688 #All-Night Kosher Bakery recommends renaming this file to
       
  1689 #"RobustParser.py" (or, in cases of extreme enterprisiness,
       
  1690 #"RobustParserBeanInterface.class") and using the following
       
  1691 #enterprise-friendly class aliases:
       
  1692 class RobustXMLParser(BeautifulStoneSoup):
       
  1693     pass
       
  1694 class RobustHTMLParser(BeautifulSoup):
       
  1695     pass
       
  1696 class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
       
  1697     pass
       
  1698 class RobustInsanelyWackAssHTMLParser(MinimalSoup):
       
  1699     pass
       
  1700 class SimplifyingSOAPParser(BeautifulSOAP):
       
  1701     pass
       
  1702 
       
  1703 ######################################################
       
  1704 #
       
  1705 # Bonus library: Unicode, Dammit
       
  1706 #
       
  1707 # This class forces XML data into a standard format (usually to UTF-8
       
  1708 # or Unicode).  It is heavily based on code from Mark Pilgrim's
       
  1709 # Universal Feed Parser. It does not rewrite the XML or HTML to
       
  1710 # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
       
  1711 # (XML) and BeautifulSoup.start_meta (HTML).
       
  1712 
       
  1713 # Autodetects character encodings.
       
  1714 # Download from http://chardet.feedparser.org/
       
  1715 try:
       
  1716     import chardet
       
  1717 #    import chardet.constants
       
  1718 #    chardet.constants._debug = 1
       
  1719 except ImportError:
       
  1720     chardet = None
       
  1721 
       
  1722 # cjkcodecs and iconv_codec make Python know about more character encodings.
       
  1723 # Both are available from http://cjkpython.i18n.org/
       
  1724 # They're built in if you use Python 2.4.
       
  1725 try:
       
  1726     import cjkcodecs.aliases
       
  1727 except ImportError:
       
  1728     pass
       
  1729 try:
       
  1730     import iconv_codec
       
  1731 except ImportError:
       
  1732     pass
       
  1733 
       
  1734 class UnicodeDammit:
       
  1735     """A class for detecting the encoding of a *ML document and
       
  1736     converting it to a Unicode string. If the source encoding is
       
  1737     windows-1252, can replace MS smart quotes with their HTML or XML
       
  1738     equivalents."""
       
  1739 
       
  1740     # This dictionary maps commonly seen values for "charset" in HTML
       
  1741     # meta tags to the corresponding Python codec names. It only covers
       
  1742     # values that aren't in Python's aliases and can't be determined
       
  1743     # by the heuristics in find_codec.
       
  1744     CHARSET_ALIASES = { "macintosh" : "mac-roman",
       
  1745                         "x-sjis" : "shift-jis" }
       
  1746 
       
  1747     def __init__(self, markup, overrideEncodings=[],
       
  1748                  smartQuotesTo='xml', isHTML=False):
       
  1749         self.declaredHTMLEncoding = None
       
  1750         self.markup, documentEncoding, sniffedEncoding = \
       
  1751                      self._detectEncoding(markup, isHTML)
       
  1752         self.smartQuotesTo = smartQuotesTo
       
  1753         self.triedEncodings = []
       
  1754         if markup == '' or isinstance(markup, unicode):
       
  1755             self.originalEncoding = None
       
  1756             self.unicode = unicode(markup)
       
  1757             return
       
  1758 
       
  1759         u = None
       
  1760         for proposedEncoding in overrideEncodings:
       
  1761             u = self._convertFrom(proposedEncoding)
       
  1762             if u: break
       
  1763         if not u:
       
  1764             for proposedEncoding in (documentEncoding, sniffedEncoding):
       
  1765                 u = self._convertFrom(proposedEncoding)
       
  1766                 if u: break
       
  1767 
       
  1768         # If no luck and we have auto-detection library, try that:
       
  1769         if not u and chardet and not isinstance(self.markup, unicode):
       
  1770             u = self._convertFrom(chardet.detect(self.markup)['encoding'])
       
  1771 
       
  1772         # As a last resort, try utf-8 and windows-1252:
       
  1773         if not u:
       
  1774             for proposed_encoding in ("utf-8", "windows-1252"):
       
  1775                 u = self._convertFrom(proposed_encoding)
       
  1776                 if u: break
       
  1777 
       
  1778         self.unicode = u
       
  1779         if not u: self.originalEncoding = None
       
  1780 
       
  1781     def _subMSChar(self, match):
       
  1782         """Changes a MS smart quote character to an XML or HTML
       
  1783         entity."""
       
  1784         orig = match.group(1)
       
  1785         sub = self.MS_CHARS.get(orig)
       
  1786         if type(sub) == types.TupleType:
       
  1787             if self.smartQuotesTo == 'xml':
       
  1788                 sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
       
  1789             else:
       
  1790                 sub = '&'.encode() + sub[0].encode() + ';'.encode()
       
  1791         else:
       
  1792             sub = sub.encode()
       
  1793         return sub
       
  1794 
       
  1795     def _convertFrom(self, proposed):
       
  1796         proposed = self.find_codec(proposed)
       
  1797         if not proposed or proposed in self.triedEncodings:
       
  1798             return None
       
  1799         self.triedEncodings.append(proposed)
       
  1800         markup = self.markup
       
  1801 
       
  1802         # Convert smart quotes to HTML if coming from an encoding
       
  1803         # that might have them.
       
  1804         if self.smartQuotesTo and proposed.lower() in("windows-1252",
       
  1805                                                       "iso-8859-1",
       
  1806                                                       "iso-8859-2"):
       
  1807             smart_quotes_re = "([\x80-\x9f])"
       
  1808             smart_quotes_compiled = re.compile(smart_quotes_re)
       
  1809             markup = smart_quotes_compiled.sub(self._subMSChar, markup)
       
  1810 
       
  1811         try:
       
  1812             # print "Trying to convert document to %s" % proposed
       
  1813             u = self._toUnicode(markup, proposed)
       
  1814             self.markup = u
       
  1815             self.originalEncoding = proposed
       
  1816         except Exception, e:
       
  1817             # print "That didn't work!"
       
  1818             # print e
       
  1819             return None
       
  1820         #print "Correct encoding: %s" % proposed
       
  1821         return self.markup
       
  1822 
       
  1823     def _toUnicode(self, data, encoding):
       
  1824         '''Given a string and its encoding, decodes the string into Unicode.
       
  1825         %encoding is a string recognized by encodings.aliases'''
       
  1826 
       
  1827         # strip Byte Order Mark (if present)
       
  1828         if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
       
  1829                and (data[2:4] != '\x00\x00'):
       
  1830             encoding = 'utf-16be'
       
  1831             data = data[2:]
       
  1832         elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
       
  1833                  and (data[2:4] != '\x00\x00'):
       
  1834             encoding = 'utf-16le'
       
  1835             data = data[2:]
       
  1836         elif data[:3] == '\xef\xbb\xbf':
       
  1837             encoding = 'utf-8'
       
  1838             data = data[3:]
       
  1839         elif data[:4] == '\x00\x00\xfe\xff':
       
  1840             encoding = 'utf-32be'
       
  1841             data = data[4:]
       
  1842         elif data[:4] == '\xff\xfe\x00\x00':
       
  1843             encoding = 'utf-32le'
       
  1844             data = data[4:]
       
  1845         newdata = unicode(data, encoding)
       
  1846         return newdata
       
  1847 
       
  1848     def _detectEncoding(self, xml_data, isHTML=False):
       
  1849         """Given a document, tries to detect its XML encoding."""
       
  1850         xml_encoding = sniffed_xml_encoding = None
       
  1851         try:
       
  1852             if xml_data[:4] == '\x4c\x6f\xa7\x94':
       
  1853                 # EBCDIC
       
  1854                 xml_data = self._ebcdic_to_ascii(xml_data)
       
  1855             elif xml_data[:4] == '\x00\x3c\x00\x3f':
       
  1856                 # UTF-16BE
       
  1857                 sniffed_xml_encoding = 'utf-16be'
       
  1858                 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
       
  1859             elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
       
  1860                      and (xml_data[2:4] != '\x00\x00'):
       
  1861                 # UTF-16BE with BOM
       
  1862                 sniffed_xml_encoding = 'utf-16be'
       
  1863                 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
       
  1864             elif xml_data[:4] == '\x3c\x00\x3f\x00':
       
  1865                 # UTF-16LE
       
  1866                 sniffed_xml_encoding = 'utf-16le'
       
  1867                 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
       
  1868             elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
       
  1869                      (xml_data[2:4] != '\x00\x00'):
       
  1870                 # UTF-16LE with BOM
       
  1871                 sniffed_xml_encoding = 'utf-16le'
       
  1872                 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
       
  1873             elif xml_data[:4] == '\x00\x00\x00\x3c':
       
  1874                 # UTF-32BE
       
  1875                 sniffed_xml_encoding = 'utf-32be'
       
  1876                 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
       
  1877             elif xml_data[:4] == '\x3c\x00\x00\x00':
       
  1878                 # UTF-32LE
       
  1879                 sniffed_xml_encoding = 'utf-32le'
       
  1880                 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
       
  1881             elif xml_data[:4] == '\x00\x00\xfe\xff':
       
  1882                 # UTF-32BE with BOM
       
  1883                 sniffed_xml_encoding = 'utf-32be'
       
  1884                 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
       
  1885             elif xml_data[:4] == '\xff\xfe\x00\x00':
       
  1886                 # UTF-32LE with BOM
       
  1887                 sniffed_xml_encoding = 'utf-32le'
       
  1888                 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
       
  1889             elif xml_data[:3] == '\xef\xbb\xbf':
       
  1890                 # UTF-8 with BOM
       
  1891                 sniffed_xml_encoding = 'utf-8'
       
  1892                 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
       
  1893             else:
       
  1894                 sniffed_xml_encoding = 'ascii'
       
  1895                 pass
       
  1896         except:
       
  1897             xml_encoding_match = None
       
  1898         xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
       
  1899         xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
       
  1900         if not xml_encoding_match and isHTML:
       
  1901             meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
       
  1902             regexp = re.compile(meta_re, re.I)
       
  1903             xml_encoding_match = regexp.search(xml_data)
       
  1904         if xml_encoding_match is not None:
       
  1905             xml_encoding = xml_encoding_match.groups()[0].decode(
       
  1906                 'ascii').lower()
       
  1907             if isHTML:
       
  1908                 self.declaredHTMLEncoding = xml_encoding
       
  1909             if sniffed_xml_encoding and \
       
  1910                (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
       
  1911                                  'iso-10646-ucs-4', 'ucs-4', 'csucs4',
       
  1912                                  'utf-16', 'utf-32', 'utf_16', 'utf_32',
       
  1913                                  'utf16', 'u16')):
       
  1914                 xml_encoding = sniffed_xml_encoding
       
  1915         return xml_data, xml_encoding, sniffed_xml_encoding
       
  1916 
       
  1917 
       
  1918     def find_codec(self, charset):
       
  1919         return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
       
  1920                or (charset and self._codec(charset.replace("-", ""))) \
       
  1921                or (charset and self._codec(charset.replace("-", "_"))) \
       
  1922                or charset
       
  1923 
       
  1924     def _codec(self, charset):
       
  1925         if not charset: return charset
       
  1926         codec = None
       
  1927         try:
       
  1928             codecs.lookup(charset)
       
  1929             codec = charset
       
  1930         except (LookupError, ValueError):
       
  1931             pass
       
  1932         return codec
       
  1933 
       
  1934     EBCDIC_TO_ASCII_MAP = None
       
  1935     def _ebcdic_to_ascii(self, s):
       
  1936         c = self.__class__
       
  1937         if not c.EBCDIC_TO_ASCII_MAP:
       
  1938             emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
       
  1939                     16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
       
  1940                     128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
       
  1941                     144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
       
  1942                     32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
       
  1943                     38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
       
  1944                     45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
       
  1945                     186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
       
  1946                     195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
       
  1947                     201,202,106,107,108,109,110,111,112,113,114,203,204,205,
       
  1948                     206,207,208,209,126,115,116,117,118,119,120,121,122,210,
       
  1949                     211,212,213,214,215,216,217,218,219,220,221,222,223,224,
       
  1950                     225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
       
  1951                     73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
       
  1952                     82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
       
  1953                     90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
       
  1954                     250,251,252,253,254,255)
       
  1955             import string
       
  1956             c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
       
  1957             ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
       
  1958         return s.translate(c.EBCDIC_TO_ASCII_MAP)
       
  1959 
       
  1960     MS_CHARS = { '\x80' : ('euro', '20AC'),
       
  1961                  '\x81' : ' ',
       
  1962                  '\x82' : ('sbquo', '201A'),
       
  1963                  '\x83' : ('fnof', '192'),
       
  1964                  '\x84' : ('bdquo', '201E'),
       
  1965                  '\x85' : ('hellip', '2026'),
       
  1966                  '\x86' : ('dagger', '2020'),
       
  1967                  '\x87' : ('Dagger', '2021'),
       
  1968                  '\x88' : ('circ', '2C6'),
       
  1969                  '\x89' : ('permil', '2030'),
       
  1970                  '\x8A' : ('Scaron', '160'),
       
  1971                  '\x8B' : ('lsaquo', '2039'),
       
  1972                  '\x8C' : ('OElig', '152'),
       
  1973                  '\x8D' : '?',
       
  1974                  '\x8E' : ('#x17D', '17D'),
       
  1975                  '\x8F' : '?',
       
  1976                  '\x90' : '?',
       
  1977                  '\x91' : ('lsquo', '2018'),
       
  1978                  '\x92' : ('rsquo', '2019'),
       
  1979                  '\x93' : ('ldquo', '201C'),
       
  1980                  '\x94' : ('rdquo', '201D'),
       
  1981                  '\x95' : ('bull', '2022'),
       
  1982                  '\x96' : ('ndash', '2013'),
       
  1983                  '\x97' : ('mdash', '2014'),
       
  1984                  '\x98' : ('tilde', '2DC'),
       
  1985                  '\x99' : ('trade', '2122'),
       
  1986                  '\x9a' : ('scaron', '161'),
       
  1987                  '\x9b' : ('rsaquo', '203A'),
       
  1988                  '\x9c' : ('oelig', '153'),
       
  1989                  '\x9d' : '?',
       
  1990                  '\x9e' : ('#x17E', '17E'),
       
  1991                  '\x9f' : ('Yuml', ''),}
       
  1992 
       
  1993 #######################################################################
       
  1994 
       
  1995 
       
  1996 #By default, act as an HTML pretty-printer.
       
  1997 if __name__ == '__main__':
       
  1998     import sys
       
  1999     soup = BeautifulSoup(sys.stdin)
       
  2000     print soup.prettify()