Subversion Repositories Code-Repo


Rev Author Line No. Line
100 Kevin 1
"""Beautiful Soup
2
Elixir and Tonic
3
"The Screen-Scraper's Friend"
4
http://www.crummy.com/software/BeautifulSoup/
5
 
6
Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7
tree representation. It provides methods and Pythonic idioms that make
8
it easy to navigate, search, and modify the tree.
9
 
10
A well-formed XML/HTML document yields a well-formed data
11
structure. An ill-formed XML/HTML document yields a correspondingly
12
ill-formed data structure. If your document is only locally
13
well-formed, you can use this library to find and process the
14
well-formed part of it.
15
 
16
Beautiful Soup works with Python 2.2 and up. It has no external
17
dependencies, but you'll have more success at converting data to UTF-8
18
if you also install these three packages:
19
 
20
* chardet, for auto-detecting character encodings
21
  http://chardet.feedparser.org/
22
* cjkcodecs and iconv_codec, which add more encodings to the ones supported
23
  by stock Python.
24
  http://cjkpython.i18n.org/
25
 
26
Beautiful Soup defines classes for two main parsing strategies:
27
 
28
 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29
   language that kind of looks like XML.
30
 
31
 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32
   or invalid. This class has web browser-like heuristics for
33
   obtaining a sensible parse tree in the face of common HTML errors.
34
 
35
Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36
the encoding of an HTML or XML document, and converting it to
37
Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38
 
39
For more than you ever wanted to know about Beautiful Soup, see the
40
documentation:
41
http://www.crummy.com/software/BeautifulSoup/documentation.html
42
 
43
Here, have some legalese:
44
 
45
Copyright (c) 2004-2010, Leonard Richardson
46
 
47
All rights reserved.
48
 
49
Redistribution and use in source and binary forms, with or without
50
modification, are permitted provided that the following conditions are
51
met:
52
 
53
  * Redistributions of source code must retain the above copyright
54
    notice, this list of conditions and the following disclaimer.
55
 
56
  * Redistributions in binary form must reproduce the above
57
    copyright notice, this list of conditions and the following
58
    disclaimer in the documentation and/or other materials provided
59
    with the distribution.
60
 
61
  * Neither the name of the the Beautiful Soup Consortium and All
62
    Night Kosher Bakery nor the names of its contributors may be
63
    used to endorse or promote products derived from this software
64
    without specific prior written permission.
65
 
66
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77
 
78
"""
79
from __future__ import generators
80
 
81
__author__ = "Leonard Richardson (leonardr@segfault.org)"
82
__version__ = "3.2.0"
83
__copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
84
__license__ = "New-style BSD"
85
 
86
from sgmllib import SGMLParser, SGMLParseError
87
import codecs
88
import markupbase
89
import types
90
import re
91
import sgmllib
92
try:
93
  from htmlentitydefs import name2codepoint
94
except ImportError:
95
  name2codepoint = {}
96
try:
97
    set
98
except NameError:
99
    from sets import Set as set
100
 
101
#These hacks make Beautiful Soup able to parse XML with namespaces
102
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
103
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
104
 
105
DEFAULT_OUTPUT_ENCODING = "utf-8"
106
 
107
def _match_css_class(str):
108
    """Build a RE to match the given CSS class."""
109
    return re.compile(r"(^|.*\s)%s($|\s)" % str)
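
For instance, the pattern built here matches a whitespace-separated class list that contains the given class, but not a longer class name that merely begins with it; a quick illustrative check:

pattern = _match_css_class("big")
assert pattern.match("big")                # the only class
assert pattern.match("big red")            # first of several classes
assert pattern.match("red big")            # later in the class list
assert pattern.match("bigger") is None     # partial word does not match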
110
 
111
# First, the classes that represent markup elements.
112
 
113
class PageElement(object):
114
    """Contains the navigational information for some part of the page
115
    (either a tag or a piece of text)"""
116
 
117
    def setup(self, parent=None, previous=None):
118
        """Sets up the initial relations between this element and
119
        other elements."""
120
        self.parent = parent
121
        self.previous = previous
122
        self.next = None
123
        self.previousSibling = None
124
        self.nextSibling = None
125
        if self.parent and self.parent.contents:
126
            self.previousSibling = self.parent.contents[-1]
127
            self.previousSibling.nextSibling = self
128
 
129
    def replaceWith(self, replaceWith):
130
        oldParent = self.parent
131
        myIndex = self.parent.index(self)
132
        if hasattr(replaceWith, "parent")\
133
                  and replaceWith.parent is self.parent:
134
            # We're replacing this element with one of its siblings.
135
            index = replaceWith.parent.index(replaceWith)
136
            if index and index < myIndex:
137
                # Furthermore, it comes before this element. That
138
                # means that when we extract it, the index of this
139
                # element will change.
140
                myIndex = myIndex - 1
141
        self.extract()
142
        oldParent.insert(myIndex, replaceWith)
143
 
144
    def replaceWithChildren(self):
145
        myParent = self.parent
146
        myIndex = self.parent.index(self)
147
        self.extract()
148
        reversedChildren = list(self.contents)
149
        reversedChildren.reverse()
150
        for child in reversedChildren:
151
            myParent.insert(myIndex, child)
152
 
153
    def extract(self):
154
        """Destructively rips this element out of the tree."""
155
        if self.parent:
156
            try:
157
                del self.parent.contents[self.parent.index(self)]
158
            except ValueError:
159
                pass
160
 
161
        #Find the two elements that would be next to each other if
162
        #this element (and any children) hadn't been parsed. Connect
163
        #the two.
164
        lastChild = self._lastRecursiveChild()
165
        nextElement = lastChild.next
166
 
167
        if self.previous:
168
            self.previous.next = nextElement
169
        if nextElement:
170
            nextElement.previous = self.previous
171
        self.previous = None
172
        lastChild.next = None
173
 
174
        self.parent = None
175
        if self.previousSibling:
176
            self.previousSibling.nextSibling = self.nextSibling
177
        if self.nextSibling:
178
            self.nextSibling.previousSibling = self.previousSibling
179
        self.previousSibling = self.nextSibling = None
180
        return self
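
A minimal sketch of what extract() and replaceWith() do to a parsed tree, assuming the module is importable as BeautifulSoup:

from BeautifulSoup import BeautifulSoup

soup = BeautifulSoup('<p>Keep <b>this bold bit</b> around</p>')
bold = soup.find('b')
bold.extract()                    # rips <b> out; it becomes the root of its own tree
print soup                        # the <p> no longer contains the <b>
print bold                        # -> <b>this bold bit</b>

soup2 = BeautifulSoup('<p>One <i>two</i> three</p>')
soup2.find('i').replaceWith('2')  # swap the <i> tag for a plain string
print soup2                       # -> <p>One 2 three</p>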
181
 
182
    def _lastRecursiveChild(self):
183
        "Finds the last element beneath this object to be parsed."
184
        lastChild = self
185
        while hasattr(lastChild, 'contents') and lastChild.contents:
186
            lastChild = lastChild.contents[-1]
187
        return lastChild
188
 
189
    def insert(self, position, newChild):
190
        if isinstance(newChild, basestring) \
191
            and not isinstance(newChild, NavigableString):
192
            newChild = NavigableString(newChild)
193
 
194
        position =  min(position, len(self.contents))
195
        if hasattr(newChild, 'parent') and newChild.parent is not None:
196
            # We're 'inserting' an element that's already one
197
            # of this object's children.
198
            if newChild.parent is self:
199
                index = self.index(newChild)
200
                if index > position:
201
                    # Furthermore we're moving it further down the
202
                    # list of this object's children. That means that
203
                    # when we extract this element, our target index
204
                    # will jump down one.
205
                    position = position - 1
206
            newChild.extract()
207
 
208
        newChild.parent = self
209
        previousChild = None
210
        if position == 0:
211
            newChild.previousSibling = None
212
            newChild.previous = self
213
        else:
214
            previousChild = self.contents[position-1]
215
            newChild.previousSibling = previousChild
216
            newChild.previousSibling.nextSibling = newChild
217
            newChild.previous = previousChild._lastRecursiveChild()
218
        if newChild.previous:
219
            newChild.previous.next = newChild
220
 
221
        newChildsLastElement = newChild._lastRecursiveChild()
222
 
223
        if position >= len(self.contents):
224
            newChild.nextSibling = None
225
 
226
            parent = self
227
            parentsNextSibling = None
228
            while not parentsNextSibling:
229
                parentsNextSibling = parent.nextSibling
230
                parent = parent.parent
231
                if not parent: # This is the last element in the document.
232
                    break
233
            if parentsNextSibling:
234
                newChildsLastElement.next = parentsNextSibling
235
            else:
236
                newChildsLastElement.next = None
237
        else:
238
            nextChild = self.contents[position]
239
            newChild.nextSibling = nextChild
240
            if newChild.nextSibling:
241
                newChild.nextSibling.previousSibling = newChild
242
            newChildsLastElement.next = nextChild
243
 
244
        if newChildsLastElement.next:
245
            newChildsLastElement.next.previous = newChildsLastElement
246
        self.contents.insert(position, newChild)
247
 
248
    def append(self, tag):
249
        """Appends the given tag to the contents of this tag."""
250
        self.insert(len(self.contents), tag)
251
 
252
    def findNext(self, name=None, attrs={}, text=None, **kwargs):
253
        """Returns the first item that matches the given criteria and
254
        appears after this Tag in the document."""
255
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
256
 
257
    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
258
                    **kwargs):
259
        """Returns all items that match the given criteria and appear
260
        after this Tag in the document."""
261
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
262
                             **kwargs)
263
 
264
    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
265
        """Returns the closest sibling to this Tag that matches the
266
        given criteria and appears after this Tag in the document."""
267
        return self._findOne(self.findNextSiblings, name, attrs, text,
268
                             **kwargs)
269
 
270
    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
271
                         **kwargs):
272
        """Returns the siblings of this Tag that match the given
273
        criteria and appear after this Tag in the document."""
274
        return self._findAll(name, attrs, text, limit,
275
                             self.nextSiblingGenerator, **kwargs)
276
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
277
 
278
    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
279
        """Returns the first item that matches the given criteria and
280
        appears before this Tag in the document."""
281
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
282
 
283
    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
284
                        **kwargs):
285
        """Returns all items that match the given criteria and appear
286
        before this Tag in the document."""
287
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
288
                           **kwargs)
289
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x
290
 
291
    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
292
        """Returns the closest sibling to this Tag that matches the
293
        given criteria and appears before this Tag in the document."""
294
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
295
                             **kwargs)
296
 
297
    def findPreviousSiblings(self, name=None, attrs={}, text=None,
298
                             limit=None, **kwargs):
299
        """Returns the siblings of this Tag that match the given
300
        criteria and appear before this Tag in the document."""
301
        return self._findAll(name, attrs, text, limit,
302
                             self.previousSiblingGenerator, **kwargs)
303
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
304
 
305
    def findParent(self, name=None, attrs={}, **kwargs):
306
        """Returns the closest parent of this Tag that matches the given
307
        criteria."""
308
        # NOTE: We can't use _findOne because findParents takes a different
309
        # set of arguments.
310
        r = None
311
        l = self.findParents(name, attrs, 1)
312
        if l:
313
            r = l[0]
314
        return r
315
 
316
    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
317
        """Returns the parents of this Tag that match the given
318
        criteria."""
319
 
320
        return self._findAll(name, attrs, None, limit, self.parentGenerator,
321
                             **kwargs)
322
    fetchParents = findParents # Compatibility with pre-3.x
323
 
324
    #These methods do the real heavy lifting.
325
 
326
    def _findOne(self, method, name, attrs, text, **kwargs):
327
        r = None
328
        l = method(name, attrs, text, 1, **kwargs)
329
        if l:
330
            r = l[0]
331
        return r
332
 
333
    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
334
        "Iterates over a generator looking for things that match."
335
 
336
        if isinstance(name, SoupStrainer):
337
            strainer = name
338
        # (Possibly) special case some findAll*(...) searches
339
        elif text is None and not limit and not attrs and not kwargs:
340
            # findAll*(True)
341
            if name is True:
342
                return [element for element in generator()
343
                        if isinstance(element, Tag)]
344
            # findAll*('tag-name')
345
            elif isinstance(name, basestring):
346
                return [element for element in generator()
347
                        if isinstance(element, Tag) and
348
                        element.name == name]
349
            else:
350
                strainer = SoupStrainer(name, attrs, text, **kwargs)
351
        # Build a SoupStrainer
352
        else:
353
            strainer = SoupStrainer(name, attrs, text, **kwargs)
354
        results = ResultSet(strainer)
355
        g = generator()
356
        while True:
357
            try:
358
                i = g.next()
359
            except StopIteration:
360
                break
361
            if i:
362
                found = strainer.search(i)
363
                if found:
364
                    results.append(found)
365
                    if limit and len(results) >= limit:
366
                        break
367
        return results
368
 
369
    #These Generators can be used to navigate starting from both
370
    #NavigableStrings and Tags.
371
    def nextGenerator(self):
372
        i = self
373
        while i is not None:
374
            i = i.next
375
            yield i
376
 
377
    def nextSiblingGenerator(self):
378
        i = self
379
        while i is not None:
380
            i = i.nextSibling
381
            yield i
382
 
383
    def previousGenerator(self):
384
        i = self
385
        while i is not None:
386
            i = i.previous
387
            yield i
388
 
389
    def previousSiblingGenerator(self):
390
        i = self
391
        while i is not None:
392
            i = i.previousSibling
393
            yield i
394
 
395
    def parentGenerator(self):
396
        i = self
397
        while i is not None:
398
            i = i.parent
399
            yield i
400
 
401
    # Utility methods
402
    def substituteEncoding(self, str, encoding=None):
403
        encoding = encoding or "utf-8"
404
        return str.replace("%SOUP-ENCODING%", encoding)
405
 
406
    def toEncoding(self, s, encoding=None):
407
        """Encodes an object to a string in some encoding, or to Unicode.
408
        ."""
409
        if isinstance(s, unicode):
410
            if encoding:
411
                s = s.encode(encoding)
412
        elif isinstance(s, str):
413
            if encoding:
414
                s = s.encode(encoding)
415
            else:
416
                s = unicode(s)
417
        else:
418
            if encoding:
419
                s  = self.toEncoding(str(s), encoding)
420
            else:
421
                s = unicode(s)
422
        return s
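
The find*/findAll* methods above walk the document in parse order (or up the parent chain) from this element, rather than searching only one tag's children; a brief sketch:

from BeautifulSoup import BeautifulSoup

soup = BeautifulSoup('<div><h1>Title</h1><p>First</p><p>Second</p></div>')
h1 = soup.find('h1')
print h1.findNext('p').string                   # -> First  (next <p> in document order)
print h1.findNextSibling('p').string            # -> First  (next <p> at the same level)
print h1.findParent('div').name                 # -> div
print [p.string for p in h1.findAllNext('p')]   # -> [u'First', u'Second']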
423
 
424
class NavigableString(unicode, PageElement):
425
 
426
    def __new__(cls, value):
427
        """Create a new NavigableString.
428
 
429
        When unpickling a NavigableString, this method is called with
430
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
431
        passed in to the superclass's __new__ or the superclass won't know
432
        how to handle non-ASCII characters.
433
        """
434
        if isinstance(value, unicode):
435
            return unicode.__new__(cls, value)
436
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
437
 
438
    def __getnewargs__(self):
439
        return (NavigableString.__str__(self),)
440
 
441
    def __getattr__(self, attr):
442
        """text.string gives you text. This is for backwards
443
        compatibility for Navigable*String, but for CData* it lets you
444
        get the string without the CData wrapper."""
445
        if attr == 'string':
446
            return self
447
        else:
448
            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
449
 
450
    def __unicode__(self):
451
        return str(self).decode(DEFAULT_OUTPUT_ENCODING)
452
 
453
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
454
        if encoding:
455
            return self.encode(encoding)
456
        else:
457
            return self
458
 
459
class CData(NavigableString):
460
 
461
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
462
        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
463
 
464
class ProcessingInstruction(NavigableString):
465
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
466
        output = self
467
        if "%SOUP-ENCODING%" in output:
468
            output = self.substituteEncoding(output, encoding)
469
        return "<?%s?>" % self.toEncoding(output, encoding)
470
 
471
class Comment(NavigableString):
472
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
473
        return "<!--%s-->" % NavigableString.__str__(self, encoding)
474
 
475
class Declaration(NavigableString):
476
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
477
        return "<!%s>" % NavigableString.__str__(self, encoding)
478
 
479
class Tag(PageElement):
480
 
481
    """Represents a found HTML tag with its attributes and contents."""
482
 
483
    def _invert(h):
484
        "Cheap function to invert a hash."
485
        i = {}
486
        for k,v in h.items():
487
            i[v] = k
488
        return i
489
 
490
    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
491
                                      "quot" : '"',
492
                                      "amp" : "&",
493
                                      "lt" : "<",
494
                                      "gt" : ">" }
495
 
496
    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
497
 
498
    def _convertEntities(self, match):
499
        """Used in a call to re.sub to replace HTML, XML, and numeric
500
        entities with the appropriate Unicode characters. If HTML
501
        entities are being converted, any unrecognized entities are
502
        escaped."""
503
        x = match.group(1)
504
        if self.convertHTMLEntities and x in name2codepoint:
505
            return unichr(name2codepoint[x])
506
        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
507
            if self.convertXMLEntities:
508
                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
509
            else:
510
                return u'&%s;' % x
511
        elif len(x) > 0 and x[0] == '#':
512
            # Handle numeric entities
513
            if len(x) > 1 and x[1] == 'x':
514
                return unichr(int(x[2:], 16))
515
            else:
516
                return unichr(int(x[1:]))
517
 
518
        elif self.escapeUnrecognizedEntities:
519
            return u'&amp;%s;' % x
520
        else:
521
            return u'&%s;' % x
522
 
523
    def __init__(self, parser, name, attrs=None, parent=None,
524
                 previous=None):
525
        "Basic constructor."
526
 
527
        # We don't actually store the parser object: that lets extracted
528
        # chunks be garbage-collected
529
        self.parserClass = parser.__class__
530
        self.isSelfClosing = parser.isSelfClosingTag(name)
531
        self.name = name
532
        if attrs is None:
533
            attrs = []
534
        elif isinstance(attrs, dict):
535
            attrs = attrs.items()
536
        self.attrs = attrs
537
        self.contents = []
538
        self.setup(parent, previous)
539
        self.hidden = False
540
        self.containsSubstitutions = False
541
        self.convertHTMLEntities = parser.convertHTMLEntities
542
        self.convertXMLEntities = parser.convertXMLEntities
543
        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
544
 
545
        # Convert any HTML, XML, or numeric entities in the attribute values.
546
        convert = lambda(k, val): (k,
547
                                   re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
548
                                          self._convertEntities,
549
                                          val))
550
        self.attrs = map(convert, self.attrs)
551
 
552
    def getString(self):
553
        if (len(self.contents) == 1
554
            and isinstance(self.contents[0], NavigableString)):
555
            return self.contents[0]
556
 
557
    def setString(self, string):
558
        """Replace the contents of the tag with a string"""
559
        self.clear()
560
        self.append(string)
561
 
562
    string = property(getString, setString)
563
 
564
    def getText(self, separator=u""):
565
        if not len(self.contents):
566
            return u""
567
        stopNode = self._lastRecursiveChild().next
568
        strings = []
569
        current = self.contents[0]
570
        while current is not stopNode:
571
            if isinstance(current, NavigableString):
572
                strings.append(current.strip())
573
            current = current.next
574
        return separator.join(strings)
575
 
576
    text = property(getText)
577
 
578
    def get(self, key, default=None):
579
        """Returns the value of the 'key' attribute for the tag, or
580
        the value given for 'default' if it doesn't have that
581
        attribute."""
582
        return self._getAttrMap().get(key, default)
583
 
584
    def clear(self):
585
        """Extract all children."""
586
        for child in self.contents[:]:
587
            child.extract()
588
 
589
    def index(self, element):
590
        for i, child in enumerate(self.contents):
591
            if child is element:
592
                return i
593
        raise ValueError("Tag.index: element not in tag")
594
 
595
    def has_key(self, key):
596
        return self._getAttrMap().has_key(key)
597
 
598
    def __getitem__(self, key):
599
        """tag[key] returns the value of the 'key' attribute for the tag,
600
        and throws an exception if it's not there."""
601
        return self._getAttrMap()[key]
602
 
603
    def __iter__(self):
604
        "Iterating over a tag iterates over its contents."
605
        return iter(self.contents)
606
 
607
    def __len__(self):
608
        "The length of a tag is the length of its list of contents."
609
        return len(self.contents)
610
 
611
    def __contains__(self, x):
612
        return x in self.contents
613
 
614
    def __nonzero__(self):
615
        "A tag is non-None even if it has no contents."
616
        return True
617
 
618
    def __setitem__(self, key, value):
619
        """Setting tag[key] sets the value of the 'key' attribute for the
620
        tag."""
621
        self._getAttrMap()
622
        self.attrMap[key] = value
623
        found = False
624
        for i in range(0, len(self.attrs)):
625
            if self.attrs[i][0] == key:
626
                self.attrs[i] = (key, value)
627
                found = True
628
        if not found:
629
            self.attrs.append((key, value))
630
        self._getAttrMap()[key] = value
631
 
632
    def __delitem__(self, key):
633
        "Deleting tag[key] deletes all 'key' attributes for the tag."
634
        for item in self.attrs:
635
            if item[0] == key:
636
                self.attrs.remove(item)
637
                #We don't break because bad HTML can define the same
638
                #attribute multiple times.
639
            self._getAttrMap()
640
            if self.attrMap.has_key(key):
641
                del self.attrMap[key]
642
 
643
    def __call__(self, *args, **kwargs):
644
        """Calling a tag like a function is the same as calling its
645
        findAll() method. Eg. tag('a') returns a list of all the A tags
646
        found within this tag."""
647
        return apply(self.findAll, args, kwargs)
648
 
649
    def __getattr__(self, tag):
650
        #print "Getattr %s.%s" % (self.__class__, tag)
651
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
652
            return self.find(tag[:-3])
653
        elif tag.find('__') != 0:
654
            return self.find(tag)
655
        raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
656
 
657
    def __eq__(self, other):
658
        """Returns true iff this tag has the same name, the same attributes,
659
        and the same contents (recursively) as the given tag.
660
 
661
        NOTE: right now this will return false if two tags have the
662
        same attributes in a different order. Should this be fixed?"""
663
        if other is self:
664
            return True
665
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
666
            return False
667
        for i in range(0, len(self.contents)):
668
            if self.contents[i] != other.contents[i]:
669
                return False
670
        return True
671
 
672
    def __ne__(self, other):
673
        """Returns true iff this tag is not identical to the other tag,
674
        as defined in __eq__."""
675
        return not self == other
676
 
677
    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
678
        """Renders this tag as a string."""
679
        return self.__str__(encoding)
680
 
681
    def __unicode__(self):
682
        return self.__str__(None)
683
 
684
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
685
                                           + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
686
                                           + ")")
687
 
688
    def _sub_entity(self, x):
689
        """Used with a regular expression to substitute the
690
        appropriate XML entity for an XML special character."""
691
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
692
 
693
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
694
                prettyPrint=False, indentLevel=0):
695
        """Returns a string or Unicode representation of this tag and
696
        its contents. To get Unicode, pass None for encoding.
697
 
698
        NOTE: since Python's HTML parser consumes whitespace, this
699
        method is not certain to reproduce the whitespace present in
700
        the original string."""
701
 
702
        encodedName = self.toEncoding(self.name, encoding)
703
 
704
        attrs = []
705
        if self.attrs:
706
            for key, val in self.attrs:
707
                fmt = '%s="%s"'
708
                if isinstance(val, basestring):
709
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
710
                        val = self.substituteEncoding(val, encoding)
711
 
712
                    # The attribute value either:
713
                    #
714
                    # * Contains no embedded double quotes or single quotes.
715
                    #   No problem: we enclose it in double quotes.
716
                    # * Contains embedded single quotes. No problem:
717
                    #   double quotes work here too.
718
                    # * Contains embedded double quotes. No problem:
719
                    #   we enclose it in single quotes.
720
                    # * Embeds both single _and_ double quotes. This
721
                    #   can't happen naturally, but it can happen if
722
                    #   you modify an attribute value after parsing
723
                    #   the document. Now we have a bit of a
724
                    #   problem. We solve it by enclosing the
725
                    #   attribute in single quotes, and escaping any
726
                    #   embedded single quotes to XML entities.
727
                    if '"' in val:
728
                        fmt = "%s='%s'"
729
                        if "'" in val:
730
                            # TODO: replace with apos when
731
                            # appropriate.
732
                            val = val.replace("'", "&squot;")
733
 
734
                    # Now we're okay w/r/t quotes. But the attribute
735
                    # value might also contain angle brackets, or
736
                    # ampersands that aren't part of entities. We need
737
                    # to escape those to XML entities too.
738
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
739
 
740
                attrs.append(fmt % (self.toEncoding(key, encoding),
741
                                    self.toEncoding(val, encoding)))
742
        close = ''
743
        closeTag = ''
744
        if self.isSelfClosing:
745
            close = ' /'
746
        else:
747
            closeTag = '</%s>' % encodedName
748
 
749
        indentTag, indentContents = 0, 0
750
        if prettyPrint:
751
            indentTag = indentLevel
752
            space = (' ' * (indentTag-1))
753
            indentContents = indentTag + 1
754
        contents = self.renderContents(encoding, prettyPrint, indentContents)
755
        if self.hidden:
756
            s = contents
757
        else:
758
            s = []
759
            attributeString = ''
760
            if attrs:
761
                attributeString = ' ' + ' '.join(attrs)
762
            if prettyPrint:
763
                s.append(space)
764
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
765
            if prettyPrint:
766
                s.append("\n")
767
            s.append(contents)
768
            if prettyPrint and contents and contents[-1] != "\n":
769
                s.append("\n")
770
            if prettyPrint and closeTag:
771
                s.append(space)
772
            s.append(closeTag)
773
            if prettyPrint and closeTag and self.nextSibling:
774
                s.append("\n")
775
            s = ''.join(s)
776
        return s
777
 
778
    def decompose(self):
779
        """Recursively destroys the contents of this tree."""
780
        self.extract()
781
        if len(self.contents) == 0:
782
            return
783
        current = self.contents[0]
784
        while current is not None:
785
            next = current.next
786
            if isinstance(current, Tag):
787
                del current.contents[:]
788
            current.parent = None
789
            current.previous = None
790
            current.previousSibling = None
791
            current.next = None
792
            current.nextSibling = None
793
            current = next
794
 
795
    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
796
        return self.__str__(encoding, True)
797
 
798
    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
799
                       prettyPrint=False, indentLevel=0):
800
        """Renders the contents of this tag as a string in the given
801
        encoding. If encoding is None, returns a Unicode string."""
802
        s=[]
803
        for c in self:
804
            text = None
805
            if isinstance(c, NavigableString):
806
                text = c.__str__(encoding)
807
            elif isinstance(c, Tag):
808
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
809
            if text and prettyPrint:
810
                text = text.strip()
811
            if text:
812
                if prettyPrint:
813
                    s.append(" " * (indentLevel-1))
814
                s.append(text)
815
                if prettyPrint:
816
                    s.append("\n")
817
        return ''.join(s)
818
 
819
    #Soup methods
820
 
821
    def find(self, name=None, attrs={}, recursive=True, text=None,
822
             **kwargs):
823
        """Return only the first child of this Tag matching the given
824
        criteria."""
825
        r = None
826
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
827
        if l:
828
            r = l[0]
829
        return r
830
    findChild = find
831
 
832
    def findAll(self, name=None, attrs={}, recursive=True, text=None,
833
                limit=None, **kwargs):
834
        """Extracts a list of Tag objects that match the given
835
        criteria.  You can specify the name of the Tag and any
836
        attributes you want the Tag to have.
837
 
838
        The value of a key-value pair in the 'attrs' map can be a
839
        string, a list of strings, a regular expression object, or a
840
        callable that takes a string and returns whether or not the
841
        string matches for some custom definition of 'matches'. The
842
        same is true of the tag name."""
843
        generator = self.recursiveChildGenerator
844
        if not recursive:
845
            generator = self.childGenerator
846
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
847
    findChildren = findAll
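
As the docstring above notes, both the tag name and attribute values can be matched with strings, lists, regular expressions, or callables; a short sketch:

import re
from BeautifulSoup import BeautifulSoup

soup = BeautifulSoup('<a href="http://a.example/">A</a>'
                     '<a href="https://b.example/">B</a>'
                     '<b>bold</b>')

print len(soup.findAll('a'))                          # -> 2  (match by name)
print len(soup.findAll(['a', 'b']))                   # -> 3  (list of names)
print len(soup.findAll(href=re.compile('^https:')))   # -> 1  (attribute regexp)
print len(soup(lambda tag: tag.name == 'b'))          # -> 1  (callable; soup(...) is findAll)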
848
 
849
    # Pre-3.x compatibility methods
850
    first = find
851
    fetch = findAll
852
 
853
    def fetchText(self, text=None, recursive=True, limit=None):
854
        return self.findAll(text=text, recursive=recursive, limit=limit)
855
 
856
    def firstText(self, text=None, recursive=True):
857
        return self.find(text=text, recursive=recursive)
858
 
859
    #Private methods
860
 
861
    def _getAttrMap(self):
862
        """Initializes a map representation of this tag's attributes,
863
        if not already initialized."""
864
        if not getattr(self, 'attrMap'):
865
            self.attrMap = {}
866
            for (key, value) in self.attrs:
867
                self.attrMap[key] = value
868
        return self.attrMap
869
 
870
    #Generator methods
871
    def childGenerator(self):
872
        # Just use the iterator from the contents
873
        return iter(self.contents)
874
 
875
    def recursiveChildGenerator(self):
876
        if not len(self.contents):
877
            raise StopIteration
878
        stopNode = self._lastRecursiveChild().next
879
        current = self.contents[0]
880
        while current is not stopNode:
881
            yield current
882
            current = current.next
883
 
884
 
885
# Next, a couple classes to represent queries and their results.
886
class SoupStrainer:
887
    """Encapsulates a number of ways of matching a markup element (tag or
888
    text)."""
889
 
890
    def __init__(self, name=None, attrs={}, text=None, **kwargs):
891
        self.name = name
892
        if isinstance(attrs, basestring):
893
            kwargs['class'] = _match_css_class(attrs)
894
            attrs = None
895
        if kwargs:
896
            if attrs:
897
                attrs = attrs.copy()
898
                attrs.update(kwargs)
899
            else:
900
                attrs = kwargs
901
        self.attrs = attrs
902
        self.text = text
903
 
904
    def __str__(self):
905
        if self.text:
906
            return self.text
907
        else:
908
            return "%s|%s" % (self.name, self.attrs)
909
 
910
    def searchTag(self, markupName=None, markupAttrs={}):
911
        found = None
912
        markup = None
913
        if isinstance(markupName, Tag):
914
            markup = markupName
915
            markupAttrs = markup
916
        callFunctionWithTagData = callable(self.name) \
917
                                and not isinstance(markupName, Tag)
918
 
919
        if (not self.name) \
920
               or callFunctionWithTagData \
921
               or (markup and self._matches(markup, self.name)) \
922
               or (not markup and self._matches(markupName, self.name)):
923
            if callFunctionWithTagData:
924
                match = self.name(markupName, markupAttrs)
925
            else:
926
                match = True
927
                markupAttrMap = None
928
                for attr, matchAgainst in self.attrs.items():
929
                    if not markupAttrMap:
930
                         if hasattr(markupAttrs, 'get'):
931
                            markupAttrMap = markupAttrs
932
                         else:
933
                            markupAttrMap = {}
934
                            for k,v in markupAttrs:
935
                                markupAttrMap[k] = v
936
                    attrValue = markupAttrMap.get(attr)
937
                    if not self._matches(attrValue, matchAgainst):
938
                        match = False
939
                        break
940
            if match:
941
                if markup:
942
                    found = markup
943
                else:
944
                    found = markupName
945
        return found
946
 
947
    def search(self, markup):
948
        #print 'looking for %s in %s' % (self, markup)
949
        found = None
950
        # If given a list of items, scan it for a text element that
951
        # matches.
952
        if hasattr(markup, "__iter__") \
953
                and not isinstance(markup, Tag):
954
            for element in markup:
955
                if isinstance(element, NavigableString) \
956
                       and self.search(element):
957
                    found = element
958
                    break
959
        # If it's a Tag, make sure its name or attributes match.
960
        # Don't bother with Tags if we're searching for text.
961
        elif isinstance(markup, Tag):
962
            if not self.text:
963
                found = self.searchTag(markup)
964
        # If it's text, make sure the text matches.
965
        elif isinstance(markup, NavigableString) or \
966
                 isinstance(markup, basestring):
967
            if self._matches(markup, self.text):
968
                found = markup
969
        else:
970
            raise Exception, "I don't know how to match against a %s" \
971
                  % markup.__class__
972
        return found
973
 
974
    def _matches(self, markup, matchAgainst):
975
        #print "Matching %s against %s" % (markup, matchAgainst)
976
        result = False
977
        if matchAgainst is True:
978
            result = markup is not None
979
        elif callable(matchAgainst):
980
            result = matchAgainst(markup)
981
        else:
982
            #Custom match methods take the tag as an argument, but all
983
            #other ways of matching match the tag name as a string.
984
            if isinstance(markup, Tag):
985
                markup = markup.name
986
            if markup and not isinstance(markup, basestring):
987
                markup = unicode(markup)
988
            #Now we know that chunk is either a string, or None.
989
            if hasattr(matchAgainst, 'match'):
990
                # It's a regexp object.
991
                result = markup and matchAgainst.search(markup)
992
            elif hasattr(matchAgainst, '__iter__'): # list-like
993
                result = markup in matchAgainst
994
            elif hasattr(matchAgainst, 'items'):
995
                result = markup.has_key(matchAgainst)
996
            elif matchAgainst and isinstance(markup, basestring):
997
                if isinstance(markup, unicode):
998
                    matchAgainst = unicode(matchAgainst)
999
                else:
1000
                    matchAgainst = str(matchAgainst)
1001
 
1002
            if not result:
1003
                result = matchAgainst == markup
1004
        return result
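
A SoupStrainer is most useful as the parseOnlyThese argument to the soup constructor, which discards everything the strainer rejects while the document is parsed; a brief sketch:

from BeautifulSoup import BeautifulSoup, SoupStrainer

links_only = SoupStrainer('a', href=True)    # only <a> tags that carry an href
doc = '<div><a href="/one">one</a><p>noise</p><a name="x">no href</a></div>'
soup = BeautifulSoup(doc, parseOnlyThese=links_only)
print [tag['href'] for tag in soup.findAll('a')]   # -> [u'/one']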
1005
 
1006
class ResultSet(list):
1007
    """A ResultSet is just a list that keeps track of the SoupStrainer
1008
    that created it."""
1009
    def __init__(self, source):
1010
        list.__init__([])
1011
        self.source = source
1012
 
1013
# Now, some helper functions.
1014
 
1015
def buildTagMap(default, *args):
1016
    """Turns a list of maps, lists, or scalars into a single map.
1017
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
1018
    NESTING_RESET_TAGS maps out of lists and partial maps."""
1019
    built = {}
1020
    for portion in args:
1021
        if hasattr(portion, 'items'):
1022
            #It's a map. Merge it.
1023
            for k,v in portion.items():
1024
                built[k] = v
1025
        elif hasattr(portion, '__iter__'): # is a list
1026
            #It's a list. Map each item to the default.
1027
            for k in portion:
1028
                built[k] = default
1029
        else:
1030
            #It's a scalar. Map it to the default.
1031
            built[portion] = default
1032
    return built
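
For example, mixing a partial map and a list produces a single lookup table:

tag_map = buildTagMap(None, {'li': ['ul', 'ol']}, ['br', 'hr'])
print tag_map['li']    # -> ['ul', 'ol']  (copied from the partial map)
print tag_map['br']    # -> None          (list items map to the default)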
1033
 
1034
# Now, the parser classes.
1035
 
1036
class BeautifulStoneSoup(Tag, SGMLParser):
1037
 
1038
    """This class contains the basic parser and search code. It defines
1039
    a parser that knows nothing about tag behavior except for the
1040
    following:
1041
 
1042
      You can't close a tag without closing all the tags it encloses.
1043
      That is, "<foo><bar></foo>" actually means
1044
      "<foo><bar></bar></foo>".
1045
 
1046
    [Another possible explanation is "<foo><bar /></foo>", but since
1047
    this class defines no SELF_CLOSING_TAGS, it will never use that
1048
    explanation.]
1049
 
1050
    This class is useful for parsing XML or made-up markup languages,
1051
    or when BeautifulSoup makes an assumption counter to what you were
1052
    expecting."""
1053
 
1054
    SELF_CLOSING_TAGS = {}
1055
    NESTABLE_TAGS = {}
1056
    RESET_NESTING_TAGS = {}
1057
    QUOTE_TAGS = {}
1058
    PRESERVE_WHITESPACE_TAGS = []
1059
 
1060
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
1061
                       lambda x: x.group(1) + ' />'),
1062
                      (re.compile('<!\s+([^<>]*)>'),
1063
                       lambda x: '<!' + x.group(1) + '>')
1064
                      ]
1065
 
1066
    ROOT_TAG_NAME = u'[document]'
1067
 
1068
    HTML_ENTITIES = "html"
1069
    XML_ENTITIES = "xml"
1070
    XHTML_ENTITIES = "xhtml"
1071
    # TODO: This only exists for backwards-compatibility
1072
    ALL_ENTITIES = XHTML_ENTITIES
1073
 
1074
    # Used when determining whether a text node is all whitespace and
1075
    # can be replaced with a single space. A text node that contains
1076
    # fancy Unicode spaces (usually non-breaking) should be left
1077
    # alone.
1078
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
1079
 
1080
    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
1081
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
1082
                 convertEntities=None, selfClosingTags=None, isHTML=False):
1083
        """The Soup object is initialized as the 'root tag', and the
1084
        provided markup (which can be a string or a file-like object)
1085
        is fed into the underlying parser.
1086
 
1087
        sgmllib will process most bad HTML, and the BeautifulSoup
1088
        class has some tricks for dealing with some HTML that kills
1089
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
1090
        if your data uses self-closing tags or declarations
1091
        incorrectly.
1092
 
1093
        By default, Beautiful Soup uses regexes to sanitize input,
1094
        avoiding the vast majority of these problems. If the problems
1095
        don't apply to you, pass in False for markupMassage, and
1096
        you'll get better performance.
1097
 
1098
        The default parser massage techniques fix the two most common
1099
        instances of invalid HTML that choke sgmllib:
1100
 
1101
         <br/> (no space before the '/>' that closes a self-closing tag)
1102
         <! --Comment--> (Extraneous whitespace in declaration)
1103
 
1104
        You can pass in a custom list of (RE object, replace method)
1105
        tuples to get Beautiful Soup to scrub your input the way you
1106
        want."""
1107
 
1108
        self.parseOnlyThese = parseOnlyThese
1109
        self.fromEncoding = fromEncoding
1110
        self.smartQuotesTo = smartQuotesTo
1111
        self.convertEntities = convertEntities
1112
        # Set the rules for how we'll deal with the entities we
1113
        # encounter
1114
        if self.convertEntities:
1115
            # It doesn't make sense to convert encoded characters to
1116
            # entities even while you're converting entities to Unicode.
1117
            # Just convert it all to Unicode.
1118
            self.smartQuotesTo = None
1119
            if convertEntities == self.HTML_ENTITIES:
1120
                self.convertXMLEntities = False
1121
                self.convertHTMLEntities = True
1122
                self.escapeUnrecognizedEntities = True
1123
            elif convertEntities == self.XHTML_ENTITIES:
1124
                self.convertXMLEntities = True
1125
                self.convertHTMLEntities = True
1126
                self.escapeUnrecognizedEntities = False
1127
            elif convertEntities == self.XML_ENTITIES:
1128
                self.convertXMLEntities = True
1129
                self.convertHTMLEntities = False
1130
                self.escapeUnrecognizedEntities = False
1131
        else:
1132
            self.convertXMLEntities = False
1133
            self.convertHTMLEntities = False
1134
            self.escapeUnrecognizedEntities = False
1135
 
1136
        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
1137
        SGMLParser.__init__(self)
1138
 
1139
        if hasattr(markup, 'read'):        # It's a file-type object.
1140
            markup = markup.read()
1141
        self.markup = markup
1142
        self.markupMassage = markupMassage
1143
        try:
1144
            self._feed(isHTML=isHTML)
1145
        except StopParsing:
1146
            pass
1147
        self.markup = None                 # The markup can now be GCed
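
A short sketch of the constructor options described above:

from BeautifulSoup import BeautifulSoup

# Convert HTML entity references to Unicode characters while parsing.
soup = BeautifulSoup('<p>Caf&eacute; &amp; more</p>',
                     convertEntities=BeautifulSoup.HTML_ENTITIES)
print soup.p.string        # -> Café & more

# Skip the regex-based cleanup pass when the input is known to be well formed.
fast_soup = BeautifulSoup('<p>already clean</p>', markupMassage=False)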
1148
 
1149
    def convert_charref(self, name):
1150
        """This method fixes a bug in Python's SGMLParser."""
1151
        try:
1152
            n = int(name)
1153
        except ValueError:
1154
            return
1155
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
1156
            return
1157
        return self.convert_codepoint(n)
1158
 
1159
    def _feed(self, inDocumentEncoding=None, isHTML=False):
1160
        # Convert the document to Unicode.
1161
        markup = self.markup
1162
        if isinstance(markup, unicode):
1163
            if not hasattr(self, 'originalEncoding'):
1164
                self.originalEncoding = None
1165
        else:
1166
            dammit = UnicodeDammit\
1167
                     (markup, [self.fromEncoding, inDocumentEncoding],
1168
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
1169
            markup = dammit.unicode
1170
            self.originalEncoding = dammit.originalEncoding
1171
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
1172
        if markup:
1173
            if self.markupMassage:
1174
                if not hasattr(self.markupMassage, "__iter__"):
1175
                    self.markupMassage = self.MARKUP_MASSAGE
1176
                for fix, m in self.markupMassage:
1177
                    markup = fix.sub(m, markup)
1178
                # TODO: We get rid of markupMassage so that the
1179
                # soup object can be deepcopied later on. Some
1180
                # Python installations can't copy regexes. If anyone
1181
                # was relying on the existence of markupMassage, this
1182
                # might cause problems.
1183
                del(self.markupMassage)
1184
        self.reset()
1185
 
1186
        SGMLParser.feed(self, markup)
1187
        # Close out any unfinished strings and close all the open tags.
1188
        self.endData()
1189
        while self.currentTag.name != self.ROOT_TAG_NAME:
1190
            self.popTag()
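
The Unicode conversion above is also usable on its own through UnicodeDammit (defined later in this file), and fromEncoding skips detection when the encoding is already known; a brief sketch:

from BeautifulSoup import BeautifulSoup, UnicodeDammit

dammit = UnicodeDammit('Sacr\xc3\xa9 bleu!')   # raw UTF-8 bytes
print repr(dammit.unicode)                     # -> u'Sacr\xe9 bleu!' (when detected as UTF-8)
print dammit.originalEncoding                  # e.g. 'utf-8'

soup = BeautifulSoup('<p>\xa1Hola!</p>', fromEncoding='latin-1')
print soup.originalEncoding                    # -> latin-1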
1191
 
1192
    def __getattr__(self, methodName):
1193
        """This method routes method call requests to either the SGMLParser
1194
        superclass or the Tag superclass, depending on the method name."""
1195
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
1196
 
1197
        if methodName.startswith('start_') or methodName.startswith('end_') \
1198
               or methodName.startswith('do_'):
1199
            return SGMLParser.__getattr__(self, methodName)
1200
        elif not methodName.startswith('__'):
1201
            return Tag.__getattr__(self, methodName)
1202
        else:
1203
            raise AttributeError
1204
 
1205
    def isSelfClosingTag(self, name):
1206
        """Returns true iff the given string is the name of a
1207
        self-closing tag according to this parser."""
1208
        return self.SELF_CLOSING_TAGS.has_key(name) \
1209
               or self.instanceSelfClosingTags.has_key(name)
1210
 
1211
    def reset(self):
1212
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
1213
        self.hidden = 1
1214
        SGMLParser.reset(self)
1215
        self.currentData = []
1216
        self.currentTag = None
1217
        self.tagStack = []
1218
        self.quoteStack = []
1219
        self.pushTag(self)
1220
 
1221
    def popTag(self):
1222
        tag = self.tagStack.pop()
1223
 
1224
        #print "Pop", tag.name
1225
        if self.tagStack:
1226
            self.currentTag = self.tagStack[-1]
1227
        return self.currentTag
1228
 
1229
    def pushTag(self, tag):
1230
        #print "Push", tag.name
1231
        if self.currentTag:
1232
            self.currentTag.contents.append(tag)
1233
        self.tagStack.append(tag)
1234
        self.currentTag = self.tagStack[-1]
1235
 
1236
    def endData(self, containerClass=NavigableString):
1237
        if self.currentData:
1238
            currentData = u''.join(self.currentData)
1239
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
1240
                not set([tag.name for tag in self.tagStack]).intersection(
1241
                    self.PRESERVE_WHITESPACE_TAGS)):
1242
                if '\n' in currentData:
1243
                    currentData = '\n'
1244
                else:
1245
                    currentData = ' '
1246
            self.currentData = []
1247
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1248
                   (not self.parseOnlyThese.text or \
1249
                    not self.parseOnlyThese.search(currentData)):
1250
                return
1251
            o = containerClass(currentData)
1252
            o.setup(self.currentTag, self.previous)
1253
            if self.previous:
1254
                self.previous.next = o
1255
            self.previous = o
1256
            self.currentTag.contents.append(o)
1257
 
1258
 
1259
    def _popToTag(self, name, inclusivePop=True):
1260
        """Pops the tag stack up to and including the most recent
1261
        instance of the given tag. If inclusivePop is false, pops the tag
1262
        stack up to but *not* including the most recent instance of
1263
        the given tag."""
1264
        #print "Popping to %s" % name
1265
        if name == self.ROOT_TAG_NAME:
1266
            return
1267
 
1268
        numPops = 0
1269
        mostRecentTag = None
1270
        for i in range(len(self.tagStack)-1, 0, -1):
1271
            if name == self.tagStack[i].name:
1272
                numPops = len(self.tagStack)-i
1273
                break
1274
        if not inclusivePop:
1275
            numPops = numPops - 1
1276
 
1277
        for i in range(0, numPops):
1278
            mostRecentTag = self.popTag()
1279
        return mostRecentTag
1280
 
1281
    def _smartPop(self, name):
1282
 
1283
        """We need to pop up to the previous tag of this type, unless
1284
        one of this tag's nesting reset triggers comes between this
1285
        tag and the previous tag of this type, OR unless this tag is a
1286
        generic nesting trigger and another generic nesting trigger
1287
        comes between this tag and the previous tag of this type.
1288
 
1289
        Examples:
1290
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1291
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1292
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1293
 
1294
         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1295
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1296
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1297
        """
1298
 
1299
        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1300
        isNestable = nestingResetTriggers != None
1301
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1302
        popTo = None
1303
        inclusive = True
1304
        for i in range(len(self.tagStack)-1, 0, -1):
1305
            p = self.tagStack[i]
1306
            if (not p or p.name == name) and not isNestable:
1307
                #Non-nestable tags get popped to the top or to their
1308
                #last occurrence.
1309
                popTo = name
1310
                break
1311
            if (nestingResetTriggers is not None
1312
                and p.name in nestingResetTriggers) \
1313
                or (nestingResetTriggers is None and isResetNesting
1314
                    and self.RESET_NESTING_TAGS.has_key(p.name)):
1315
 
1316
                #If we encounter one of the nesting reset triggers
1317
                #peculiar to this tag, or we encounter another tag
1318
                #that causes nesting to reset, pop up to but not
1319
                #including that tag.
1320
                popTo = p.name
1321
                inclusive = False
1322
                break
1323
            p = p.parent
1324
        if popTo:
1325
            self._popToTag(popTo, inclusive)
1326
 
1327
    def unknown_starttag(self, name, attrs, selfClosing=0):
1328
        #print "Start tag %s: %s" % (name, attrs)
1329
        if self.quoteStack:
1330
            #This is not a real tag.
1331
            #print "<%s> is not real!" % name
1332
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
1333
            self.handle_data('<%s%s>' % (name, attrs))
1334
            return
1335
        self.endData()
1336
 
1337
        if not self.isSelfClosingTag(name) and not selfClosing:
1338
            self._smartPop(name)
1339
 
1340
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
1341
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1342
            return
1343
 
1344
        tag = Tag(self, name, attrs, self.currentTag, self.previous)
1345
        if self.previous:
1346
            self.previous.next = tag
1347
        self.previous = tag
1348
        self.pushTag(tag)
1349
        if selfClosing or self.isSelfClosingTag(name):
1350
            self.popTag()
1351
        if name in self.QUOTE_TAGS:
1352
            #print "Beginning quote (%s)" % name
1353
            self.quoteStack.append(name)
1354
            self.literal = 1
1355
        return tag
1356
 
1357
    def unknown_endtag(self, name):
1358
        #print "End tag %s" % name
1359
        if self.quoteStack and self.quoteStack[-1] != name:
1360
            #This is not a real end tag.
1361
            #print "</%s> is not real!" % name
1362
            self.handle_data('</%s>' % name)
1363
            return
1364
        self.endData()
1365
        self._popToTag(name)
1366
        if self.quoteStack and self.quoteStack[-1] == name:
1367
            self.quoteStack.pop()
1368
            self.literal = (len(self.quoteStack) > 0)
1369
 
1370
    def handle_data(self, data):
1371
        self.currentData.append(data)
1372
 
1373
    def _toStringSubclass(self, text, subclass):
1374
        """Adds a certain piece of text to the tree as a NavigableString
1375
        subclass."""
1376
        self.endData()
1377
        self.handle_data(text)
1378
        self.endData(subclass)
1379
 
1380
    def handle_pi(self, text):
1381
        """Handle a processing instruction as a ProcessingInstruction
1382
        object, possibly one with a %SOUP-ENCODING% slot into which an
1383
        encoding will be plugged later."""
1384
        if text[:3] == "xml":
1385
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
1386
        self._toStringSubclass(text, ProcessingInstruction)
1387
 
1388
    def handle_comment(self, text):
1389
        "Handle comments as Comment objects."
1390
        self._toStringSubclass(text, Comment)
1391
 
1392
    def handle_charref(self, ref):
1393
        "Handle character references as data."
1394
        if self.convertEntities:
1395
            data = unichr(int(ref))
1396
        else:
1397
            data = '&#%s;' % ref
1398
        self.handle_data(data)
1399
 
1400
    def handle_entityref(self, ref):
1401
        """Handle entity references as data, possibly converting known
1402
        HTML and/or XML entity references to the corresponding Unicode
1403
        characters."""
1404
        data = None
1405
        if self.convertHTMLEntities:
1406
            try:
1407
                data = unichr(name2codepoint[ref])
1408
            except KeyError:
1409
                pass
1410
 
1411
        if not data and self.convertXMLEntities:
1412
                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
1413
 
1414
        if not data and self.convertHTMLEntities and \
1415
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
1416
                # TODO: We've got a problem here. We're told this is
1417
                # an entity reference, but it's not an XML entity
1418
                # reference or an HTML entity reference. Nonetheless,
1419
                # the logical thing to do is to pass it through as an
1420
                # unrecognized entity reference.
1421
                #
1422
                # Except: when the input is "&carol;" this function
1423
                # will be called with input "carol". When the input is
1424
                # "AT&T", this function will be called with input
1425
                # "T". We have no way of knowing whether a semicolon
1426
                # was present originally, so we don't know whether
1427
                # this is an unknown entity or just a misplaced
1428
                # ampersand.
1429
                #
1430
                # The more common case is a misplaced ampersand, so I
1431
                # escape the ampersand and omit the trailing semicolon.
1432
                data = "&amp;%s" % ref
1433
        if not data:
1434
            # This case is different from the one above, because we
1435
            # haven't already gone through a supposedly comprehensive
1436
            # mapping of entities to Unicode characters. We might not
1437
            # have gone through any mapping at all. So the chances are
1438
            # very high that this is a real entity, and not a
1439
            # misplaced ampersand.
1440
            data = "&%s;" % ref
1441
        self.handle_data(data)
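        # Illustrative note (not in the original source): given the
        # branches above, with only convertHTMLEntities enabled,
        # 'eacute' becomes u'\xe9' while an unknown reference such as
        # the 'T' of 'AT&T' becomes '&amp;T'; with no conversion
        # enabled, 'eacute' passes through unchanged as '&eacute;'.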
1442
 
1443
    def handle_decl(self, data):
1444
        "Handle DOCTYPEs and the like as Declaration objects."
1445
        self._toStringSubclass(data, Declaration)
1446
 
1447
    def parse_declaration(self, i):
1448
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
1449
        declaration as a CData object."""
1450
        j = None
1451
        if self.rawdata[i:i+9] == '<![CDATA[':
1452
             k = self.rawdata.find(']]>', i)
1453
             if k == -1:
1454
                 k = len(self.rawdata)
1455
             data = self.rawdata[i+9:k]
1456
             j = k+3
1457
             self._toStringSubclass(data, CData)
1458
        else:
1459
            try:
1460
                j = SGMLParser.parse_declaration(self, i)
1461
            except SGMLParseError:
1462
                toHandle = self.rawdata[i:]
1463
                self.handle_data(toHandle)
1464
                j = i + len(toHandle)
1465
        return j
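
# Illustrative sketch, not part of the library and never called by it.
# It assumes the convertEntities argument and XML_ENTITIES constant
# defined on BeautifulStoneSoup earlier in this file, and shows how the
# handler methods above turn character references and CDATA sections
# into tree nodes.
def _stone_soup_example():
    xml = '<doc>caf&#233; <![CDATA[<raw>]]></doc>'
    # handle_charref converts the numeric reference because
    # convertEntities is set; parse_declaration turns the CDATA
    # section into a CData node.
    soup = BeautifulStoneSoup(xml,
                              convertEntities=BeautifulStoneSoup.XML_ENTITIES)
    return soup.prettify()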

class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (e.g. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ('br', 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base', 'col'))

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center')

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
                    pass
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
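
# Illustrative sketch, not part of the library and never called by it:
# the nesting rules described in the BeautifulSoup docstring above, as
# seen from the parser's point of view.  The expected shapes in the
# comments paraphrase that docstring rather than asserting exact output.
def _html_nesting_example():
    # 'p' is in NON_NESTABLE_BLOCK_TAGS, so the second <p> implicitly
    # closes the first one.
    flat = BeautifulSoup('<p>Para1<p>Para2')
    # 'blockquote' is in NESTABLE_BLOCK_TAGS, so the second one stays
    # nested inside the first.
    nested = BeautifulSoup('Alice said: <blockquote>Bob said: <blockquote>Blah')
    return flat.prettify(), nested.prettify()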

class StopParsing(Exception):
    pass

class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'var', 'b')

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
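
# Illustrative sketch, not part of the library and never called by it:
# the fragment from the docstring above, parsed with both classes.
def _nested_bold_example():
    markup = '<b>Foo<b>Bar</b></b>'
    plain = BeautifulSoup(markup)                   # closes the first <b> early
    literal = ICantBelieveItsBeautifulSoup(markup)  # keeps the <b> tags nested
    return plain.prettify(), literal.prettify()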

class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain JavaScript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}
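
# Illustrative sketch, not part of the library and never called by it:
# parse the same markup with the full nesting heuristics and with the
# minimal ones, to see which tree suits a pathological document better.
def _compare_nesting_heuristics(markup):
    return BeautifulSoup(markup).prettify(), MinimalSoup(markup).prettify()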

class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
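
# Illustrative sketch, not part of the library and never called by it,
# following the docstring's <foo><bar>baz</bar></foo> example: the
# single string child is mirrored onto the parent as an attribute.
def _soap_example():
    soup = BeautifulSOAP('<foo><bar>baz</bar></foo>')
    return soup.foo['bar']  # the string u'baz', per the docstring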

#Enterprise class names! It has come to our attention that some people
#think the names of the Beautiful Soup parser classes are too silly
#and "unprofessional" for use in enterprise screen-scraping. We feel
#your pain! For such-minded folk, the Beautiful Soup Consortium And
#All-Night Kosher Bakery recommends renaming this file to
#"RobustParser.py" (or, in cases of extreme enterprisiness,
#"RobustParserBeanInterface.class") and using the following
#enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    pass
class RobustHTMLParser(BeautifulSoup):
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    pass

######################################################
#
# Bonus library: Unicode, Dammit
#
# This class forces XML data into a standard format (usually to UTF-8
# or Unicode).  It is heavily based on code from Mark Pilgrim's
# Universal Feed Parser. It does not rewrite the XML or HTML to
# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
# (XML) and BeautifulSoup.start_meta (HTML).

# Autodetects character encodings.
# Download from http://chardet.feedparser.org/
try:
    import chardet
#    import chardet.constants
#    chardet.constants._debug = 1
except ImportError:
    chardet = None

# cjkcodecs and iconv_codec make Python know about more character encodings.
# Both are available from http://cjkpython.i18n.org/
# They're built in if you use Python 2.4.
try:
    import cjkcodecs.aliases
except ImportError:
    pass
try:
    import iconv_codec
except ImportError:
    pass

class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml', isHTML=False):
        self.declaredHTMLEncoding = None
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup, isHTML)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if markup == '' or isinstance(markup, unicode):
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        u = None
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # If no luck and we have an auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break

        self.unicode = u
        if not u: self.originalEncoding = None

    def _subMSChar(self, orig):
        """Changes a MS smart quote character to an XML or HTML
        entity."""
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            if self.smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            else:
                sub = '&%s;' % sub[0]
        return sub

    def _convertFrom(self, proposed):
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in ("windows-1252",
                                                       "iso-8859-1",
                                                       "iso-8859-2"):
            markup = re.compile("([\x80-\x9f])").sub \
                     (lambda(x): self._subMSChar(x.group(1)),
                      markup)

        try:
            # print "Trying to convert document to %s" % proposed
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception, e:
            # print "That didn't work!"
            # print e
            return None
        #print "Correct encoding: %s" % proposed
        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # strip Byte Order Mark (if present)
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data, isHTML=False):
        """Given a document, tries to detect its XML encoding."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
                pass
        except:
            xml_encoding_match = None
        xml_encoding_match = re.compile(
            '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
        if not xml_encoding_match and isHTML:
            regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            if isHTML:
                self.declaredHTMLEncoding = xml_encoding
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding


    def find_codec(self, charset):
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 '\x9f' : ('Yuml', ''),}
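
# Illustrative sketch, not part of the library and never called by it:
# UnicodeDammit can be used on its own, outside the soup classes.  The
# attributes read below are the ones set in __init__ above.
def _unicode_dammit_example(raw_bytes):
    converted = UnicodeDammit(raw_bytes, smartQuotesTo='html', isHTML=True)
    # converted.unicode is None if every candidate encoding failed.
    return converted.unicode, converted.originalEncoding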

#######################################################################


#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print soup.prettify()