Commit 6fe07c7c authored by Bikram Dhoju's avatar Bikram Dhoju

Initial commit

parents

Too many changes to show.

To preserve performance only 192 of 192+ files are displayed.

"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/
Beautiful Soup parses a (possibly invalid) XML or HTML document into a
tree representation. It provides methods and Pythonic idioms that make
it easy to navigate, search, and modify the tree.
A well-formed XML/HTML document yields a well-formed data
structure. An ill-formed XML/HTML document yields a correspondingly
ill-formed data structure. If your document is only locally
well-formed, you can use this library to find and process the
well-formed part of it.
Beautiful Soup works with Python 2.2 and up. It has no external
dependencies, but you'll have more success at converting data to UTF-8
if you also install these three packages:
* chardet, for auto-detecting character encodings
http://chardet.feedparser.org/
* cjkcodecs and iconv_codec, which add more encodings to the ones supported
by stock Python.
http://cjkpython.i18n.org/
Beautiful Soup defines classes for two main parsing strategies:
* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
language that kind of looks like XML.
* BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
or invalid. This class has web browser-like heuristics for
obtaining a sensible parse tree in the face of common HTML errors.
Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
the encoding of an HTML or XML document, and converting it to
Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/documentation.html
Here, have some legalese:
Copyright (c) 2004-2010, Leonard Richardson
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of the the Beautiful Soup Consortium and All
Night Kosher Bakery nor the names of its contributors may be
used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
"""
from __future__ import generators
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "3.0.8.1"
__copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
__license__ = "New-style BSD"
from sgmllib import SGMLParser, SGMLParseError
import codecs
import markupbase
import types
import re
import sgmllib
try:
from htmlentitydefs import name2codepoint
except ImportError:
name2codepoint = {}
try:
set
except NameError:
from sets import Set as set
# These hacks make Beautiful Soup able to parse XML with namespaces.
# Both replacement patterns accept ':' (as well as '-', '_' and '.') after
# the first letter of a name, so a prefixed name such as "ns:tag" is
# matched as a single token.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
# Default value of the `encoding` argument of the __str__ methods below.
DEFAULT_OUTPUT_ENCODING = "utf-8"
def _match_css_class(str):
"""Build a RE to match the given CSS class."""
return re.compile(r"(^|.*\s)%s($|\s)" % str)
# First, the classes that represent markup elements.
class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements.

        `parent` and `previous` are stored as given. If the parent
        already has children, this element is linked as the next
        sibling of the parent's current last child.
        """
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent is not None and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        """Puts `replaceWith` into the slot this element occupies in
        its parent, after extracting this element from the tree.

        `replaceWith` may be one of this element's own siblings;
        insert() compensates for the index shift caused by moving it,
        so no adjustment is made here.
        """
        oldParent = self.parent
        myIndex = oldParent.index(self)
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def replaceWithChildren(self):
        """Extracts this element and inserts its children, in their
        original order, at the position it occupied in its parent."""
        myParent = self.parent
        myIndex = myParent.index(self)
        self.extract()
        # Work on a copy: insert() extracts each child from
        # self.contents. Inserting in reverse order at a fixed index
        # leaves the children in their original order.
        reversedChildren = list(self.contents)
        reversedChildren.reverse()
        for child in reversedChildren:
            myParent.insert(myIndex, child)

    def extract(self):
        """Destructively rips this element out of the tree.

        Returns this element, with its parent, sibling and outer
        previous/next links cleared.
        """
        if self.parent is not None:
            try:
                del self.parent.contents[self.parent.index(self)]
            except ValueError:
                # The parent does not list this element; there is
                # nothing to remove, but the links are still repaired.
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        # Compare against None: an empty string element is falsy but
        # is still a real neighbour whose links must be updated.
        if self.previous is not None:
            self.previous.next = nextElement
        if nextElement is not None:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling is not None:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling is not None:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None
        return self

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts `newChild` into this element's contents so that it
        ends up at index `position`, updating the parent, sibling and
        previous/next links.

        A plain string is wrapped in a NavigableString first. A
        position past the end is clamped to the end. An element that
        already has a parent is extracted from it before insertion.
        """
        if isinstance(newChild, basestring) \
                and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent is not None:
            if newChild.parent is self:
                # We're 'inserting' an element that's already one of
                # this object's children.
                index = self.index(newChild)
                if index < position:
                    # The element currently sits before the target
                    # slot, so extracting it shifts the target index
                    # down by one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position - 1]
            newChild.previousSibling = previousChild
            previousChild.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous is not None:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # The element following the new child in document order is
            # the next sibling of the closest ancestor that has one.
            parent = self
            parentsNextSibling = None
            while parentsNextSibling is None:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if parent is None:
                    # This is the last element in the document.
                    break
            newChildsLastElement.next = parentsNextSibling
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            nextChild.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next is not None:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria, or None if no parent matches."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        # Keyword criteria must be forwarded, otherwise they are
        # silently ignored by the search.
        l = self.findParents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""
        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Calls `method` with a limit of 1 and returns the first
        result, or None if the result is empty."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        """Iterates over a generator looking for things that match.

        `name` may itself be a SoupStrainer. When only `name` is given
        (no text, limit, attrs or keyword criteria) and it is True or a
        string, a plain list of matching Tags is returned directly.
        Otherwise a ResultSet of whatever the strainer's search()
        returns is built, stopping once `limit` results are collected.
        """
        if isinstance(name, SoupStrainer):
            strainer = name
        # (Possibly) special case some findAll*(...) searches
        elif text is None and not limit and not attrs and not kwargs:
            # findAll*(True)
            if name is True:
                return [element for element in generator()
                        if isinstance(element, Tag)]
            # findAll*('tag-name')
            elif isinstance(name, basestring):
                return [element for element in generator()
                        if isinstance(element, Tag) and
                        element.name == name]
            else:
                strainer = SoupStrainer(name, attrs, text, **kwargs)
        # Build a SoupStrainer
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        for i in generator():
            # The generators below yield a trailing None; falsy items
            # are skipped.
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    # Each one starts with the element after this one and ends by
    # yielding None once the chain runs out.

    def nextGenerator(self):
        """Yields the elements following this one via `.next`."""
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        """Yields the elements following this one via `.nextSibling`."""
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        """Yields the elements preceding this one via `.previous`."""
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        """Yields the elements preceding this one via
        `.previousSibling`."""
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        """Yields the ancestors of this element via `.parent`."""
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods

    def substituteEncoding(self, str, encoding=None):
        """Returns `str` with every "%SOUP-ENCODING%" placeholder
        replaced by `encoding` ("utf-8" when no encoding is given)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.

        With an encoding, unicode and str objects are passed through
        .encode(encoding) and any other object is converted with str()
        first. Without an encoding, a unicode object is returned as is
        and anything else is passed to unicode().
        """
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
class NavigableString(unicode, PageElement):
    """A unicode string that is also a PageElement."""

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if not isinstance(value, unicode):
            return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        return unicode.__new__(cls, value)

    def __getnewargs__(self):
        """Return the constructor argument: this string encoded in
        DEFAULT_OUTPUT_ENCODING."""
        encoded = NavigableString.__str__(self)
        return (encoded,)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr != 'string':
            raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr))
        return self

    def __unicode__(self):
        """Return the default str() rendering decoded back to unicode."""
        rendered = str(self)
        return rendered.decode(DEFAULT_OUTPUT_ENCODING)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Return this string encoded in `encoding`, or the string
        itself when `encoding` is empty or None."""
        if not encoding:
            return self
        return self.encode(encoding)
class CData(NavigableString):
    """A NavigableString rendered inside a CDATA section."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Return the string wrapped in "<![CDATA[" and "]]>"."""
        inner = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % inner
class ProcessingInstruction(NavigableString):
    """A NavigableString rendered as a processing instruction."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Return the string wrapped in "<?" and "?>", with any
        "%SOUP-ENCODING%" placeholder substituted first."""
        body = self
        if "%SOUP-ENCODING%" in body:
            body = self.substituteEncoding(body, encoding)
        converted = self.toEncoding(body, encoding)
        return "<?%s?>" % converted
class Comment(NavigableString):
    """A NavigableString rendered as a markup comment."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Return the string wrapped in "<!--" and "-->"."""
        inner = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % inner
class Declaration(NavigableString):
    """A NavigableString rendered as a markup declaration."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Return the string wrapped in "<!" and ">"."""
        inner = NavigableString.__str__(self, encoding)
        return "<!%s>" % inner
class Tag(PageElement):
"""Represents a found HTML tag with its attributes and contents."""
def _invert(h):
"Cheap function to invert a hash."
i = {}
for k,v in h.items():
i[v] = k
return i
XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
"quot" : '"',
"amp" : "&",
"lt" : "<",
"gt" : ">" }
XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
def _convertEntities(self, match):
    """Used in a call to re.sub to replace HTML, XML, and numeric
    entities with the appropriate Unicode characters.

    `match` is a regular-expression match whose group 1 is the entity
    name without the surrounding '&' and ';'. Returns the replacement
    text:

    * a named HTML entity becomes its character when
      convertHTMLEntities is set;
    * one of the five XML entities becomes its character when
      convertXMLEntities is set, and is left alone otherwise;
    * a numeric entity ('#NNN' or '#xHHH') becomes its character; if
      the number is malformed or outside the range unichr() accepts,
      the entity is left untouched instead of raising;
    * any other entity is returned with its '&' escaped as '&amp;'
      when escapeUnrecognizedEntities is set, and unchanged otherwise.
    """
    x = match.group(1)
    if self.convertHTMLEntities and x in name2codepoint:
        return unichr(name2codepoint[x])
    elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
        if self.convertXMLEntities:
            return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
        else:
            return u'&%s;' % x
    elif len(x) > 0 and x[0] == '#':
        # Handle numeric entities
        try:
            if len(x) > 1 and x[1] == 'x':
                return unichr(int(x[2:], 16))
            else:
                return unichr(int(x[1:]))
        except (ValueError, OverflowError):
            # Not a representable code point: keep the markup as
            # written rather than aborting the whole parse.
            return u'&%s;' % x
    elif self.escapeUnrecognizedEntities:
        return u'&amp;%s;' % x
    else:
        return u'&%s;' % x
def __init__(self, parser, name, attrs=None, parent=None,
             previous=None):
    """Basic constructor.

    Copies the entity-handling settings from `parser`, links the tag
    into the tree via setup(), and converts entities in the attribute
    values. `attrs` is a sequence of (name, value) pairs and defaults
    to an empty list.
    """
    # We don't actually store the parser object: that lets extracted
    # chunks be garbage-collected
    self.parserClass = parser.__class__
    self.isSelfClosing = parser.isSelfClosingTag(name)
    self.name = name
    if attrs is None:
        attrs = []
    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False
    self.containsSubstitutions = False
    self.convertHTMLEntities = parser.convertHTMLEntities
    self.convertXMLEntities = parser.convertXMLEntities
    self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

    # Convert any HTML, XML, or numeric entities in the attribute values.
    entityPattern = "&(#\d+|#x[0-9a-fA-F]+|\w+);"
    converted = []
    for key, value in self.attrs:
        newValue = re.sub(entityPattern, self._convertEntities, value)
        converted.append((key, newValue))
    self.attrs = converted
def getString(self):
    """Return the tag's only child when that child is a
    NavigableString; return None in every other case."""
    children = self.contents
    if len(children) != 1:
        return None
    onlyChild = children[0]
    if isinstance(onlyChild, NavigableString):
        return onlyChild
    return None
def setString(self, string):
    """Replace the contents of the tag with a string.

    All existing children are extracted, then `string` is appended as
    the only child.
    """
    self.clear()
    self.append(string)
string = property(getString, setString)
def getText(self, separator=u""):
    """Return the stripped text of every NavigableString beneath this
    tag, in document order, joined with `separator`.

    An empty tag yields an empty unicode string.
    """
    if len(self.contents) == 0:
        return u""
    # The element that follows this tag's last descendant marks the
    # end of the walk along the .next chain.
    stopNode = self._lastRecursiveChild().next
    pieces = []
    node = self.contents[0]
    while node is not stopNode:
        if isinstance(node, NavigableString):
            pieces.append(node.strip())
        node = node.next
    return separator.join(pieces)
text = property(getText)
def get(self, key, default=None):
    """Look up the 'key' attribute of the tag.

    Returns the attribute's value, or `default` when the tag has no
    such attribute.
    """
    attrMap = self._getAttrMap()
    return attrMap.get(key, default)
def clear(self):
"""Extract all children."""
for child in self.contents[:]:
child.extract()
def index(self, element):
for i, child in enumerate(self.contents):
if child is element:
return i
raise ValueError("Tag.index: element not in tag")
def has_key(self, key):
    """Report whether the tag has an attribute named `key`."""
    attrMap = self._getAttrMap()
    return attrMap.has_key(key)
def __getitem__(self, key):
    """tag[key] returns the value of the 'key' attribute for the tag,
    and throws an exception if it's not there."""
    attrMap = self._getAttrMap()
    return attrMap[key]
def __iter__(self):
"Iterating over a tag iterates over its contents."
return iter(self.contents)
def __len__(self):
"The length of a tag is the length of its list of contents."
return len(self.contents)
def __contains__(self, x):
return x in self.contents
def __nonzero__(self):
"A tag is non-None even if it has no contents."
return True
def __setitem__(self, key, value):
"""Setting tag[key] sets the value of the 'key' attribute for the
tag."""