#!/usr/bin/env python
##
## Name: spider.py
## Purpose: Simple web link-graph generator.
## Author: M. J. Fromberger
## Info: $Id: spider.py 353 2007-08-23 17:44:33Z sting $
##
## A link graph is a graph G = (V, E) in which
##
## V = a set of vertices, labelled by distinct URLs,
## E = a set of directed edges,
##
## so that an edge (u, v) \in E if the content located at URL u
## contains a hyperlink to URL v.
##
## This library permits construction of a link graph by expanding a
## seed vertex. The following general traversal algorithm is used:
##
## let Q = [seed], V = {}, E = {} in
## while Q is not empty,
## q = remove front of Q
## add q to V
## for each legal link u in page(q),
## if u is not in V,
## add u to Q
## add (q, u) to E
## result is G = (V, E).
## end.
##
## There are four classes implementing variations on this algorithm:
##
## Spider
## This is the base class, that implements a depth-first search
## ordering, but does not produce any output. Spider is the base
## class for the other variations.
##
## ConstrainedSpider
## Like Spider, but allows the specification of inclusion and
## exclusion constraints. A node that is within an included area
## may be visited; but no node that in an excluded area may be
## visited. See also the Area class which implements this.
##
## IncrementalSpider
## Builds the link graph incrementally, writing each edge to the
## output as it is discovered and then discarding it from memory.
## This is the version used by the command-line driver.
##
## GraphSpider
## Builds an in-memory representation of the graph.
##
import os, re, sys
import getopt, urllib, urlparse
from BeautifulSoup import BeautifulSoup as HTMLParser
# {{ class NoAuthURLopener
class NoAuthURLopener (urllib.FancyURLopener):
"""This is a URL opener that simply fails on HTTP authentication
requests, behaving like a "cancel" from the user. See urllib for
more details.
"""
def prompt_user_passwd(self, host, realm):
return None, None
# }}
# {{ open_url(url)
# Like urllib.urlopen(), except it doesn't try for authentication.
special_opener = NoAuthURLopener()
def open_url(url):
"""This is a replacement for the urllib.urlopen() function, that
overrides the handling of HTTP basic authentication. In
particular, an empty name and password are returned, effectively
cancelling the authentication.
"""
return special_opener.open(url)
# }}
# {{ get_links(url)
def get_links(url):
"""Fetch a set of all the outbound hyperlinks visible in the URL
specified. Raises IOError if the URL does not lead to a real
document. Returns a tuple (real_url, links) where real_url is the
actual URL that was loaded, including redirections, and links is a
set.
"""
ws = re.compile(r'[\t\r\n]+')
u = open_url(url) ; info = u.info() ; real = u.geturl()
out = set()
if info.gettype() == 'text/html':
for elt in HTMLParser(u.read()).fetch('a'):
if elt.has_key('href'):
url = urllib.unquote(ws.sub('', elt.get('href').strip()))
# Check URL for additional properties, as desired...
out.add(url)
u.close()
return (real, out)
# }}
# {{ class Area
class Area (object):
"""An Area represents a collection of links that share a common
base URL. Create a new link in the area using .makelink().
"""
def __init__(self, base_url, allow_query = True):
"""Create a new area sharing the specified base URL. If
allow_query is true, query components of URLs will be
considered when expanding URL's; otherwise, they will be
discarded.
"""
self._base = base_url
self._queryp = allow_query
class Link (object):
"""Represents a hyperlink within a given area; the area
determines the base URL for all instances of this class.
"""
_area = self
def __init__(self, url):
self._url = url
self._exp = None
def original(self):
"""Fetch the original link text given to the constructor."""
return self._url
def expanded(self):
"""Return the URL expanded by the common base. The result
is cached to avoid recomputation.
"""
if self._exp is None:
u = list(urlparse.urlparse(
urlparse.urljoin(self.area().base(), self._url)))
if not self.area()._queryp:
u[4] = ''
self._exp = urlparse.urldefrag(
urlparse.urlunparse(u))[0]
return self._exp
def base(self):
"""Fetch the common base URL."""
return self._area.base()
def area(self):
"""Fetch the area this link belongs to."""
return self._area
def __cmp__(self, other):
"""Compare two links. Comparison is based on the
expanded string including the base URL.
"""
try:
return cmp(self.expanded(), other.expanded())
except AttributeError:
return False
def __str__(self):
return self.expanded()
def __hash__(self):
return hash(self.expanded())
self.LinkType = Link
def makelink(self, url):
"""Construct a new link in this area, if possible; this will
fail with a ValueError if the specified URL does not conform
to the base URL for this area.
"""
if self.contains(url):
return self.LinkType(url)
else:
raise ValueError("URL is outside this area: %s" % url)
def contains(self, url):
"""Test whether the specified URL is compatible with the base
of this area. A relative URL is always compatible. Returns
True or False as appropriate.
"""
return urlparse.urljoin(self._base, url).startswith(self._base)
def base(self):
"""Return the base URL text given to the constructor."""
return self._base
def __cmp__(self, other):
"""Compare two areas. Comparison is based on the base URL string."""
try:
return cmp(self.base(), other.base())
except AttributeError:
return False
def __contains__(self, other):
return self.contains(other)
def __repr__(self):
return '#<%s base="%s">' % (type(self).__name__, self.base())
# }}
# {{ class Spider
class Spider (object):
"""A spider object encapsulates the traversal of a link graph. A
subclass should override the methods of this class to achieve a
useful effect. This class is abstract, in that it does not
provide a complete implementation of the override protocol. See
ConstrainedSpider, IncrementalSpider, and GraphSpider for concrete
implementations. The default search strategy is DFS.
The override protocol consists of the following methods:
start, finish,
check_node, visit_node, visit_edge, expand_node, keep_node,
init_queue, dequeue, update_queue, queue_empty, queue_contains,
diagnostic
"""
def __init__(self, *args, **kwargs):
"""Default initializer. The arguments are ignored, but a
subclass should insure this is called, so that the internal
search state will be set up correctly.
"""
self._debug = False
self._queue = None
self._vnode = None
def search(self, seed):
"""Perform a search of the link graph starting from the given
seed URL.
"""
first = self.check_node(seed)
if first is None:
raise ValueError("Seed value does not check as valid: %s" % seed)
self.start(first)
queue = self.init_queue(first)
while not self.queue_empty(queue):
elt, queue = self.dequeue(queue)
try:
real, seqs = self.expand_node(elt)
except IOError:
self.diagnostic('X: %s', elt)
continue
self.diagnostic('V: %s', elt)
self.visit_node(elt, real)
for seq in seqs:
if self.keep_node(elt, seq) and \
not self.queue_contains(queue, seq):
self.diagnostic(' + %s', seq)
queue = self.update_queue(queue, (seq,))
self.visit_edge(elt, seq)
self.diagnostic('')
return self.finish()
def diagnostic(self, msg, *args):
"""Called to emit diagnostic messages for debugging purposes.
The default implementation writes a string to standard error
if the ._debug attribute is a true value.
"""
if self._debug:
print >> sys.stderr, msg % args
def check_node(self, url):
"""This is called on each node that is discovered during
search, before it is added to the queue. It allows the
implementation to check for broken URL's and fix them as
needed. Returns either a fixed URL (a string) or None; the
latter means the URL should be discarded.
"""
return url
def start(self, url):
"""This method is called once at the beginning of a traversal,
giving the seed URL.
"""
self._vnode = set()
self._queue = [url]
def visit_node(self, url, real = None):
"""This method is called with each vertex that is removed from
the queue, to "visit" the vertex. The "url" is the actual URL
that was found in the graph; "real" is the actual URL that was
retrieved, which may differ due to redirection.
"""
self._vnode.add(url)
if real is not None:
self._vnode.add(None)
def visit_edge(self, src, dst):
"""Called each time an edge is discovered, giving the endpoints
of the edge as strings.
"""
pass
def expand_node(self, url):
"""This method is called to obtain a set of the direct
neighbors for a vertex. If the URL in question is invalid,
this method should raise IOError; otherwise, it returns a
tuple (real, links) where real is the "normalized" URL and
links is a set of strings.
This default implementation just calls get_links() to extract
links from HTML documents and filters the result through the
.chck_links() method.
"""
real, links = get_links(url)
s = set(self.check_node(elt) for elt in links)
s.discard(None)
return real, s
def keep_node(self, src, dst):
"""This method is called to determine whether a direct
neighbor dst of a given vertex src should be kept for further
exploration; returns true or false as appropriate.
"""
return dst not in self._vnode
def init_queue(self, seed):
"""Called to create an initial search queue given the seed
URL. Returns the new queue structure.
"""
return self._queue
def dequeue(self, queue):
"""Remove one item from the queue, returning a tuple of the
form (item, new_queue). If the queue is empty, the behaviour
is undefined; an exception is probably appropriate.
"""
itm = queue.pop(0)
return itm, queue
def update_queue(self, queue, elts):
"""Called to add new items to the queue. The resulting queue
is returned. Note that the implementation is free to use
either a persistent or an ephemeral data structure.
"""
queue[0:0] = list(elts)
return queue
def queue_empty(self, queue):
"""Called to check whether queue is empty; returns true or
false accordingly. You do NOT need to override this if your
queue structure supports the len(Q) protocol; the default
implementation tests for a length of zero.
"""
return len(queue) == 0
def queue_contains(self, queue, elt):
"""Called to check whether queue contains elt; returns true or
false accordingly. You do NOT need to override this if your
queue structure supports the "x in Q" protocol, which is the
default implementation.
"""
return elt in queue
def finish(self):
"""This method is called once at the end of the traversal to
clean up any intermediate data. The result returned by this
method is returned as the value from .search().
"""
self._vnode = self._queue = None
return None
# }}
# {{ class ConstrainedSpider
class ConstrainedSpider (Spider):
"""A version of Spider that constrains search to a collection of
user-defined areas; no link that points outside the given area
will be followed.
"""
def __init__(self, good = (), avoid = (), keepq = True):
"""Initialize a new spider with the specified constraints:
good -- sequence of included base URL's.
avoid -- sequence of excluded base URL's.
keepq -- retain query component of URL's?
"""
super(ConstrainedSpider, self).__init__()
self._incl = tuple(Area(x, keepq) for x in good)
self._excl = tuple(Area(x, False) for x in avoid)
def inbounds(self, url):
"""If the given URL is in bounds for this object, return a
link object representing that URL; otherwise return False.
"""
# Check for loops
path = urlparse.urlparse(url)[2]
comp = path.strip('/').split('/')
if len(comp) > 0 and comp[-1] and comp[-1] in comp[:-1]:
self.diagnostic('L: %s', url)
return False
# Check whitelist and blacklist
for area in self._incl:
if area.contains(url):
for bad in self._excl:
if bad.contains(url):
return False
return area.makelink(url)
return False
def expand_node(self, url):
"""When expanding a URL, find neighbors only for those URL's
that fall within the bounds of this query.
Also does ad-hoc surgery to find directories that lack a
trailing slash. A common error in URL construction is to
write http://host/path/to/dir and omit a trailing slash; this
is interpreted correctly by the server, because it can see the
underlying resource; but the client has no way to know this
without fetching it. The URL spec treats 'dir' as a plain
file name, and drops it during base joining.
"""
if self.inbounds(url) is False:
return url, set()
real, links = super(ConstrainedSpider, self).expand_node(url)
p_url = list(urlparse.urlparse(url))
ptail = p_url[2].split('/').pop()
if ptail and '.' not in ptail and not ptail.endswith('/'):
p_url[2] += '/'
url = urlparse.urlunparse(p_url)
seqs = set()
for elt in links:
u = urlparse.urljoin(url, elt)
ok = self.inbounds(u)
if not ok:
seqs.add(u) # fringe
else:
seqs.add(ok.expanded())
return real, seqs
# }}
# {{ class IncrementalSpider
class IncrementalSpider (ConstrainedSpider):
"""Build a graph incrementally with output to a file, one line per
edge of the graph.
"""
def __init__(self, ofp, good = (), avoid = (), keepq = True):
"""Initialize a new spider.
ofp -- file to use for output edges.
good -- sequence of included base URL's.
avoid -- sequence of excluded base URL's.
keepq -- retain query component of URL's?
"""
super(IncrementalSpider, self).__init__(good, avoid, keepq)
self._ofp = ofp
def visit_edge(self, src, dst):
print >> self._ofp, src + '\t' + dst
# }}
# {{ class GraphSpider
class GraphSpider (ConstrainedSpider):
"""Build a complete graph representation."""
def start(self, seed):
"""Initialize the empty graph."""
super(GraphSpider, self).start(seed)
self._graph = {}
def visit_node(self, url, real = None):
"""Add a vertex to the graph."""
super(GraphSpider, self).visit_node(url, real)
self._graph.setdefault(url, set())
def visit_edge(self, src, dst):
"""Add an edge to the graph."""
super(GraphSpider, self).visit_edge(src, dst)
self._graph[src].add(dst)
def finish(self):
"""Extract and return the completed graph. The result is
discarded from the object state so that it can be garbage
collected when the user is done with it.
"""
super(GraphSpider, self).finish()
out = self._graph
self._graph = None
return out
# }}
# {{ main(argv)
def main(argv):
"""A very simple command-line driver to spider a web site into a file.
Usage: spider.py [options] +
The graph is written to standard output with one edge per line, in
the format , where is the source URL,
is the target URL, and is an ASCII tab character.
"""
def usage(long = False):
print >> sys.stderr, "Usage: spider.py [options] +"
if long:
print >> sys.stderr, """
This program generates a link graph of a web site, generated by
following the anchor tags imbedded in HTML documents. The results are
written as text output with one line for each link in the graph, having
the format:
TAB EOL
The progress of the search may be restricted using the --include (-i)
and --exclude (-e) command-line switches described below."""
print >> sys.stderr, """\nOptions:
-D/--debug -- enable debugging output to stderr.
-h/--help -- display this help message.
-i/--include -- permit URL's under this base to be included.
-e/--exclude -- exclude URL's under this base.
-q/--query -- keep query portion of URL.
-Q/--noquery -- discard query portion of URL (default).
Note that the exclusion list overrides the inclusion list. If no
URL's are explicitly included, the seed URL's are implicitly added to
the inclusion list.\n"""
else:
print >> sys.stderr, " [use `-h' or `--help' for options]\n"
inc = set() # allow URL's under these bases to be visited.
exc = set() # never visit URL's under these bases.
que = False # keep query portion of URL's?
dbg = False # generate diagnostic output?
ofp = sys.stdout # send output to this file.
try:
opts, args = getopt.gnu_getopt(argv, 'Dhi:e:o:qQ',
('include=', 'exclude=', 'output=',
'query', 'noquery', 'debug', 'help'))
except getopt.GetoptError, e:
print >> sys.stderr, "Error: %s" % e
usage()
sys.exit(1)
for opt, arg in opts:
if opt in ('-i', '--include'):
inc.add(arg)
elif opt in ('-e', '--exclude'):
exc.add(arg)
elif opt in ('-D', '--debug'):
dbg = True
elif opt in ('-o', '--output'):
try:
ofp = file(arg, 'wt')
except (IOError, OSError), e:
print >> sys.stderr, "Error opening output: %s" % arg
print >> sys.stderr, " -- %s\n" % e
sys.exit(1)
elif opt in ('-q', '--query'):
que = True
elif opt in ('-Q', '--noquery'):
que = False
elif opt in ('-h', '--help'):
usage(long = True)
sys.exit(0)
if len(args) == 0:
usage() ; sys.exit(1)
if not inc:
inc.update(args)
if dbg:
print >> sys.stderr, "[diagnostic output enabled]"
for url in args:
sp = IncrementalSpider(ofp, inc, exc, que)
if dbg: sp._debug = True
sp.search(url)
# }}
if __name__ == "__main__":
main(sys.argv[1:])
sys.exit(0)
__all__ = ('open_url', 'get_links', 'Area', 'Spider',
'ConstrainedSpider', 'IncrementalSpider', 'GraphSpider')
# Here there be dragons