#!/usr/bin/env python ## ## Name: spider.py ## Purpose: Simple web link-graph generator. ## Author: M. J. Fromberger ## Info: $Id: spider.py 353 2007-08-23 17:44:33Z sting $ ## ## A link graph is a graph G = (V, E) in which ## ## V = a set of vertices, labelled by distinct URLs, ## E = a set of directed edges, ## ## so that an edge (u, v) \in E if the content located at URL u ## contains a hyperlink to URL v. ## ## This library permits construction of a link graph by expanding a ## seed vertex. The following general traversal algorithm is used: ## ## let Q = [seed], V = {}, E = {} in ## while Q is not empty, ## q = remove front of Q ## add q to V ## for each legal link u in page(q), ## if u is not in V, ## add u to Q ## add (q, u) to E ## result is G = (V, E). ## end. ## ## There are four classes implementing variations on this algorithm: ## ## Spider ## This is the base class, that implements a depth-first search ## ordering, but does not produce any output. Spider is the base ## class for the other variations. ## ## ConstrainedSpider ## Like Spider, but allows the specification of inclusion and ## exclusion constraints. A node that is within an included area ## may be visited; but no node that in an excluded area may be ## visited. See also the Area class which implements this. ## ## IncrementalSpider ## Builds the link graph incrementally, writing each edge to the ## output as it is discovered and then discarding it from memory. ## This is the version used by the command-line driver. ## ## GraphSpider ## Builds an in-memory representation of the graph. ## import os, re, sys import getopt, urllib, urlparse from BeautifulSoup import BeautifulSoup as HTMLParser # {{ class NoAuthURLopener class NoAuthURLopener (urllib.FancyURLopener): """This is a URL opener that simply fails on HTTP authentication requests, behaving like a "cancel" from the user. See urllib for more details. """ def prompt_user_passwd(self, host, realm): return None, None # }} # {{ open_url(url) # Like urllib.urlopen(), except it doesn't try for authentication. special_opener = NoAuthURLopener() def open_url(url): """This is a replacement for the urllib.urlopen() function, that overrides the handling of HTTP basic authentication. In particular, an empty name and password are returned, effectively cancelling the authentication. """ return special_opener.open(url) # }} # {{ get_links(url) def get_links(url): """Fetch a set of all the outbound hyperlinks visible in the URL specified. Raises IOError if the URL does not lead to a real document. Returns a tuple (real_url, links) where real_url is the actual URL that was loaded, including redirections, and links is a set. """ ws = re.compile(r'[\t\r\n]+') u = open_url(url) ; info = u.info() ; real = u.geturl() out = set() if info.gettype() == 'text/html': for elt in HTMLParser(u.read()).fetch('a'): if elt.has_key('href'): url = urllib.unquote(ws.sub('', elt.get('href').strip())) # Check URL for additional properties, as desired... out.add(url) u.close() return (real, out) # }} # {{ class Area class Area (object): """An Area represents a collection of links that share a common base URL. Create a new link in the area using .makelink(). """ def __init__(self, base_url, allow_query = True): """Create a new area sharing the specified base URL. If allow_query is true, query components of URLs will be considered when expanding URL's; otherwise, they will be discarded. """ self._base = base_url self._queryp = allow_query class Link (object): """Represents a hyperlink within a given area; the area determines the base URL for all instances of this class. """ _area = self def __init__(self, url): self._url = url self._exp = None def original(self): """Fetch the original link text given to the constructor.""" return self._url def expanded(self): """Return the URL expanded by the common base. The result is cached to avoid recomputation. """ if self._exp is None: u = list(urlparse.urlparse( urlparse.urljoin(self.area().base(), self._url))) if not self.area()._queryp: u[4] = '' self._exp = urlparse.urldefrag( urlparse.urlunparse(u))[0] return self._exp def base(self): """Fetch the common base URL.""" return self._area.base() def area(self): """Fetch the area this link belongs to.""" return self._area def __cmp__(self, other): """Compare two links. Comparison is based on the expanded string including the base URL. """ try: return cmp(self.expanded(), other.expanded()) except AttributeError: return False def __str__(self): return self.expanded() def __hash__(self): return hash(self.expanded()) self.LinkType = Link def makelink(self, url): """Construct a new link in this area, if possible; this will fail with a ValueError if the specified URL does not conform to the base URL for this area. """ if self.contains(url): return self.LinkType(url) else: raise ValueError("URL is outside this area: %s" % url) def contains(self, url): """Test whether the specified URL is compatible with the base of this area. A relative URL is always compatible. Returns True or False as appropriate. """ return urlparse.urljoin(self._base, url).startswith(self._base) def base(self): """Return the base URL text given to the constructor.""" return self._base def __cmp__(self, other): """Compare two areas. Comparison is based on the base URL string.""" try: return cmp(self.base(), other.base()) except AttributeError: return False def __contains__(self, other): return self.contains(other) def __repr__(self): return '#<%s base="%s">' % (type(self).__name__, self.base()) # }} # {{ class Spider class Spider (object): """A spider object encapsulates the traversal of a link graph. A subclass should override the methods of this class to achieve a useful effect. This class is abstract, in that it does not provide a complete implementation of the override protocol. See ConstrainedSpider, IncrementalSpider, and GraphSpider for concrete implementations. The default search strategy is DFS. The override protocol consists of the following methods: start, finish, check_node, visit_node, visit_edge, expand_node, keep_node, init_queue, dequeue, update_queue, queue_empty, queue_contains, diagnostic """ def __init__(self, *args, **kwargs): """Default initializer. The arguments are ignored, but a subclass should insure this is called, so that the internal search state will be set up correctly. """ self._debug = False self._queue = None self._vnode = None def search(self, seed): """Perform a search of the link graph starting from the given seed URL. """ first = self.check_node(seed) if first is None: raise ValueError("Seed value does not check as valid: %s" % seed) self.start(first) queue = self.init_queue(first) while not self.queue_empty(queue): elt, queue = self.dequeue(queue) try: real, seqs = self.expand_node(elt) except IOError: self.diagnostic('X: %s', elt) continue self.diagnostic('V: %s', elt) self.visit_node(elt, real) for seq in seqs: if self.keep_node(elt, seq) and \ not self.queue_contains(queue, seq): self.diagnostic(' + %s', seq) queue = self.update_queue(queue, (seq,)) self.visit_edge(elt, seq) self.diagnostic('') return self.finish() def diagnostic(self, msg, *args): """Called to emit diagnostic messages for debugging purposes. The default implementation writes a string to standard error if the ._debug attribute is a true value. """ if self._debug: print >> sys.stderr, msg % args def check_node(self, url): """This is called on each node that is discovered during search, before it is added to the queue. It allows the implementation to check for broken URL's and fix them as needed. Returns either a fixed URL (a string) or None; the latter means the URL should be discarded. """ return url def start(self, url): """This method is called once at the beginning of a traversal, giving the seed URL. """ self._vnode = set() self._queue = [url] def visit_node(self, url, real = None): """This method is called with each vertex that is removed from the queue, to "visit" the vertex. The "url" is the actual URL that was found in the graph; "real" is the actual URL that was retrieved, which may differ due to redirection. """ self._vnode.add(url) if real is not None: self._vnode.add(None) def visit_edge(self, src, dst): """Called each time an edge is discovered, giving the endpoints of the edge as strings. """ pass def expand_node(self, url): """This method is called to obtain a set of the direct neighbors for a vertex. If the URL in question is invalid, this method should raise IOError; otherwise, it returns a tuple (real, links) where real is the "normalized" URL and links is a set of strings. This default implementation just calls get_links() to extract links from HTML documents and filters the result through the .chck_links() method. """ real, links = get_links(url) s = set(self.check_node(elt) for elt in links) s.discard(None) return real, s def keep_node(self, src, dst): """This method is called to determine whether a direct neighbor dst of a given vertex src should be kept for further exploration; returns true or false as appropriate. """ return dst not in self._vnode def init_queue(self, seed): """Called to create an initial search queue given the seed URL. Returns the new queue structure. """ return self._queue def dequeue(self, queue): """Remove one item from the queue, returning a tuple of the form (item, new_queue). If the queue is empty, the behaviour is undefined; an exception is probably appropriate. """ itm = queue.pop(0) return itm, queue def update_queue(self, queue, elts): """Called to add new items to the queue. The resulting queue is returned. Note that the implementation is free to use either a persistent or an ephemeral data structure. """ queue[0:0] = list(elts) return queue def queue_empty(self, queue): """Called to check whether queue is empty; returns true or false accordingly. You do NOT need to override this if your queue structure supports the len(Q) protocol; the default implementation tests for a length of zero. """ return len(queue) == 0 def queue_contains(self, queue, elt): """Called to check whether queue contains elt; returns true or false accordingly. You do NOT need to override this if your queue structure supports the "x in Q" protocol, which is the default implementation. """ return elt in queue def finish(self): """This method is called once at the end of the traversal to clean up any intermediate data. The result returned by this method is returned as the value from .search(). """ self._vnode = self._queue = None return None # }} # {{ class ConstrainedSpider class ConstrainedSpider (Spider): """A version of Spider that constrains search to a collection of user-defined areas; no link that points outside the given area will be followed. """ def __init__(self, good = (), avoid = (), keepq = True): """Initialize a new spider with the specified constraints: good -- sequence of included base URL's. avoid -- sequence of excluded base URL's. keepq -- retain query component of URL's? """ super(ConstrainedSpider, self).__init__() self._incl = tuple(Area(x, keepq) for x in good) self._excl = tuple(Area(x, False) for x in avoid) def inbounds(self, url): """If the given URL is in bounds for this object, return a link object representing that URL; otherwise return False. """ # Check for loops path = urlparse.urlparse(url)[2] comp = path.strip('/').split('/') if len(comp) > 0 and comp[-1] and comp[-1] in comp[:-1]: self.diagnostic('L: %s', url) return False # Check whitelist and blacklist for area in self._incl: if area.contains(url): for bad in self._excl: if bad.contains(url): return False return area.makelink(url) return False def expand_node(self, url): """When expanding a URL, find neighbors only for those URL's that fall within the bounds of this query. Also does ad-hoc surgery to find directories that lack a trailing slash. A common error in URL construction is to write http://host/path/to/dir and omit a trailing slash; this is interpreted correctly by the server, because it can see the underlying resource; but the client has no way to know this without fetching it. The URL spec treats 'dir' as a plain file name, and drops it during base joining. """ if self.inbounds(url) is False: return url, set() real, links = super(ConstrainedSpider, self).expand_node(url) p_url = list(urlparse.urlparse(url)) ptail = p_url[2].split('/').pop() if ptail and '.' not in ptail and not ptail.endswith('/'): p_url[2] += '/' url = urlparse.urlunparse(p_url) seqs = set() for elt in links: u = urlparse.urljoin(url, elt) ok = self.inbounds(u) if not ok: seqs.add(u) # fringe else: seqs.add(ok.expanded()) return real, seqs # }} # {{ class IncrementalSpider class IncrementalSpider (ConstrainedSpider): """Build a graph incrementally with output to a file, one line per edge of the graph. """ def __init__(self, ofp, good = (), avoid = (), keepq = True): """Initialize a new spider. ofp -- file to use for output edges. good -- sequence of included base URL's. avoid -- sequence of excluded base URL's. keepq -- retain query component of URL's? """ super(IncrementalSpider, self).__init__(good, avoid, keepq) self._ofp = ofp def visit_edge(self, src, dst): print >> self._ofp, src + '\t' + dst # }} # {{ class GraphSpider class GraphSpider (ConstrainedSpider): """Build a complete graph representation.""" def start(self, seed): """Initialize the empty graph.""" super(GraphSpider, self).start(seed) self._graph = {} def visit_node(self, url, real = None): """Add a vertex to the graph.""" super(GraphSpider, self).visit_node(url, real) self._graph.setdefault(url, set()) def visit_edge(self, src, dst): """Add an edge to the graph.""" super(GraphSpider, self).visit_edge(src, dst) self._graph[src].add(dst) def finish(self): """Extract and return the completed graph. The result is discarded from the object state so that it can be garbage collected when the user is done with it. """ super(GraphSpider, self).finish() out = self._graph self._graph = None return out # }} # {{ main(argv) def main(argv): """A very simple command-line driver to spider a web site into a file. Usage: spider.py [options] + The graph is written to standard output with one edge per line, in the format , where is the source URL, is the target URL, and is an ASCII tab character. """ def usage(long = False): print >> sys.stderr, "Usage: spider.py [options] +" if long: print >> sys.stderr, """ This program generates a link graph of a web site, generated by following the anchor tags imbedded in HTML documents. The results are written as text output with one line for each link in the graph, having the format: TAB EOL The progress of the search may be restricted using the --include (-i) and --exclude (-e) command-line switches described below.""" print >> sys.stderr, """\nOptions: -D/--debug -- enable debugging output to stderr. -h/--help -- display this help message. -i/--include -- permit URL's under this base to be included. -e/--exclude -- exclude URL's under this base. -q/--query -- keep query portion of URL. -Q/--noquery -- discard query portion of URL (default). Note that the exclusion list overrides the inclusion list. If no URL's are explicitly included, the seed URL's are implicitly added to the inclusion list.\n""" else: print >> sys.stderr, " [use `-h' or `--help' for options]\n" inc = set() # allow URL's under these bases to be visited. exc = set() # never visit URL's under these bases. que = False # keep query portion of URL's? dbg = False # generate diagnostic output? ofp = sys.stdout # send output to this file. try: opts, args = getopt.gnu_getopt(argv, 'Dhi:e:o:qQ', ('include=', 'exclude=', 'output=', 'query', 'noquery', 'debug', 'help')) except getopt.GetoptError, e: print >> sys.stderr, "Error: %s" % e usage() sys.exit(1) for opt, arg in opts: if opt in ('-i', '--include'): inc.add(arg) elif opt in ('-e', '--exclude'): exc.add(arg) elif opt in ('-D', '--debug'): dbg = True elif opt in ('-o', '--output'): try: ofp = file(arg, 'wt') except (IOError, OSError), e: print >> sys.stderr, "Error opening output: %s" % arg print >> sys.stderr, " -- %s\n" % e sys.exit(1) elif opt in ('-q', '--query'): que = True elif opt in ('-Q', '--noquery'): que = False elif opt in ('-h', '--help'): usage(long = True) sys.exit(0) if len(args) == 0: usage() ; sys.exit(1) if not inc: inc.update(args) if dbg: print >> sys.stderr, "[diagnostic output enabled]" for url in args: sp = IncrementalSpider(ofp, inc, exc, que) if dbg: sp._debug = True sp.search(url) # }} if __name__ == "__main__": main(sys.argv[1:]) sys.exit(0) __all__ = ('open_url', 'get_links', 'Area', 'Spider', 'ConstrainedSpider', 'IncrementalSpider', 'GraphSpider') # Here there be dragons