#!/usr/bin/env python
##
## Name:     getlinks
## Purpose:  Extract hyperlinks from an HTML document.
## Author:   M. J. Fromberger
## Copyright (C) 2004-2007 M. J. Fromberger, All Rights Reserved.
## Info:     $Id: getlinks 460 2007-10-30 15:58:42Z sting $
##
## Usage: getlinks [options] <url> [<output-file>]
##
## Notes:
## Written as a demonstration program for a lecture on Python given to
## W. M. McKeeman's course in Programming Languages at Dartmouth
## College, Hanover, New Hampshire, in the Spring term of 2004.
##
## This is a Python 2 program (urllib2, urlparse, BeautifulSoup 3); the
## `except ... as` syntax used below needs Python 2.6 or later.
##

import re
import sys
from getopt import getopt, GetoptError
from urllib2 import urlopen, URLError
from urlparse import urlparse, urlunparse

# From: http://www.crummy.com/software/BeautifulSoup/
import BeautifulSoup as soup

HTMLParser = soup.BeautifulSoup

# Matches a run of two or more whitespace characters; compiled once because
# it is applied to the text of every link.
_WHITESPACE_RUN = re.compile(r'\s{2,}')


def _attr(tag, name):
    """Return attribute `name` of `tag` stripped, or '' if it is missing."""
    return (tag.get(name) or '').strip()


# {{ class Link

class Link (object):
    """A class to represent a hyperlink extracted from an HTML document.

    The text, HTML synopsis and URL are computed lazily on first request
    and cached on the instance.
    """

    # Base URL used by expand_url(); set by extract_links() to the URL of
    # the document the links were read from.
    base_url = ''

    def __init__(self, info):
        """Construct a new link object from a BeautifulSoup.Tag."""
        self._link = info
        self._text = None
        self._html = None
        self._url = None

    def __cmp__(self, other):
        """Order links by content first, then by URL."""
        return cmp(self.get_text(), other.get_text()) or \
               cmp(self.get_url(), other.get_url())

    @staticmethod
    def expand_url(url):
        """Attempt to expand a URL by filling in the base URL.

        The method used here is ad hoc, and not guaranteed to work, but it
        seems to do well in practice.  The base URL is obtained from
        Link.base_url, and should be a string.

        Leading components (scheme, host, path, ...) that are empty in
        `url` are copied from the base URL, stopping at the first component
        `url` supplies itself.  If that component is a relative path, it is
        resolved against the directory part of the base path.
        """
        p_base = urlparse(Link.base_url)
        p_url = list(urlparse(url))

        for pos in range(len(p_base)):
            if p_url[pos]:
                break
            if p_base[pos]:
                p_url[pos] = p_base[pos]

        # Handle relative pathnames: `pos` is where the loop stopped, and
        # index 2 is the path component of the parsed URL.
        if pos == 2 and p_url[pos] and p_url[pos][0] != '/':
            slix = p_base[pos].rfind('/')
            p_url[pos] = p_base[pos][:slix + 1] + p_url[pos]

        return urlunparse(tuple(p_url))

    def get_text(self):
        """Extract or invent descriptive text for this link.

        The first non-blank text node inside the link is used, with runs of
        whitespace collapsed.  For links whose primary content is an image,
        the ALT attribute is used if available and nonempty; otherwise the
        SRC attribute is used if available and nonempty.  If the link has
        neither text nor an image, its HREF is used.
        """
        if self._text is None:
            for item in self._link.contents:
                if isinstance(item, soup.NavigableString) and \
                   item.string.strip():
                    self._text = _WHITESPACE_RUN.sub(' ',
                                                     item.string.strip())
                    break
                elif isinstance(item, soup.Tag) and item.name == 'img':
                    alt = _attr(item, 'alt')
                    # An <img> without SRC used to crash here on None.
                    src = _attr(item, 'src') or "no source found"
                    if alt:
                        self._text = '[image] "%s" (%s)' % (alt, src)
                    else:
                        self._text = '[image] source: %s' % src
                    break
            else:
                # No usable text node or image was found.
                self._text = self._link.get('href', '(no description)')

        return self._text

    def get_html(self):
        """Extract a descriptive HTML synopsis of the link.

        For images, a link to the image source is included.

        Raises ValueError (from get_url) if the link has no HREF.
        """
        # NOTE(review): the anchor markup in the format strings below was
        # reconstructed -- the copy of this file under review had its tags
        # stripped, leaving format strings that raised TypeError.  Confirm
        # the exact markup against the upstream original.
        if self._html is None:
            href = self.get_url()
            for item in self._link.contents:
                if isinstance(item, soup.NavigableString) and \
                   item.string.strip():
                    self._html = '<a href="%s">%s</a>' % \
                        (href,
                         _WHITESPACE_RUN.sub(' ', item.string.strip()))
                    break
                elif isinstance(item, soup.Tag) and item.name == 'img':
                    # Prefer ALT as the label, then SRC; an image with
                    # neither is skipped in favour of later content.
                    label = _attr(item, 'alt') or _attr(item, 'src')
                    if not label:
                        continue
                    self._html = \
                        '<a href="%s">[image]</a> <a href="%s">%s</a>' % \
                        (self.expand_url(item.get('src') or ''),
                         href, label)
                    break
            else:
                self._html = '<a href="%s">%s</a>' % \
                             (href, '(no description)')

        return self._html

    def get_url(self):
        """Extract a normalized URL from the link.

        See Link.expand_url() for more information.

        Raises ValueError if the underlying tag has no HREF attribute.
        """
        if self._url is None:
            raw_url = self._link.get('href')
            if raw_url is None:
                raise ValueError("Unable to find HREF in link")
            self._url = self.expand_url(raw_url)

        return self._url

# }}

# {{ extract_links(url)

def extract_links(url):
    """Return a tuple of (real_url, title, links) giving the resolved
    URL for the specified target, the title of the document, and a
    list of all the hyperlinks found there, possibly empty.

    The links are Link objects, sorted by descriptive text and then by
    URL; anchors without an HREF are ignored.  As a side effect,
    Link.base_url is set to the resolved URL.

    Errors raised by urlopen() (e.g. URLError, or ValueError for a
    malformed URL) propagate to the caller.
    """
    # Open outside the try block: if urlopen() itself fails there is no
    # handle to close, and referring to it in `finally` would replace the
    # real error with an UnboundLocalError.
    u = urlopen(url)
    try:
        src_url = u.url
        data = u.read()
    finally:
        u.close()

    p = HTMLParser(data)
    title_tag = p.first('title')
    title = title_tag and title_tag.renderContents().strip()

    Link.base_url = src_url
    return (src_url, title,
            sorted(Link(info) for info in p.fetch('a')
                   if info.get('href') is not None))

# }}

# {{ output helpers

def _remove_adjacent_duplicates(links):
    """Collapse runs of equal adjacent links.

    Returns (unique_links, number_removed).  The last link of each run is
    the one kept.
    """
    unique = []
    removed = 0
    for link in links:
        if unique and unique[-1] == link:
            unique[-1] = link
            removed += 1
        else:
            unique.append(link)
    return unique, removed


def _write_text(ofp, src_url, title, links):
    """Write the links to ofp as a numbered, human-readable listing."""
    ofp.write("Title: %s\n" % title)
    ofp.write("URL: <%s>\n" % src_url)
    ofp.write("Links: %d\n" % len(links))
    ofp.write("\n")
    for pos, link in enumerate(links):
        ofp.write("%d.\t%s\n\t%s\n\n" %
                  (pos + 1, link.get_text(), link.get_url()))


def _write_pipe(ofp, src_url, title, links):
    """Write the links to ofp one per line as text<TAB>url."""
    ofp.write("# Title: %s\n" % title)
    ofp.write("# URL: <%s>\n" % src_url)
    for link in links:
        # Tabs inside the text would break the two-column format.
        ofp.write('%s\t%s\n' % (link.get_text().replace('\t', ' '),
                                link.get_url()))


def _write_html(ofp, src_url, title, links):
    """Write the links to ofp as an HTML document with an ordered list."""
    # NOTE(review): the markup below was reconstructed -- the copy of this
    # file under review had its tags stripped, leaving a template with
    # fewer placeholders than arguments.  Confirm against upstream.
    ofp.write('<html>\n'
              '<head><title>Links from %s</title></head>\n'
              '<body>\n'
              '<h1>%d links found at '
              '&lt;<a href="%s">%s</a>&gt;</h1>\n'
              '<ol>\n' %
              ((title and ('"%s"' % title)) or 'untitled document',
               len(links), src_url, src_url))
    for link in links:
        ofp.write('<li>%s<br />\n'
                  '  <tt>%s</tt></li>\n\n' %
                  (link.get_html(), link.get_url()))
    ofp.write('</ol>\n\n</body>\n</html>\n')


# Maps each output format name to the function that writes it.
_WRITERS = {
    'text': _write_text,
    'pipe': _write_pipe,
    'html': _write_html,
}

# }}

# {{ main(argv)

def main(argv):
    """Command-line driver: getlinks [options] <url> [<output-file>].

    argv is the full argument vector, program name included.  Exits via
    sys.exit() with status 1 for usage errors, 2 if the output file cannot
    be opened, 3 if the URL cannot be loaded, and 4 if the URL is invalid.
    """
    err = sys.stderr.write

    # Nested function, visible only inside main().
    def usage(verbose=False):
        err("Usage: getlinks [-d|-p|-h|-t] <url> [<output-file>]\n")
        if verbose:
            err("\nOptions:\n")
            err(" -d/--nodups -- only one copy of each URL found.\n"
                " -p/--pipe -- output suitable for piping.\n"
                " -h/--html -- output in HTML format.\n"
                " -t/--text -- output in human-readable text (default).\n"
                " -H/--help -- display this help message.\n"
                "\n")
        else:
            err(" [use `-H' or `--help' for options]\n")

    def diag(msg, *args):
        if diagnostics:
            err((msg % args) + "\n")

    # Configuration options
    output_fmt = 'text'   # options: pipe, text, html
    diagnostics = True    # if false, suppress diagnostic output
    filter_dup = False    # if true, remove duplicate URLs from output

    # Check command-line arguments
    try:
        opts, args = getopt(argv[1:], 'dpthH',
                            ('nodups', 'help', 'pipe', 'text', 'html'))
    except GetoptError as e:
        err("Error: %s; use `--help' for assistance\n" % e)
        usage()
        sys.exit(1)

    for key, arg in opts:
        if key in ('-H', '--help'):
            usage(True)
            sys.exit(0)
        elif key in ('-d', '--nodups'):
            filter_dup = True
        elif key in ('-t', '--text'):
            output_fmt = 'text'
        elif key in ('-h', '--html'):
            output_fmt = 'html'
        elif key in ('-p', '--pipe'):
            # Piped output should not be mixed with chatter.
            output_fmt = 'pipe'
            diagnostics = False

    if not args:
        err("You must specify a URL to load from.\n")
        usage()
        sys.exit(1)

    # If provided, open the output file named on the command line
    if len(args) > 1:
        try:
            ofp = open(args[1], 'w')
        except EnvironmentError as e:
            # open() raises IOError in Python 2, which `except OSError`
            # does not catch; EnvironmentError covers both.
            err("Unable to open file `%s' for writing: %s\n" % (args[1], e))
            sys.exit(2)
    else:
        ofp = sys.stdout

    try:
        # Connect to the specified URL, and download whatever is there
        try:
            src_url, title, links = extract_links(args[0])
            diag('[loaded %s]', src_url)
        except URLError as e:
            err("Error loading URL <%s>:\n -- %s\n" % (args[0], e))
            sys.exit(3)
        except ValueError:
            err("The argument `%s' is not a valid URL\n" % args[0])
            sys.exit(4)

        # If requested, remove duplicate entries.  The links are sorted, so
        # equal entries (same text and same URL) are adjacent.
        if filter_dup:
            links, nd = _remove_adjacent_duplicates(links)
            diag('[filtered out %d duplicate entr%s]',
                 nd, "y" if nd == 1 else "ies")

        if output_fmt != 'pipe':
            diag('[found %d link%s]',
                 len(links), "s" if len(links) != 1 else "")
            diag('[writing to %s in %s format]', ofp.name, output_fmt)

        # Display links in lexicographic order by their descriptive text.
        # Remove the sorted() call in extract_links() to get them in order
        # of occurrence.
        writer = _WRITERS.get(output_fmt)
        if writer is None:
            err("Unknown output format: %s\n" % output_fmt)
        else:
            writer(ofp, src_url, title, links)
    finally:
        # Runs on the sys.exit() paths too, so the file is always closed.
        if ofp is not sys.stdout:
            ofp.close()

# }}

# Start driver if this script is the main module
if __name__ == '__main__':
    main(sys.argv)

# Here there be dragons