#!/usr/bin/env python
##
## Name: getlinks
## Purpose: Extract hyperlinks from an HTML document.
## Author: M. J. Fromberger
## Copyright (C) 2004-2007 M. J. Fromberger, All Rights Reserved.
## Info: $Id: getlinks 460 2007-10-30 15:58:42Z sting $
##
## Usage: getlinks [options] <url> [<output-file>]
##
## Notes:
## Written as a demonstration program for a lecture on Python given to
## W. M. McKeeman's course in Programming Languages at Dartmouth
## College, Hanover, New Hampshire, in the Spring term of 2004.
##
import sys, re
from urllib2 import urlopen, URLError
from urlparse import urlparse, urlunparse
from getopt import getopt, GetoptError
# From: http://www.crummy.com/software/BeautifulSoup/
import BeautifulSoup as soup
HTMLParser = soup.BeautifulSoup
# {{ class Link
class Link (object):
    """A hyperlink extracted from an HTML document.

    Wraps one parsed anchor element and lazily computes three views of
    it, each cached after the first call: descriptive text
    (get_text), an HTML synopsis (get_html) and a normalized URL
    (get_url).

    Relative URLs are resolved against the class attribute
    Link.base_url, which is shared by every instance.
    """
    # Base URL used by expand_url(); set once per document by the caller.
    base_url = ''

    def __init__(self, info):
        """Construct a new link object from a BeautifulSoup.Tag.

        Only info.contents and info.get() are used by this class.
        """
        self._link = info
        self._text = None   # cache for get_text()
        self._html = None   # cache for get_html()
        self._url = None    # cache for get_url()

    def __cmp__(self, other):
        """Order links by content first, then by URL."""
        return cmp(self.get_text(), other.get_text()) or \
               cmp(self.get_url(), other.get_url())

    @staticmethod
    def expand_url(url):
        """Attempt to expand a URL by filling in the base URL.  The
        method used here is ad hoc, and not guaranteed to work, but it
        seems to do well in practice.

        Every leading component that is empty in `url' (scheme, host,
        path, ...) is copied from the base URL, stopping at the first
        component `url' supplies itself.  If that component is a
        relative path, it is appended to the directory part of the
        base path.

        The base URL is obtained from Link.base_url, and should be a
        string.  Returns the expanded URL as a string.
        """
        p_base = urlparse(Link.base_url)
        p_url = list(urlparse(url))

        for pos in range(len(p_base)):
            if p_url[pos]:
                break
            if p_base[pos]:
                p_url[pos] = p_base[pos]

        # Handle relative pathnames: component 2 is the path, so we
        # stopped on a path that `url' itself provided.
        if pos == 2 and p_url[pos] and p_url[pos][0] != '/':
            slix = p_base[pos].rfind('/')
            p_url[pos] = p_base[pos][:slix + 1] + p_url[pos]

        return urlunparse(tuple(p_url))

    def get_text(self):
        """Extract or invent descriptive text for this link.

        The first non-blank string child of the link wins, with runs
        of two or more whitespace characters collapsed to one space.
        For links whose primary content is an image, the ALT attribute
        is used if available and nonempty, together with the SRC
        attribute (or a placeholder when SRC is missing or empty).
        If neither kind of child is found, the raw HREF is used, or
        '(no description)' when there is no HREF.
        """
        if self._text is None:
            for item in self._link.contents:
                if isinstance(item, soup.NavigableString) and \
                   item.string.strip():
                    self._text = re.sub(r'\s{2,}', ' ', item.string.strip())
                    break
                elif isinstance(item, soup.Tag) and item.name == 'img':
                    # Either attribute may be absent; treat that as empty
                    # rather than calling .strip() on None.
                    alt = (item.get('alt') or '').strip()
                    src = (item.get('src') or '').strip() or "no source found"
                    if alt:
                        self._text = '[image] "%s" (%s)' % (alt, src)
                    else:
                        self._text = '[image] source: %s' % src
                    break
            else:
                # No usable child: fall back on the link target itself.
                self._text = self._link.get('href', '(no description)')

        return self._text

    def get_html(self):
        """Extract a descriptive HTML synopsis of the link.  For
        images, a link to the image source is included.

        Returns an anchor element pointing at the expanded link URL.
        Raises ValueError (from get_url) if the link has no HREF.
        """
        if self._html is None:
            href = self.get_url()
            for item in self._link.contents:
                if isinstance(item, soup.NavigableString) and \
                   item.string.strip():
                    self._html = '<a href="%s">%s</a>' % \
                                 ( href,
                                   re.sub(r'\s{2,}', ' ', item.string.strip()) )
                    break
                elif isinstance(item, soup.Tag) and item.name == 'img':
                    alt = (item.get('alt') or '').strip()
                    src = (item.get('src') or '').strip()
                    label = alt or src
                    if not label:
                        # Image with neither ALT nor SRC: keep looking.
                        continue
                    if src:
                        t = '<a href="%s">[image]</a> ' % self.expand_url(src)
                    else:
                        t = '[image] '
                    self._html = t + '<a href="%s">%s</a>' % (href, label)
                    break
            else:
                self._html = '<a href="%s">%s</a>' % \
                             ( href, '(no description)' )

        return self._html

    def get_url(self):
        """Extract a normalized URL from the link.  See
        Link.expand_url() for more information.

        Raises ValueError if the link has no HREF attribute.
        """
        if self._url is None:
            raw_url = self._link.get('href')
            if raw_url is None:
                raise ValueError("Unable to find HREF in link")
            self._url = self.expand_url(raw_url)

        return self._url
# }}
# {{ extract_links(url)
def extract_links(url):
    """Return a tuple of (real_url, title, links) giving the resolved
    URL for the specified target, the title of the document, and a
    list of all the hyperlinks found there, possibly empty.

    The title is None when the document has no <title> element.  The
    links are Link objects for every anchor that has an HREF, in
    sorted order.

    Side effect: Link.base_url is set to the resolved URL, so that
    relative links in this document expand correctly.

    Errors raised by urlopen() propagate to the caller unchanged.
    """
    # Open outside the try block: if urlopen() fails there is nothing
    # to close, and the original error must reach the caller.
    u = urlopen(url)
    try:
        src_url = u.url
        data = u.read()
    finally:
        u.close()

    p = HTMLParser(data)
    title_tag = p.first('title')
    title = title_tag and title_tag.renderContents().strip()

    Link.base_url = src_url
    return (src_url, title,
            sorted(Link(info) for info in p.fetch('a')
                   if info.get('href') is not None))
# }}
# {{ main(argv)
def _drop_adjacent_duplicates(links):
    """Remove, in place, every link equal to its successor; return the
    number of entries removed.  Expects `links' to be sorted already."""
    pos = 0
    removed = 0
    while pos < len(links) - 1:
        if links[pos] == links[pos + 1]:
            links.pop(pos)
            removed += 1
        else:
            pos += 1
    return removed


def _write_text(ofp, title, src_url, links):
    """Write the human-readable report: a short header, then one
    numbered entry per link."""
    ofp.write("Title: %s\n" % title)
    ofp.write("URL: <%s>\n" % src_url)
    ofp.write("Links: %d\n" % len(links))
    ofp.write("\n")
    for pos, link in enumerate(links):
        ofp.write("%d.\t%s\n\t%s\n\n" %
                  (pos + 1, link.get_text(), link.get_url()))


def _write_pipe(ofp, title, src_url, links):
    """Write one tab-separated `text<TAB>url' line per link, preceded
    by two `#' comment lines giving the title and source URL."""
    ofp.write("# Title: %s\n" % title)
    ofp.write("# URL: <%s>\n" % src_url)
    for link in links:
        # Tabs inside the text would break the two-column layout.
        ofp.write('%s\t%s\n' %
                  (link.get_text().replace('\t', ' '),
                   link.get_url()))


def _write_html(ofp, title, src_url, links):
    """Write a complete HTML page listing every link."""
    ofp.write('<html>\n'
              '<head><title>Links from %s</title></head>\n'
              '<body>\n'
              '<h1>%d links found at<br />\n'
              '<a href="%s">&lt;%s&gt;</a></h1>\n'
              '<ul>\n'
              % ((title and ('"%s"' % title)) or 'untitled document',
                 len(links), src_url, src_url))
    for link in links:
        ofp.write('<li>%s<br />\n'
                  '  <tt>%s</tt></li>\n\n' % ( link.get_html(),
                                               link.get_url() ))
    ofp.write('</ul>\n</body>\n</html>\n')


def main(argv):
    """Command-line driver: load a URL and report the links found.

    `argv' is the full argument vector, program name first.  After
    the options it holds the URL to load and, optionally, the name of
    an output file (standard output is used otherwise).

    Exits via sys.exit() with status 0 after printing help, 1 for a
    usage error, 2 if the output file cannot be opened, 3 if the URL
    cannot be loaded, and 4 if the URL is not valid.  Returns None on
    success.
    """
    # Nested functions, visible only inside main().
    def usage(full = False):
        sys.stderr.write("Usage: getlinks [-d|-p|-h|-t] "
                         "<url> [<output-file>]\n")
        if full:
            sys.stderr.write("\nOptions:\n")
            sys.stderr.write(
                " -d/--nodups -- only one copy of each URL found.\n"
                " -p/--pipe   -- output suitable for piping.\n"
                " -h/--html   -- output in HTML format.\n"
                " -t/--text   -- output in human-readable text (default).\n"
                " -H/--help   -- display this help message.\n"
                "\n")
        else:
            sys.stderr.write(" [use `-H' or `--help' for options]\n")

    def diag(msg, *args):
        if diagnostics[0]:
            sys.stderr.write(msg % args + "\n")

    # Configuration options
    output_fmt  = 'text'   # options:  pipe, text, html
    diagnostics = [True]   # if false, suppress diagnostic output
    filter_dup  = False    # if true, remove duplicate URLs from output

    # Check command-line arguments
    try:
        opts, args = getopt(argv[1:], 'dpthH',
                            ('nodups', 'help', 'pipe', 'text', 'html'))
    except GetoptError as e:
        sys.stderr.write("Error: %s; use `--help' for assistance\n" % e)
        usage()
        sys.exit(1)

    for key, arg in opts:
        if key in ('-H', '--help'):
            usage(True)
            sys.exit(0)
        elif key in ('-d', '--nodups'):
            filter_dup = True
        elif key in ('-t', '--text'):
            output_fmt = 'text'
        elif key in ('-h', '--html'):
            output_fmt = 'html'
        elif key in ('-p', '--pipe'):
            output_fmt = 'pipe'
            # Piped output should not be mixed with chatter on stderr.
            diagnostics[0] = False

    if len(args) == 0:
        sys.stderr.write("You must specify a URL to load from.\n")
        usage()
        sys.exit(1)

    # If provided, open the output file named on the command line
    if len(args) > 1:
        try:
            ofp = open(args[1], 'w')
        except (IOError, OSError) as e:
            # open() raises IOError on Python 2; catch both spellings.
            sys.stderr.write("Unable to open file `%s' for writing: %s\n" %
                             (args[1], e))
            sys.exit(2)
    else:
        ofp = sys.stdout

    try:
        # Connect to the specified URL, and download whatever is there
        try:
            src_url, title, links = extract_links(args[0])
        except URLError as e:
            sys.stderr.write("Error loading URL <%s>:\n -- %s\n" %
                             (args[0], e))
            sys.exit(3)
        except ValueError:
            sys.stderr.write("The argument `%s' is not a valid URL\n" %
                             args[0])
            sys.exit(4)
        diag('[loaded %s]', src_url)

        # If requested, remove duplicate entries
        if filter_dup:
            nd = _drop_adjacent_duplicates(links)
            diag('[filtered out %d duplicate entr%s]', nd,
                 "y" if nd == 1 else "ies")

        if output_fmt != 'pipe':
            diag('[found %d link%s]', len(links),
                 "s" if len(links) != 1 else "")
        diag('[writing to %s in %s format]', ofp.name, output_fmt)

        # Links arrive sorted by their descriptive text, then by URL
        # (the order extract_links() returns them in).
        if output_fmt == 'text':
            _write_text(ofp, title, src_url, links)
        elif output_fmt == 'pipe':
            _write_pipe(ofp, title, src_url, links)
        elif output_fmt == 'html':
            _write_html(ofp, title, src_url, links)
        else:
            sys.stderr.write("Unknown output format: %s\n" % output_fmt)
    finally:
        # Close the output file on every path, including sys.exit().
        if ofp is not sys.stdout:
            ofp.close()
# }}
# Start driver if this script is the main module (run directly rather
# than imported).  main() receives the full argument vector and skips
# the program name in argv[0] itself.
if __name__ == '__main__':
    main(sys.argv)
# Here there be dragons