#!/usr/bin/python

import sys
import re
import urllib2
from urlparse import urljoin
from lxml import html, etree 
from lxml.html import ElementSoup

class Spider(object):
	"""Breadth-first crawler restricted to URLs that start with a base URL."""

	def __init__(self, base_url):
		"""Remember `base_url`, the crawl's start page and URL-prefix filter."""
		self.base_url = base_url

	def pages(self):
		"""Crawl the site and yield a `(path, doc)` pair for every HTML page.

		`path` is the page URL with the leading `base_url` removed and `doc`
		is the parsed document with its links made absolute against the page
		URL.  Responses whose Content-Type is neither text/html nor
		application/xhtml+xml are skipped.  Only `<a href>` links that start
		with `base_url` are followed, with any `#fragment` removed, and each
		URL is fetched at most once.

		Errors raised by `urllib2.urlopen` are not caught and propagate to
		the caller.
		"""
		queue = [self.base_url]
		seen = set(queue)

		while queue:
			url = queue.pop(0)
			f = urllib2.urlopen(url)
			try:
				# Decide from the response headers before parsing anything.
				is_html = f.info().gettype() in ('text/html', 'application/xhtml+xml')
				if is_html:
					doc = ElementSoup.parse(f)
			finally:
				# Always release the connection, including for skipped
				# non-HTML responses and when parsing fails.
				f.close()
			if not is_html:
				continue

			doc.make_links_absolute(url)
			for element, attribute, link, pos in doc.iterlinks():
				if not link.startswith(self.base_url):
					continue
				if element.tag == 'a' and attribute == 'href':
					# Drop the fragment so page#a and page#b count as one page.
					l = re.sub(r'#.*$', '', link)
					if l not in seen:
						queue.append(l)
						seen.add(l)

			path = url[len(self.base_url):]
			yield path, doc


class ImageSpider(Spider):
	"""Spider that reports the `<img>` elements found on the crawled pages."""

	def images(self):
		"""Yield `(path, imgs)` for every page that contains new images.

		`imgs` is a list of `(src, alt, title)` tuples taken from the page's
		`<img>` elements; an attribute that is absent is `None`.  A tuple
		is reported only the first time it is seen during the crawl, and
		pages that contribute no new tuple are not yielded.
		"""
		seen = set()
		for path, doc in self.pages():
			imgs = []
			for img in doc.findall('.//img'):
				i = (img.get('src'), img.get('alt'), img.get('title'))
				if i not in seen:
					seen.add(i)
					imgs.append(i)

			if imgs:
				yield path, imgs

	@staticmethod
	def _printable(value, out):
		"""Return `value` in a form that `print >>out` can always write.

		Unicode text is encoded as UTF-8 when `out` declares no encoding
		(for example a file returned by `open()`), because printing it
		there would otherwise fall back to the ASCII codec and fail on
		non-ASCII characters.  Everything else is returned unchanged.
		"""
		if isinstance(value, unicode) and getattr(out, 'encoding', None) is None:
			return value.encode('utf8')
		return value

	def text_report(self, out=sys.stdout):
		"""Write a plain-text listing of every image, grouped by page, to `out`.

		For each image the src is printed, then the alt text (or a notice
		that it is missing), then the title when there is one.
		"""
		for path, imgs in self.images():
			print >>out, 'In', self._printable(path, out)
			for src, alt, title in imgs:
				print >>out, '- src:', self._printable(src, out)
				if alt is not None:
					print >>out, '  alt:', self._printable(alt, out)
				else:
					print >>out, '  alt is MISSING'
				if title is not None:
					print >>out, '  title:', self._printable(title, out)
			print >>out

	def html_report(self, out=sys.stdout):
		"""Write an HTML page showing every image, grouped by page, to `out`.

		Each image is embedded with an `<img>` tag followed by its alt text
		(or a notice that it is missing) and its title when there is one.
		Page paths and image attributes are written UTF-8 encoded.
		"""
		from cgi import escape
		print >>out, """<html>
	<head>
		<title>Image Report for %(base_url)s</title>
	</head>
	<body>
		<h1>Image report for %(base_url)s</h1>
		""" % {'base_url': escape(self.base_url)}

		for path, imgs in self.images():
			print >>out, '\t\t<h2>%s</h2>' % escape(path).encode('utf8')
			for src, alt, title in imgs:
				# The values come from crawled pages and are placed inside
				# double-quoted attributes, so quotes must be escaped too.
				idict = {'src': escape(unicode(src), True).encode('utf8'),
					 'alt': escape(unicode(alt), True).encode('utf8'),
					 'title': escape(unicode(title), True).encode('utf8')}
				if alt is not None:
					print >>out, '\t\t<img src="%(src)s" alt="%(alt)s" />' % idict
					print >>out, '\t\t<p><strong>alt:</strong> %(alt)s</p>' % idict
				else:
					# Leave the attribute out rather than emitting alt="None".
					print >>out, '\t\t<img src="%(src)s" />' % idict
					print >>out, '\t\t<p><strong>alt is MISSING</strong></p>'
				if title is not None:
					print >>out, '\t\t<p><strong>title:</strong> %(title)s</p>' % idict
				print >>out
		print >>out, """	</body>
</html>
"""


# Command-line driver: spider the site given as the single positional
# argument and write an image report in the requested format.
from optparse import OptionParser

op = OptionParser()
op.add_option('-f', '--format', choices=['text', 'html'])
op.add_option('-o', '--outfile')

options, args = op.parse_args()

if len(args) != 1:
	op.error('You must provide a site URL from which to spider images.')

s = ImageSpider(args[0])

if options.outfile:
	out = open(options.outfile, 'w')
else:
	out = sys.stdout

try:
	# Any format other than 'html' (including none given) means text.
	if options.format == 'html':
		s.html_report(out)
	else:
		s.text_report(out)
finally:
	# Close only the file opened above; sys.stdout is left alone.
	if out is not sys.stdout:
		out.close()
