1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python
# ogm-samples.py
# Author: Matt Mayes
# March 11, 2008
"""
-- This requires the Beautiful Soup module: http://www.crummy.com/software/BeautifulSoup/ --
Steps:
1. Identify all <ul>s that are preceded by '<font color="#3C378C" size="2">' (which denotes a header here).
2. Pull that font text and store it as a dictionary key.
3. Extract all links and link text from the list, generate a link title and type (pdf/html/404), and store them as
tuples under the appropriate dict key. (Note that some list items contain more than one link; this handles it.) If a
link is a 404, it will not be added to the list.
4. Identify whether each link points to an HTML page or a PDF.
5. If it's a local PDF referenced by a root-relative path ("/file.pdf"), the leading slash is stripped. Modify to suit your needs.
6. Generate a CSV file of the results.
"""

import csv
import re
import urllib2

from BeautifulSoup import BeautifulSoup

# Fetch and parse the sample page. `page` is left open here and closed once
# the scraping loop further down has finished with the parsed tree.
page = urllib2.urlopen("http://www.givegoodweb.com/examples/ogm-samples.html")
soup = BeautifulSoup(page)

# Opening <font ...> tag with arbitrary attributes. The previous pattern used a
# hand-picked character class (no '#', '.', '/', ':' ...) and an optional '>',
# so a tag such as <font color="#3C378C"> was only partially removed.
fontStart = re.compile(r'<font\b[^>]*>')
fontEnd = re.compile(r'</font>')
# Not referenced by the code visible in this script; kept in case other code
# relies on the name.
titleSearch = re.compile(r'title=')
# Non-greedy and case-insensitive so that <TITLE> / <title attr="..."> are
# recognised and the match stops at the first closing tag. DOTALL lets the
# title span several lines.
getTitle = re.compile(r'<title[^>]*>(.*?)</title>', re.DOTALL | re.IGNORECASE)
emailSearch = re.compile(r'mailto')

def removeNL(x):
	"""Collapse a multi-line string onto a single line.

	The string is split on newlines, each piece is stripped of surrounding
	whitespace, empty pieces are dropped, and the rest are joined with single
	spaces. Whitespace inside a line is left untouched.

	x -- the string to clean
	Returns the cleaned string with no leading or trailing whitespace.
	"""
	pieces = [segment.strip() for segment in x.split('\n')]
	# Dropping empty pieces avoids the stray/double spaces that blank lines and
	# a trailing newline would otherwise leave behind after joining.
	return " ".join([piece for piece in pieces if piece])

def _classify_link(href):
	"""Work out the link type and a display title for *href*.

	Returns a (kind, title) tuple where kind is 'pdf', 'email' or 'html'.
	Returns None when an HTML link cannot be fetched (HTTP error, unreachable
	host, or a URL that urllib2 cannot open such as a relative path), so the
	caller can leave the link out of the results.
	"""
	if href[-3:].lower() == 'pdf':
		return ('pdf', "PDF sample")
	if emailSearch.search(href):
		return ('email', 'email')
	try:
		response = urllib2.urlopen(href)
	except (urllib2.URLError, ValueError):
		# HTTPError is a subclass of URLError; ValueError is raised for
		# URLs without a usable scheme.
		return None
	try:
		# reading in 2000 characters should do it
		found = getTitle.search(response.read(2000))
	finally:
		# Only close a handle that was actually opened.
		response.close()
	if found:
		return ('html', removeNL(found.group(1)))
	return ('html', "open link")


# Maps each header text to a list of "lines" (one per <li>); every line is a
# list of (link text, href, kind, title) tuples, one per usable link.
# NOTE(review): a later <ul> under the same header replaces the earlier entry,
# as before -- confirm whether such lists should be merged instead.
ul_tags = {}

for ul in soup.html.body.findAll('ul'):
	header = ul.findPrevious('font', color="#3C378C")
	if header is None:
		# No header font precedes this list, so there is no key to file it under.
		continue
	heading = header.renderContents()
	if '\n' in heading:
		heading = removeNL(heading)
	links = []
	for li in ul.findAll('li'):
		line = []
		for a in li.findAll('a'):
			target = a.get('href')
			if target is None:
				# Named anchors (<a name="...">) carry no link to record.
				continue
			href = str(target)
			if a.contents:
				text = removeNL(str(a.contents[0]))
			else:
				text = ''
			text = fontStart.sub('', text)
			text = fontEnd.sub('', text)
			classified = _classify_link(href)
			if classified is None:
				continue
			kind, title = classified
			line.append((text, href.lstrip('/'), kind, title))
		links.append(line)
	ul_tags[heading] = links

page.close()

# Write one row per link: header, link text, href, kind, title. The csv module
# quotes fields that contain commas or quotes, which hand-built '%s, %s' rows
# did not. 'wb' is the mode the Python 2 csv module expects.
out = open('samples.csv', 'wb')
try:
	writer = csv.writer(out)
	for heading in ul_tags:
		for line in ul_tags[heading]:
			for text, href, kind, title in line:
				writer.writerow((heading, text, href, kind, title))
finally:
	out.close()