HTMLPage.py
#!/usr/bin/env python3
import urllib.request
from urllib.parse import urlparse
from urllib.parse import urljoin
from urllib.error import URLError
from bs4 import BeautifulSoup
import re
'''
TO DO (sketches below):
* multithreading for loading pages
* request gzip pages from the server to save download size
'''
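# Hedged sketches for the two TO DO items above. Neither is wired into the
# class yet; both are untested outlines that use only the standard library.

def _load_pages_threaded(urls, max_workers=8):
    # TO DO sketch: build HTMLPage objects concurrently with a thread pool.
    # HTMLPage is defined below and is only looked up when this is called.
    from concurrent.futures import ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(HTMLPage, urls))

def _fetch_gzip(url):
    # TO DO sketch: ask the server for a gzip-compressed response and
    # decompress it locally to save download size.
    import gzip
    request = urllib.request.Request(url, headers={'Accept-Encoding': 'gzip'})
    with urllib.request.urlopen(request) as response:
        body = response.read()
        if response.info().get('Content-Encoding') == 'gzip':
            body = gzip.decompress(body)
    return body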
class HTMLPage:
    def __init__(self, url):
        self.url = url
        self.soup = self.load(url)
        # load() returns None on failure, so guard the dependent lookups
        self.links = self.findlinks(self.soup) if self.soup else []
        self.images = self.soup.find_all('img') if self.soup else []
    def load(self, url, returnsoup=True):
        try:
            response = urllib.request.urlopen(url)
            # Ensure BeautifulSoup is only given HTML pages
            if returnsoup and 'text/html' in response.info().get('Content-type', ''):
                return BeautifulSoup(response.read(), 'html.parser')
            return response.read()
        except (URLError, ValueError) as e:
            print("HTMLPage.load failed for", url, "-", e)
            return None
    def findlinks(self, soup):
        # Convert every <a href> on the page into an absolute URL;
        # anchors without an href attribute are skipped
        return [urljoin(self.url, link['href'])
                for link in soup.find_all('a', href=True)]
    def linkstatus(self, url):
        soup = self.load(url)
        if not isinstance(soup, BeautifulSoup):
            # Non-HTML content cannot be fragment-checked; treat any
            # successful load as a working link
            return soup is not None
        frag = urlparse(url).fragment
        # A fragment must match an element id or an <a name="..."> anchor
        # (the original soup.find(name=frag) searched for a *tag* called frag)
        if frag == '' or soup.find(id=frag) or soup.find(attrs={'name': frag}):
            return True
        return False
    # Custom filter for the gettext function to find visible text on the page
    def visibletext(self, tag):
        no_content_tags = ['style', 'script', '[document]', 'head', 'title']
        return tag.string is not None and tag.name not in no_content_tags
    # TO DO: DRY the function.
    # NLTK is a dedicated Python module; probably a better way to tokenize
    # the strings into words (see the sketch after this function).
    def gettext(self, *args):
        lst = []
        if len(args) == 0:
            # Return all text that displays on the page
            tags = self.soup.find_all(self.visibletext)
            for tag in tags:
                lst += re.sub("[^a-zA-Z0-9]+", " ", tag.string).split(" ")
            return lst
        else:
            # Return text from the requested tags only
            for tag in args:
                for tagsoup in self.soup.find_all(tag):
                    for string in tagsoup.stripped_strings:
                        lst += string.split(' ')
            return lst
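    # Hedged sketch for the NLTK note above: nltk.word_tokenize splits text
    # into words more robustly than a regex. Assumes nltk is installed and
    # its 'punkt' tokenizer data has been downloaded; not called by gettext.
    def gettext_nltk(self):
        from nltk.tokenize import word_tokenize  # third-party: pip install nltk
        words = []
        for tag in self.soup.find_all(self.visibletext):
            words += word_tokenize(tag.string)
        return words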
    def linkreport(self):
        # Tally how many of the page's links resolve ('on') versus fail ('off')
        report = {'on': 0, 'off': 0}
        for link in self.links:
            if self.linkstatus(link):
                report['on'] += 1
            else:
                report['off'] += 1
        return report
    def resource_exists(self, url):
        url = urljoin(self.url, url)
        try:
            urllib.request.urlopen(url)
            return True
        except (URLError, ValueError):
            return False
    # Collate data relating to a tag.
    # Arguments include all potential tag attributes plus the following special ones:
    #   content: returns the text content of the tag
    #   validate-src: tests whether the src loads
    #   validate-href: tests whether the href loads
    def tag_data(self, tag, *args):
        tags = self.soup.find_all(tag)
        data = {
            'head': [],
            'body': [[] for _ in range(len(tags))],
        }
        for arg in args:
            data['head'].append(arg)
            # 't' avoids shadowing the 'tag' parameter, which the original loop did
            for j, t in enumerate(tags):
                if arg == 'content':
                    value = t.text
                elif arg == 'validate-href':
                    value = '✓' if t.has_attr('href') and self.resource_exists(t['href']) else '✕'
                elif arg == 'validate-src':
                    value = '✓' if t.has_attr('src') and self.resource_exists(t['src']) else '✕'
                else:
                    value = t[arg] if t.has_attr(arg) else ""
                data['body'][j].append(value)
        return data
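# Minimal usage sketch (not part of the original module): the URL is a
# placeholder, and network access is required for anything useful to happen.
if __name__ == '__main__':
    page = HTMLPage('https://example.com/')  # placeholder URL
    if page.soup:
        print('links found:', len(page.links))
        print('images found:', len(page.images))
        print('heading words:', page.gettext('h1', 'h2'))
        print('image data:', page.tag_data('img', 'src', 'alt'))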