-
Notifications
You must be signed in to change notification settings - Fork 11
/
modify.py
executable file
·123 lines (103 loc) · 3.85 KB
/
modify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python3
import glob
from bs4 import BeautifulSoup
import urllib
import os
import requests
import sys
# Hostname substituted for www.australia.gov.au in absolute <link> hrefs.
# TODO: figure out the right new hostname.
NEW_HOSTNAME = 'foo.apps.y.cld.gov.au'

# Passed as `timeout=` to requests.get() when following redirects.
REQUESTS_TIMEOUT = 2
class HtmlFile(object):
    """One HTML file on disk whose links are rewritten in place.

    Call process() to parse the file, replace anchors that redirect away
    from www.australia.gov.au with their final destination, and write the
    file back only if something changed.
    """

    # Class-level defaults; __init__() and process() set the per-instance
    # values. `filecontents` is never assigned by any method of this class
    # and is kept only so existing attribute access keeps working.
    filename = None
    filecontents = None
    filechanged = None
    soup = None

    def __init__(self, filename):
        """Remember the path of the file; nothing is read until process()."""
        self.filename = filename

    def __check_redirects(self):
        """Replace redirecting <a> hrefs with the final redirect destination.

        Every anchor whose href starts with https://www.australia.gov.au is
        fetched; if the final URL (after redirects) no longer starts with
        that prefix, the anchor is updated and `filechanged` is set.
        A request that fails (timeout, connection error, ...) is reported
        and the anchor is left untouched, so one dead link does not abort
        the whole run.
        """
        print('checking for redirects')
        prefix = 'https://www.australia.gov.au'
        for anchor in self.soup.find_all('a'):
            href = anchor.get('href')
            if href is None or not href.startswith(prefix):
                continue
            try:
                response = requests.get(href, timeout=REQUESTS_TIMEOUT)
            except requests.RequestException as err:
                print('Skipping %s: %s' % (href, err))
                continue
            if not response.url.startswith(prefix):
                print('Replacing %s with %s' % (href, response.url))
                anchor['href'] = response.url
                self.filechanged = True

    def __check_absolute_links(self):
        """Point absolute <link> hrefs on www.australia.gov.au at NEW_HOSTNAME.

        Not called by process(): it was disabled "until fixed" and stays
        disabled here. <link> elements without an href are skipped.
        """
        # `import urllib` alone does not guarantee the `parse` submodule is
        # loaded, so import what is needed explicitly.
        from urllib.parse import urlparse

        print('checking absolute links to self')
        for link in self.soup.find_all('link'):
            href = link.get('href')
            if not href:
                continue
            if urlparse(href).netloc == 'www.australia.gov.au':
                link['href'] = href.replace('www.australia.gov.au', NEW_HOSTNAME)
                self.filechanged = True

    def process(self):
        """Parse the file, rewrite redirecting anchors, and save if changed."""
        print(self.filename)
        with open(self.filename, 'r', encoding='utf-8') as file:
            filecontents = file.read()
        self.soup = BeautifulSoup(filecontents, features='html.parser')
        self.filechanged = False

        self.__check_redirects()
        # disabling this until fixed
        #self.__check_absolute_links()

        if self.filechanged:
            print('Writing out changed %s' % self.filename)
            with open(self.filename, 'w', encoding='utf-8') as file:
                file.write(str(self.soup))
        else:
            print('No changes in %s' % self.filename)
def process_recursively(directory):
    """Process every .html file, then every .css file, under `directory`.

    Both searches descend into subdirectories. All HTML files are handled
    (via processHtml) before any CSS file is handled (via processCss).
    """
    html_pattern = '{}/**/*.html'.format(directory)
    css_pattern = '{}/**/*.css'.format(directory)

    for html_path in glob.iglob(html_pattern, recursive=True):
        processHtml(html_path)

    for css_path in glob.iglob(css_pattern, recursive=True):
        processCss(css_path)
def processHtml(filename):
    """Process one .html path: un-rename a font/icon file or rewrite a page.

    wget seems to unnecessarily add the html extension to some fonts/icons,
    which messes with our processing. A path containing '.eot?', '.ttf?' or
    '.woff?' is therefore just renamed to drop its last extension; any
    other path is handed to HtmlFile for link rewriting.
    """
    font_markers = ('.eot?', '.ttf?', '.woff?')
    if any(marker in filename for marker in font_markers):
        # Rename to remove the extension
        stripped = os.path.splitext(filename)[0]
        print("Renaming %s to %s" % (filename, stripped))
        os.rename(filename, stripped)
        return

    HtmlFile(filename).process()
def processCss(filename):
    """Fix the font reference in one CSS file after the .html suffix removal.

    Replaces every occurrence of 'ausgov.woff%3F92168388.html' with
    'ausgov.woff%3F92168388'. The file is written back only when the
    replacement changed something, so untouched stylesheets keep their
    modification time and exact bytes (matching how HtmlFile.process only
    writes changed files).
    """
    print(filename)

    # Read in the file
    with open(filename, 'r', encoding='utf-8') as file:
        original = file.read()

    updated = original.replace(
        'ausgov.woff%3F92168388.html',
        'ausgov.woff%3F92168388')

    if updated == original:
        return

    with open(filename, 'w', encoding='utf-8') as file:
        file.write(updated)
def process(filename):
    """Dispatch a single file to the HTML or CSS handler by its extension.

    Files whose extension is neither '.html' nor '.css' are ignored.
    """
    suffix = os.path.splitext(filename)[1]
    if suffix == '.css':
        processCss(filename)
        return
    if suffix == '.html':
        processHtml(filename)
if __name__ == '__main__':
    # Script entry point: the single argument is a file or a directory.
    # Only the argv lookup is guarded, so an IndexError raised somewhere
    # inside the processing code is not misreported as a missing argument.
    try:
        arg = sys.argv[1]
    except IndexError:
        print("Please specify file or directory to scan")
        # sys.exit rather than the `exit` helper, which only exists when the
        # `site` module has been loaded.
        sys.exit(1)

    if os.path.isdir(arg):
        process_recursively(arg)
    else:
        process(arg)