seaspider.py

"""A small recursive web crawler: behavior is driven by config.json and every fetched page is cached under data/."""

from csv import reader
import glob
import json
import logging
import os
import re
import time

import bs4
import requests

import find_errors
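
# The crawler is driven entirely by config.json in the working directory. A
# minimal sketch of that file, with keys inferred from the get_config_value()
# calls below (the sample values are illustrative assumptions, not defaults):
#
# {
#     "operation_mode": "domain_scan",
#     "origin_domain": "example.com",
#     "allow_outside_starting_domain": false,
#     "max_crawl_depth": 3,
#     "delay_between_crawls": 1,
#     "csv_file_path": "urls.csv"
# }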

def crawl_csv_url_list():
    """Crawl every URL found in the first column of the configured CSV file."""
    logging.info('Reading data from CSV')
    csv_file_path = get_config_value('csv_file_path')
    if len(glob.glob(csv_file_path)) > 0:
        with open(csv_file_path, 'r') as read_obj:
            csv_reader = reader(read_obj)
            for row in csv_reader:
                crawl_recursively(row[0], depth=1)
    else:
        log_error_and_crash_with_message(
            'Could not find file: ' + csv_file_path
        )

def crawl_from_origin_url():
    """Start a domain-wide crawl at the configured origin domain."""
    url = get_config_value('origin_domain')
    origin_url = 'https://' + url
    logging.info('Performing domain-wide scan on: ' + origin_url)
    crawl_recursively(origin_url, depth=1)

def crawl_recursively(url, depth=1):
    """Crawl url, then recurse into its outbound links up to max_crawl_depth."""
    url = url.split('#', 1)[0]  # Drop any #fragment; it never changes the page fetched.
    max_crawl_depth = get_config_value('max_crawl_depth')
    if depth <= max_crawl_depth:
        crawl_target(url)
        url_id = get_url_id(url)
        with open('data/' + str(url_id) + '.json') as crawl_file:
            crawl_json = json.load(crawl_file)
        crawl_html = crawl_json['text']
        links = extract_links_from_html(crawl_html)
        for link in links:
            crawl_recursively(link, depth + 1)

def crawl_target(url):
    """Fetch url and cache the response on disk, skipping URLs already cached."""
    logging.debug('Considering crawl target: ' + url)
    url_id = get_url_id(url)
    crawl_file_name_pattern = 'data/' + str(url_id) + '.json'
    crawl_file_exists = len(glob.glob(crawl_file_name_pattern)) > 0
    if not crawl_file_exists:
        print('Crawling: ', url)
        r = requests.get(url, headers={'User-Agent': 'Sea'})
        logging.info(
            'Python requests call completed: ' + str(r.status_code) + ' ' + url
        )
        delay_between_crawls = get_config_value('delay_between_crawls')
        time.sleep(delay_between_crawls)
        crawl_result = {
            "id": url_id,
            "url": url,
            "response_code": r.status_code,
            "timestamp_float": time.time(),
            "text": r.text
        }
        with open(crawl_file_name_pattern, 'w') as outfile:
            json.dump(crawl_result, outfile, indent=4)
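
# Each response is cached as data/<url_id>.json; crawl_recursively() reloads
# the 'text' field from this record instead of re-fetching the page. An
# illustrative record (values are examples, not real output):
#
# {
#     "id": 1,
#     "url": "https://example.com",
#     "response_code": 200,
#     "timestamp_float": 1700000000.0,
#     "text": "<!doctype html>..."
# }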

def create_empty_log_file(log_file_directory, log_file_filename):
    """Make sure the log directory and an (empty) log file exist."""
    log_file_path = f"{log_file_directory}/{log_file_filename}"
    if not os.path.exists(log_file_directory):
        os.mkdir(log_file_directory)
    if not os.path.exists(log_file_path):
        open(log_file_path, 'a').close()

def extract_links_from_html(html):
    """Return the href of every absolute link in html, restricted to the
    origin domain unless allow_outside_starting_domain is set."""
    allow_outside = get_config_value('allow_outside_starting_domain')
    origin_domain = get_config_value('origin_domain')
    soup = bs4.BeautifulSoup(html, features='html.parser')
    pattern = '^https?://'
    if not allow_outside:
        # Escape the domain so regex metacharacters such as '.' match literally.
        pattern += re.escape(origin_domain)
    links = soup.find_all('a', attrs={'href': re.compile(pattern)})
    links_list = []
    for link in links:
        links_list.append(link.get('href'))
    return links_list

def get_max_url_id():
    if len(glob.glob('data/url_id_map.json')) > 0:
        with open('data/url_id_map.json') as url_id_map_file:
            url_id_map = json.load(url_id_map_file)
        max_id = 0
        for url_id in url_id_map.keys():
            if int(url_id) > max_id:
                max_id = int(url_id)
        return max_id
    else:
        return 0

def get_url_id(url):
    """Return the ID registered for url, registering a new one if necessary."""
    if len(glob.glob('data/url_id_map.json')) > 0:
        with open('data/url_id_map.json', 'r') as url_id_map_file:
            url_id_map = json.load(url_id_map_file)
        for url_id in url_id_map.keys():
            if url_id_map[url_id]['url'] == url:
                return int(url_id)  # JSON keys are strings; normalize to int.
    new_url_id = get_max_url_id() + 1
    register_new_url_id(new_url_id, url)
    return new_url_id

def get_config_value(key):
    with open('config.json', 'r') as config_file:
        config_json = json.load(config_file)
        return config_json[key]

def log_error_and_crash_with_message(message):
    logging.error(message)
    raise ValueError(message)

def register_new_url_id(new_id, url):
    """Record a new ID -> URL mapping in data/url_id_map.json."""
    if len(glob.glob('data/url_id_map.json')) > 0:
        with open('data/url_id_map.json', 'r') as url_id_map_file:
            url_id_map = json.load(url_id_map_file)
    else:
        url_id_map = {}
    url_id_map[new_id] = {'url': url}  # 'new_id' avoids shadowing the built-in id().
    with open('data/url_id_map.json', 'w') as url_id_map_file:
        json.dump(url_id_map, url_id_map_file, indent=4)
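
# The on-disk ID map ends up shaped like the sketch below (values
# illustrative). Because JSON object keys are always strings, the IDs come
# back as strings on reload, which is why get_url_id() casts them to int:
#
# {
#     "1": {"url": "https://example.com"},
#     "2": {"url": "https://example.com/about"}
# }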

def validate_config_file():
    """Fail fast when domain-restricted crawling is on but no domain is set."""
    allow_outside_starting_domain = get_config_value('allow_outside_starting_domain')
    origin_domain = get_config_value('origin_domain')
    if allow_outside_starting_domain:
        logging.warning('Scan mode allows crawling outside origin domain')
    elif not origin_domain:
        log_error_and_crash_with_message(
            'Domain restriction active but no domain filter set'
        )

def main():
    log_file_directory = 'data'
    log_file_filename = 'seaspider.log'
    log_file_path = f"{log_file_directory}/{log_file_filename}"
    create_empty_log_file(log_file_directory, log_file_filename)
    logging.basicConfig(
        format='%(asctime)s %(levelname)-8s %(message)s',
        level=logging.WARNING,
        datefmt='%Y-%m-%d %H:%M:%S',
        filename=log_file_path
    )
    validate_config_file()
    operation_mode = get_config_value('operation_mode')
    if operation_mode == 'domain_scan':
        crawl_from_origin_url()
    elif operation_mode == 'csv':
        crawl_csv_url_list()
    else:
        log_error_and_crash_with_message(
            'Operation mode unrecognized: ' + operation_mode +
            "\nPlease use 'domain_scan' or 'csv' as the "
            'operation_mode value and try again.'
        )
    find_errors.find_errors()

if __name__ == '__main__':
    main()
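
# Usage sketch: run from a directory containing config.json (see the sample
# near the top of this file); the data/ directory and log file are created
# on first run by create_empty_log_file():
#
#     $ python seaspider.py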