diff --git a/.gitignore b/.gitignore
index b6e4761..3d553d5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,3 +127,7 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# Sea Spider custom ignore rules
+data/
+config.json
diff --git a/README.md b/README.md
index 3804e13..56c2ec5 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,21 @@ A humble SEO spider and link checker
 
 # Usage
+## Initial setup
 ```
 pip install -r requirements.txt
+python setup.py
+```
+
+## Basic example
+```
 python seaspider.py https://google.com
 ```
 
 ![Usage example: checking all links on a given web page](Usage-example-screen-recording.gif)
 
+## Example with domain restriction
+You can limit crawling to a specific domain by providing a second parameter, the domain name.
+```
+python seaspider.py https://google.com google.com
+```
diff --git a/config-sample.json b/config-sample.json
new file mode 100644
index 0000000..0992b3d
--- /dev/null
+++ b/config-sample.json
@@ -0,0 +1,6 @@
+{
+    "allow_outside_starting_domain": false,
+    "max_crawl_count": 2000,
+    "max_crawl_depth": 3,
+    "origin_domain": "example.com"
+}
\ No newline at end of file
diff --git a/find_errors.py b/find_errors.py
new file mode 100644
index 0000000..51a0310
--- /dev/null
+++ b/find_errors.py
@@ -0,0 +1,28 @@
+import glob
+import json
+
+def find_errors():
+    ignore_list = ['data/url_id_map.json']
+    glob_pattern = 'data/*.json'
+    item_count = 0
+    ok_count = 0
+    problem_count = 0
+
+    for item in glob.glob(glob_pattern):
+        with open(item, 'r') as infile:
+            json_data = json.load(infile)
+
+        if 'id' in json_data.keys():
+            item_count += 1
+            response_code = int(json_data['response_code'])
+            url = json_data['url']
+
+            if response_code == 200:
+                ok_count += 1
+            else:
+                problem_count += 1
+
+                print(response_code, ' ', url)
+
+    print('Statistics:\nTotal items: ', item_count, '\nHealthy signals: ', \
+        ok_count, '\nProblems: ', problem_count)
diff --git a/requirements.txt b/requirements.txt
index 577d67f..b5a91ce 100644
Binary files a/requirements.txt and b/requirements.txt differ
diff --git a/seaspider.py b/seaspider.py
index de4afff..a50855f 100644
--- a/seaspider.py
+++ b/seaspider.py
@@ -1,50 +1,115 @@
 import bs4
+import find_errors
+import glob
+import json
 import re
 import requests
 import sys
 import time
-from tqdm import tqdm
-
-def check_url(url):
-    url_check_result = {}
-    r = requests.get(url, headers={'User-Agent': 'Sea'})
-    url_check_result['status_code'] = r.status_code
-    print('\n', url_check_result['status_code'], ' ', url)
-    return url_check_result
-
-def crawl_target(target_url):
-    crawl_result = {}
-    r = requests.get(target_url, headers={'User-Agent': 'Sea'})
-    crawl_result['status_code'] = r.status_code
-    crawl_result['text'] = r.text
-    return crawl_result
 
-def main():
-    crawl_queue = {}
+def crawl_recursively(url, depth=1):
+    url = url.split('#', 1)[0]
+    max_crawl_depth = get_config_value('max_crawl_depth')
+
+    if depth <= max_crawl_depth:
+        crawl_target(url)
+        url_id = get_url_id(url)
 
-    if len(sys.argv) < 2:
-        print('[ERROR] No target URL supplied. Please provide a URL for seaspider to crawl.')
-    else:
-        target_url = sys.argv[1]
-        crawl_result = crawl_target(target_url)
-        print(crawl_result['status_code'], ' ', target_url)
-        soup = bs4.BeautifulSoup(crawl_result['text'], features='html.parser')
-        links = soup.findAll('a', attrs={'href': re.compile('^https?://')})
-        print(len(links), ' links detected')
+        with open('data/' + str(url_id) + '.json') as crawl_file:
+            crawl_json = json.load(crawl_file)
+
+        crawl_html = crawl_json['text']
+        links = extract_links_from_html(crawl_html)
 
         for link in links:
-            url = link.get('href')
-
-            if not url in crawl_queue.keys():
-                crawl_queue[url] = {}
+            crawl_recursively(link, depth + 1)
+
+def crawl_target(url):
+    url_id = get_url_id(url)
+    crawl_file_name_pattern = 'data/' + str(url_id) + '.json'
+    crawl_file_exists = len(glob.glob(crawl_file_name_pattern)) > 0
+
+    if not crawl_file_exists:
+        print('Crawling: ', url)
+        r = requests.get(url, headers={'User-Agent': 'Sea'})
+        crawl_result = {
+            "id": url_id,
+            "url": url,
+            "response_code": r.status_code,
+            "timestamp_float": time.time(),
+            "text": r.text
+        }
+
+        with open(crawl_file_name_pattern, 'w') as outfile:
+            json.dump(crawl_result, outfile, indent=4)
+
+def extract_links_from_html(html):
+    allow_outside_starting_domain = get_config_value('allow_outside_starting_domain')
+    origin_domain = get_config_value('origin_domain')
+    soup = bs4.BeautifulSoup(html, features='html.parser')
+    pattern = '^https?://'
+
+    if not allow_outside_starting_domain:
+        pattern += origin_domain
+
+    links = soup.findAll('a', attrs={'href': re.compile(pattern)})
+    links_list = []
+
+    for link in links:
+        url = link.get('href')
+        links_list.append(url)
+
+    return links_list
+
+def get_max_url_id():
+    if len(glob.glob('data/url_id_map.json')) > 0:
+        with open('data/url_id_map.json') as url_id_map_file:
+            url_id_map = json.load(url_id_map_file)
 
-        for key in crawl_queue.keys():
-            print(key)
+        max_id = 0
 
-        progress_bar_label = 'Crawling ' + str(len(crawl_queue)) + ' URLs'
+        for url_id in url_id_map.keys():
+            if int(url_id) > max_id:
+                max_id = int(url_id)
+
+        return max_id
+    else:
+        return 0
+
+def get_url_id(url):
+    if len(glob.glob('data/url_id_map.json')) > 0:
+        with open('data/url_id_map.json', 'r') as url_id_map_file:
+            url_id_map = json.load(url_id_map_file)
+
+        for url_id in url_id_map.keys():
+            if url_id_map[url_id]['url'] == url:
+                return url_id
+
+    new_url_id = get_max_url_id() + 1
+    register_new_url_id(new_url_id, url)
+    return new_url_id
+
+def get_config_value(key):
+    with open('config.json', 'r') as config_file:
+        config_json = json.load(config_file)
+
+    return config_json[key]
+
+def register_new_url_id(id, url):
+    if len(glob.glob('data/url_id_map.json')) > 0:
+        with open('data/url_id_map.json', 'r') as url_id_map_file:
+            url_id_map = json.load(url_id_map_file)
+    else:
+        url_id_map = {}
+
+    url_id_map[id] = {'url': url}
+
+    with open('data/url_id_map.json', 'w') as url_id_map_file:
+        json.dump(url_id_map, url_id_map_file, indent=4)
+
+def main():
+    origin_url = 'https://' + get_config_value('origin_domain')
+    crawl_recursively(origin_url)
+    find_errors.find_errors()
 
-        for key in tqdm(crawl_queue.keys(), desc=progress_bar_label):
-            crawl_queue[key]['crawl_result'] = check_url(key)
-            time.sleep(0.1)
-
 main()