Merge pull request #13 from aceberle/master
Fix for timeouts, head not supported, relative links
v3.1.0
ScholliYT committed Jun 5, 2021
2 parents e270db8 + 8f20f14 commit 5461bfe
Showing 22 changed files with 476 additions and 109 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-lint-and-test.yml
@@ -6,7 +6,7 @@ name: Lint and Test Project
on: [push, pull_request]

jobs:
build:
lint-and-test:

runs-on: ubuntu-latest

2 changes: 1 addition & 1 deletion .github/workflows/python-mutation.yml
@@ -6,7 +6,7 @@ name: Mutation Test Project
on: [pull_request]

jobs:
build:
mutation-test:

runs-on: ubuntu-latest

28 changes: 26 additions & 2 deletions README.md
@@ -69,9 +69,19 @@ Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)

### `connect_limit_per_host`

**Optional** By default, the crawler will open as many connections as it needs to make the requests. By setting this value to a number greater than zero (0), the crawler will not exceed that number of connections for each host. This can be useful when crawling a site that has rate limits. (default unlimited).
**Optional** By default, the crawler will open a maximum of 10 connections per host, which can be useful when crawling a site that has rate limits. Setting this value to zero allows an unlimited number of connections per host, but this could inadvertently cause timeout errors if the target server is overwhelmed with connections. (default 10).
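
For example, a minimal sketch of a configuration that throttles connections to a rate-limited host, combined with the `timeout` input added in this release (the URL and values are illustrative only):
```yml
uses: ScholliYT/Broken-Links-Crawler-Action@v3
with:
  website_url: 'https://example.com'
  connect_limit_per_host: 2
  timeout: 30
```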

### `search_attrs`

**Optional** The names of HTML element attributes to extract links from. This can be useful if you are crawling a site that uses a library like [lazyload](https://github.com/tuupola/lazyload) to lazy-load images -- in that case you would want to set `search_attrs` to 'href,src,data-src'. (default 'href,src')
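
A minimal sketch of that lazy-loading scenario (the site URL is illustrative, and the exact attribute name depends on the lazy-loading library in use):
```yml
uses: ScholliYT/Broken-Links-Crawler-Action@v3
with:
  website_url: 'https://example.com'
  search_attrs: 'href,src,data-src'
```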

### `resolve_before_filtering`

**Optional** By default, the crawler applies the includes/excludes filtering criteria to links as they appear in the HTML source. For example, if a link is written as a relative URL in the HTML source, the includes/excludes are applied to the link in its relative form. Setting this value to true makes the crawler fully resolve each link to its absolute form before applying the includes/excludes filtering criteria. If you want to crawl only links that are prefixed with your site ('http://mysite.com/'), set `resolve_before_filtering` to `'true'` and `include_url_prefix` to `'http://mysite.com/'`, as shown in the second example under Example usage below. (default false)

## Example usage

### Basic scan with retry
```yml
uses: ScholliYT/Broken-Links-Crawler-Action@v3
with:
@@ -80,9 +90,23 @@ with:
verbose: 'true'
max_retry_time: 30
max_retries: 5
max_depth: 1
```
### Basic scan with retry, fetching only URLs on the same site
```yml
uses: ScholliYT/Broken-Links-Crawler-Action@v3
with:
website_url: 'https://github.com/ScholliYT/Broken-Links-Crawler-Action'
include_url_prefix: 'https://github.com/ScholliYT/Broken-Links-Crawler-Action'
resolve_before_filtering: 'true'
verbose: 'true'
max_retry_time: 30
max_retries: 5
max_depth: 1
```
## Dev
## Development
The easiest way to run this action locally is to use Docker. Just build a new image and pass the correct environment variables to it.
```
14 changes: 13 additions & 1 deletion action.yml
@@ -54,7 +54,19 @@ inputs:
connect_limit_per_host:
description: 'Limit number of tcp connections per host'
required: false
default: ''
default: '10'
timeout:
description: 'Number of seconds to wait for a request to complete'
required: false
default: '60'
search_attrs:
description: 'Names of element attributes to extract links from'
required: false
default: 'href,src'
resolve_before_filtering:
description: 'Enables absolute link resolution before applying filtering patterns'
required: false
default: 'false'
runs:
using: 'docker'
image: 'Dockerfile'
3 changes: 3 additions & 0 deletions deadseeker/action.py
@@ -34,10 +34,13 @@ def run_action() -> None:
logging.basicConfig(level=verbosity)

config = SeekerConfig()
config.search_attrs = inputvalidator.get_search_attrs()
config.connect_limit_per_host = inputvalidator.get_connect_limit_per_host()
config.timeout = inputvalidator.get_timeout()
config.max_tries = inputvalidator.get_retry_maxtries()
config.max_time = inputvalidator.get_retry_maxtime()
config.alwaysgetonsite = inputvalidator.get_alwaysgetonsite()
config.resolvebeforefilter = inputvalidator.get_resolvebeforefilter()
for inclusion in ['in', 'ex']:
for strategy in ['prefix', 'suffix', 'contained']:
attrname = f'{inclusion}clude{strategy}'
10 changes: 8 additions & 2 deletions deadseeker/clientsession.py
@@ -1,12 +1,14 @@
from .common import SeekerConfig
import aiohttp
import asyncio
import logging
from types import SimpleNamespace
from aiohttp import (
ClientSession,
TraceConfig,
TraceRequestStartParams,
TCPConnector
TCPConnector,
ClientTimeout
)
from aiohttp_retry import RetryClient, ExponentialRetry # type: ignore
from abc import abstractmethod, ABC
@@ -44,10 +46,14 @@ async def _on_request_start(
retry_options = ExponentialRetry(
attempts=config.max_tries,
max_timeout=config.max_time,
exceptions=[aiohttp.ClientError])
exceptions=[
aiohttp.ClientError,
asyncio.TimeoutError
])
return RetryClient(
raise_for_status=True,
connector=connector,
timeout=ClientTimeout(total=config.timeout),
headers={'User-Agent': config.agent},
retry_options=retry_options,
trace_configs=[trace_config])
9 changes: 7 additions & 2 deletions deadseeker/common.py
@@ -1,4 +1,4 @@
from typing import Optional, List
from typing import Optional, List, Set

DEFAULT_WEB_AGENT: str = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)' +\
' AppleWebKit/537.36 (KHTML, like Gecko)' +\
@@ -7,7 +7,9 @@
DEFAULT_RETRY_MAX_TIME: int = 30
DEFAULT_EXCLUDE_PREFIX: List[str] = ['mailto:', 'tel:']
DEFAULT_MAX_DEPTH: int = -1
DEFAULT_CONNECT_LIMIT_PER_HOST: int = 0
DEFAULT_CONNECT_LIMIT_PER_HOST: int = 10
DEFAULT_TIMEOUT: int = 60
DEFAULT_SEARCH_ATTRS: Set[str] = set(['href', 'src'])


class SeekerConfig:
@@ -21,10 +23,13 @@ def __init__(self) -> None:
self.excludesuffix: List[str] = []
self.includecontained: List[str] = []
self.excludecontained: List[str] = []
self.search_attrs: Set[str] = DEFAULT_SEARCH_ATTRS
self.agent: str = DEFAULT_WEB_AGENT
self.alwaysgetonsite: bool = False
self.resolvebeforefilter: bool = False
self.connect_limit_per_host: int = \
DEFAULT_CONNECT_LIMIT_PER_HOST
self.timeout: int = DEFAULT_TIMEOUT


class UrlTarget():
15 changes: 7 additions & 8 deletions deadseeker/deadseeker.py
@@ -1,5 +1,5 @@
import asyncio
from urllib.parse import urlparse, urljoin
from urllib.parse import urljoin
from typing import List, Set, Deque, Optional, Union
from .timer import Timer
import logging
@@ -53,7 +53,8 @@ async def _main(
visited.add(url)
targets.appendleft(UrlTarget(url, url, self.config.max_depth))
linkacceptor = self.linkacceptorfactory.get_link_acceptor(self.config)
linkparser = self.linkparserfactory.get_link_parser(linkacceptor)
linkparser = \
self.linkparserfactory.get_link_parser(self.config, linkacceptor)
responsefetcher = self.responsefetcherfactory.get_response_fetcher(
self.config)
async with self.clientsessionfactory.get_client_session(
@@ -86,16 +87,14 @@ def _parse_response(
resp: UrlFetchResponse) -> None:
depth = resp.urltarget.depth
if resp.html and depth != 0:
home = resp.urltarget.home
links = linkparser.parse(resp.html)
links = linkparser.parse(resp)
base = resp.urltarget.url
for newurl in links:
if not bool(
urlparse(newurl).netloc): # relative link?
newurl = urljoin(resp.urltarget.url, newurl)
newurl = urljoin(base, newurl)
if newurl not in visited:
visited.add(newurl)
targets.appendleft(
UrlTarget(home, newurl, depth - 1))
UrlTarget(resp.urltarget.home, newurl, depth - 1))

def seek(
self,
19 changes: 17 additions & 2 deletions deadseeker/inputvalidator.py
@@ -1,13 +1,15 @@
import validators # type: ignore
from typing import List, Dict, Union, Optional
from typing import List, Dict, Union, Optional, Set
import re
import logging
from deadseeker.common import (
DEFAULT_RETRY_MAX_TRIES,
DEFAULT_RETRY_MAX_TIME,
DEFAULT_SEARCH_ATTRS,
DEFAULT_WEB_AGENT,
DEFAULT_MAX_DEPTH,
DEFAULT_CONNECT_LIMIT_PER_HOST
DEFAULT_CONNECT_LIMIT_PER_HOST,
DEFAULT_TIMEOUT
)


@@ -25,6 +27,12 @@ def get_urls(self) -> List[str]:
f" expected to contain valid url: {url}"
return website_urls

def get_search_attrs(self) -> Set[str]:
search_attrs = self._splitAndTrim('INPUT_SEARCH_ATTRS')
if search_attrs:
return set(search_attrs)
return DEFAULT_SEARCH_ATTRS

def get_retry_maxtries(self) -> int:
return self._numeric('INPUT_MAX_RETRIES', DEFAULT_RETRY_MAX_TRIES)

@@ -38,6 +46,9 @@ def get_connect_limit_per_host(self) -> int:
return self._numeric(
'INPUT_CONNECT_LIMIT_PER_HOST', DEFAULT_CONNECT_LIMIT_PER_HOST)

def get_timeout(self) -> int:
return self._numeric('INPUT_TIMEOUT', DEFAULT_TIMEOUT)

def get_verbosity(self) -> Union[bool, int]:
verboseStr = self.inputs.get('INPUT_VERBOSE')
if(verboseStr):
@@ -54,6 +65,10 @@ def get_verbosity(self) -> Union[bool, int]:
def get_alwaysgetonsite(self) -> bool:
return self._get_boolean(self.inputs.get('INPUT_ALWAYS_GET_ONSITE'))

def get_resolvebeforefilter(self) -> bool:
return self._get_boolean(
self.inputs.get('INPUT_RESOLVE_BEFORE_FILTERING'))

def _get_boolean(self, valueStr: Optional[str]) -> bool:
truepattern = '^t|true|y|yes|on$'
return bool(
46 changes: 34 additions & 12 deletions deadseeker/linkparser.py
@@ -1,43 +1,59 @@
from .linkacceptor import LinkAcceptor
from html.parser import HTMLParser
from urllib.parse import urljoin
from typing import List, Tuple, Optional
import logging
from .common import SeekerConfig, UrlFetchResponse
from abc import abstractmethod, ABC


search_attrs = set(['href', 'src'])
logger = logging.getLogger(__name__)


class LinkParser(ABC):
@abstractmethod # pragma: no mutate
def parse(self, html: str) -> List[str]:
def parse(self, resp: UrlFetchResponse) -> List[str]:
pass


class LinkParserFactory(ABC):
@abstractmethod # pragma: no mutate
def get_link_parser(self, linkacceptor: LinkAcceptor) -> LinkParser:
def get_link_parser(
self,
config: SeekerConfig,
linkacceptor: LinkAcceptor) -> LinkParser:
pass


class DefaultLinkParser(LinkParser):
def __init__(self, linkacceptor: LinkAcceptor) -> None:
def __init__(
self,
config: SeekerConfig,
linkacceptor: LinkAcceptor) -> None:
self.config = config
self.linkacceptor = linkacceptor

def parse(self, html: str) -> List[str]:
parser = LinkHtmlParser(self.linkacceptor)
parser.feed(html)
def parse(self, resp: UrlFetchResponse) -> List[str]:
parser = LinkHtmlParser(resp, self.config, self.linkacceptor)
parser.parse()
return parser.links


class DefaultLinkParserFactory(LinkParserFactory):
def get_link_parser(self, linkacceptor: LinkAcceptor) -> LinkParser:
return DefaultLinkParser(linkacceptor)
def get_link_parser(
self,
config: SeekerConfig,
linkacceptor: LinkAcceptor) -> LinkParser:
return DefaultLinkParser(config, linkacceptor)


class LinkHtmlParser(HTMLParser):
def __init__(self, linkacceptor: LinkAcceptor):
def __init__(
self,
resp: UrlFetchResponse,
config: SeekerConfig,
linkacceptor: LinkAcceptor):
self.resp = resp
self.config = config
self.linkacceptor = linkacceptor
self.links: List[str] = list()
super().__init__()
@@ -51,11 +67,17 @@ def handle_starttag(
'''Override parent method and check tag for our attributes'''
for attr in attrs:
# ('href', 'http://google.com')
if attr[0] in search_attrs:
if attr[0] in self.config.search_attrs:
url = attr[1]
if url:
if self.config.resolvebeforefilter:
url = urljoin(self.resp.urltarget.url, url)
if self.linkacceptor.accepts(url):
logger.debug(f'Accepting url: {url}')
self.links.append(url)
else:
logger.debug(f'Skipping url: {url}')

def parse(self) -> None:
if self.resp.html:
super().feed(self.resp.html)