Merge pull request #13 from aceberle/master
Fix for timeouts, head not supported, relative links
v3.1.0
ScholliYT committed Jun 5, 2021
2 parents e270db8 + 8f20f14 commit 5461bfe
Showing 22 changed files with 476 additions and 109 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-lint-and-test.yml
@@ -6,7 +6,7 @@ name: Lint and Test Project
on: [push, pull_request]

jobs:
build:
lint-and-test:

runs-on: ubuntu-latest

2 changes: 1 addition & 1 deletion .github/workflows/python-mutation.yml
@@ -6,7 +6,7 @@ name: Mutation Test Project
on: [pull_request]

jobs:
build:
mutation-test:

runs-on: ubuntu-latest

28 changes: 26 additions & 2 deletions README.md
@@ -69,9 +69,19 @@ Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)

### `connect_limit_per_host`

**Optional** By default, the crawler will open as many connections as it needs to make the requests. By setting this value to a number greater than zero (0), the crawler will not exceed that number of connections for each host. This can be useful when crawling a site that has rate limits. (default unlimited).
**Optional** By default, the crawler will open a maximum of 10 connections per host, which can be useful when crawling a site that has rate limits. Setting this value to zero allows an unlimited number of connections per host, but this could inadvertently cause timeout errors if the target server is overwhelmed with connections. (default 10).
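
For example, a minimal sketch of a configuration that throttles connections to a rate-limited host, combined with the `timeout` input added in this release (the URL and values are illustrative only):
```yml
uses: ScholliYT/Broken-Links-Crawler-Action@v3
with:
  website_url: 'https://example.com'
  connect_limit_per_host: 2
  timeout: 30
```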

### `search_attrs`

**Optional** The names of HTML element attributes to extract links from. This can be useful if you are crawling a site that uses a library like [lazyload](https://github.com/tuupola/lazyload) to lazy-load images -- in that case you would want to set `search_attrs` to 'href,src,data-src'. (default 'href,src')
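
A minimal sketch of that lazy-loading scenario (the site URL is illustrative, and the exact attribute name depends on the lazy-loading library in use):
```yml
uses: ScholliYT/Broken-Links-Crawler-Action@v3
with:
  website_url: 'https://example.com'
  search_attrs: 'href,src,data-src'
```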

### `resolve_before_filtering`

**Optional** By default, the crawler applies the includes/excludes filtering criteria to links as they appear in the HTML source. For example, if a link is written as a relative URL in the HTML source, the includes/excludes are applied to the link in its relative form. Setting this value to true makes the crawler fully resolve each link to its absolute form before applying the includes/excludes filtering criteria. If you want to crawl only links that are prefixed with your site ('http://mysite.com/'), set `resolve_before_filtering` to `'true'` and `include_url_prefix` to `'http://mysite.com/'`, as shown in the second example under Example usage below. (default false)

## Example usage

### Basic scan with retry
```yml
uses: ScholliYT/Broken-Links-Crawler-Action@v3
with:
@@ -80,9 +90,23 @@ with:
verbose: 'true'
max_retry_time: 30
max_retries: 5
max_depth: 1
```
### Basic scan with retry, fetching only URLs on the same site
```yml
uses: ScholliYT/Broken-Links-Crawler-Action@v3
with:
website_url: 'https://github.com/ScholliYT/Broken-Links-Crawler-Action'
include_url_prefix: 'https://github.com/ScholliYT/Broken-Links-Crawler-Action'
resolve_before_filtering: 'true'
verbose: 'true'
max_retry_time: 30
max_retries: 5
max_depth: 1
```
## Dev
## Development
The easiest way to run this action locally is to use Docker. Just build a new image and pass the correct environment variables to it.
```
14 changes: 13 additions & 1 deletion action.yml
@@ -54,7 +54,19 @@ inputs:
connect_limit_per_host:
description: 'Limit number of tcp connections per host'
required: false
default: ''
default: '10'
timeout:
description: 'Number of seconds to wait for a request to complete'
required: false
default: '60'
search_attrs:
description: 'Names of element attributes to extract links from'
required: false
default: 'href,src'
resolve_before_filtering:
description: 'Enables absolute link resolution before applying filtering patterns'
required: false
default: 'false'
runs:
using: 'docker'
image: 'Dockerfile'
3 changes: 3 additions & 0 deletions deadseeker/action.py
@@ -34,10 +34,13 @@ def run_action() -> None:
logging.basicConfig(level=verbosity)

config = SeekerConfig()
config.search_attrs = inputvalidator.get_search_attrs()
config.connect_limit_per_host = inputvalidator.get_connect_limit_per_host()
config.timeout = inputvalidator.get_timeout()
config.max_tries = inputvalidator.get_retry_maxtries()
config.max_time = inputvalidator.get_retry_maxtime()
config.alwaysgetonsite = inputvalidator.get_alwaysgetonsite()
config.resolvebeforefilter = inputvalidator.get_resolvebeforefilter()
for inclusion in ['in', 'ex']:
for strategy in ['prefix', 'suffix', 'contained']:
attrname = f'{inclusion}clude{strategy}'
10 changes: 8 additions & 2 deletions deadseeker/clientsession.py
@@ -1,12 +1,14 @@
from .common import SeekerConfig
import aiohttp
import asyncio
import logging
from types import SimpleNamespace
from aiohttp import (
ClientSession,
TraceConfig,
TraceRequestStartParams,
TCPConnector
TCPConnector,
ClientTimeout
)
from aiohttp_retry import RetryClient, ExponentialRetry # type: ignore
from abc import abstractmethod, ABC
@@ -44,10 +46,14 @@ async def _on_request_start(
retry_options = ExponentialRetry(
attempts=config.max_tries,
max_timeout=config.max_time,
exceptions=[aiohttp.ClientError])
exceptions=[
aiohttp.ClientError,
asyncio.TimeoutError
])
return RetryClient(
raise_for_status=True,
connector=connector,
timeout=ClientTimeout(total=config.timeout),
headers={'User-Agent': config.agent},
retry_options=retry_options,
trace_configs=[trace_config])
9 changes: 7 additions & 2 deletions deadseeker/common.py
@@ -1,4 +1,4 @@
from typing import Optional, List
from typing import Optional, List, Set

DEFAULT_WEB_AGENT: str = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)' +\
' AppleWebKit/537.36 (KHTML, like Gecko)' +\
@@ -7,7 +7,9 @@
DEFAULT_RETRY_MAX_TIME: int = 30
DEFAULT_EXCLUDE_PREFIX: List[str] = ['mailto:', 'tel:']
DEFAULT_MAX_DEPTH: int = -1
DEFAULT_CONNECT_LIMIT_PER_HOST: int = 0
DEFAULT_CONNECT_LIMIT_PER_HOST: int = 10
DEFAULT_TIMEOUT: int = 60
DEFAULT_SEARCH_ATTRS: Set[str] = set(['href', 'src'])


class SeekerConfig:
@@ -21,10 +23,13 @@ def __init__(self) -> None:
self.excludesuffix: List[str] = []
self.includecontained: List[str] = []
self.excludecontained: List[str] = []
self.search_attrs: Set[str] = DEFAULT_SEARCH_ATTRS
self.agent: str = DEFAULT_WEB_AGENT
self.alwaysgetonsite: bool = False
self.resolvebeforefilter: bool = False
self.connect_limit_per_host: int = \
DEFAULT_CONNECT_LIMIT_PER_HOST
self.timeout: int = DEFAULT_TIMEOUT


class UrlTarget():
15 changes: 7 additions & 8 deletions deadseeker/deadseeker.py
@@ -1,5 +1,5 @@
import asyncio
from urllib.parse import urlparse, urljoin
from urllib.parse import urljoin
from typing import List, Set, Deque, Optional, Union
from .timer import Timer
import logging
@@ -53,7 +53,8 @@ async def _main(
visited.add(url)
targets.appendleft(UrlTarget(url, url, self.config.max_depth))
linkacceptor = self.linkacceptorfactory.get_link_acceptor(self.config)
linkparser = self.linkparserfactory.get_link_parser(linkacceptor)
linkparser = \
self.linkparserfactory.get_link_parser(self.config, linkacceptor)
responsefetcher = self.responsefetcherfactory.get_response_fetcher(
self.config)
async with self.clientsessionfactory.get_client_session(
@@ -86,16 +87,14 @@ def _parse_response(
resp: UrlFetchResponse) -> None:
depth = resp.urltarget.depth
if resp.html and depth != 0:
home = resp.urltarget.home
links = linkparser.parse(resp.html)
links = linkparser.parse(resp)
base = resp.urltarget.url
for newurl in links:
if not bool(
urlparse(newurl).netloc): # relative link?
newurl = urljoin(resp.urltarget.url, newurl)
newurl = urljoin(base, newurl)
if newurl not in visited:
visited.add(newurl)
targets.appendleft(
UrlTarget(home, newurl, depth - 1))
UrlTarget(resp.urltarget.home, newurl, depth - 1))

def seek(
self,
19 changes: 17 additions & 2 deletions deadseeker/inputvalidator.py
@@ -1,13 +1,15 @@
import validators # type: ignore
from typing import List, Dict, Union, Optional
from typing import List, Dict, Union, Optional, Set
import re
import logging
from deadseeker.common import (
DEFAULT_RETRY_MAX_TRIES,
DEFAULT_RETRY_MAX_TIME,
DEFAULT_SEARCH_ATTRS,
DEFAULT_WEB_AGENT,
DEFAULT_MAX_DEPTH,
DEFAULT_CONNECT_LIMIT_PER_HOST
DEFAULT_CONNECT_LIMIT_PER_HOST,
DEFAULT_TIMEOUT
)


@@ -25,6 +27,12 @@ def get_urls(self) -> List[str]:
f" expected to contain valid url: {url}"
return website_urls

def get_search_attrs(self) -> Set[str]:
search_attrs = self._splitAndTrim('INPUT_SEARCH_ATTRS')
if search_attrs:
return set(search_attrs)
return DEFAULT_SEARCH_ATTRS

def get_retry_maxtries(self) -> int:
return self._numeric('INPUT_MAX_RETRIES', DEFAULT_RETRY_MAX_TRIES)

@@ -38,6 +46,9 @@ def get_connect_limit_per_host(self) -> int:
return self._numeric(
'INPUT_CONNECT_LIMIT_PER_HOST', DEFAULT_CONNECT_LIMIT_PER_HOST)

def get_timeout(self) -> int:
return self._numeric('INPUT_TIMEOUT', DEFAULT_TIMEOUT)

def get_verbosity(self) -> Union[bool, int]:
verboseStr = self.inputs.get('INPUT_VERBOSE')
if(verboseStr):
@@ -54,6 +65,10 @@ def get_verbosity(self) -> Union[bool, int]:
def get_alwaysgetonsite(self) -> bool:
return self._get_boolean(self.inputs.get('INPUT_ALWAYS_GET_ONSITE'))

def get_resolvebeforefilter(self) -> bool:
return self._get_boolean(
self.inputs.get('INPUT_RESOLVE_BEFORE_FILTERING'))

def _get_boolean(self, valueStr: Optional[str]) -> bool:
truepattern = '^t|true|y|yes|on$'
return bool(
46 changes: 34 additions & 12 deletions deadseeker/linkparser.py
@@ -1,43 +1,59 @@
from .linkacceptor import LinkAcceptor
from html.parser import HTMLParser
from urllib.parse import urljoin
from typing import List, Tuple, Optional
import logging
from .common import SeekerConfig, UrlFetchResponse
from abc import abstractmethod, ABC


search_attrs = set(['href', 'src'])
logger = logging.getLogger(__name__)


class LinkParser(ABC):
@abstractmethod # pragma: no mutate
def parse(self, html: str) -> List[str]:
def parse(self, resp: UrlFetchResponse) -> List[str]:
pass


class LinkParserFactory(ABC):
@abstractmethod # pragma: no mutate
def get_link_parser(self, linkacceptor: LinkAcceptor) -> LinkParser:
def get_link_parser(
self,
config: SeekerConfig,
linkacceptor: LinkAcceptor) -> LinkParser:
pass


class DefaultLinkParser(LinkParser):
def __init__(self, linkacceptor: LinkAcceptor) -> None:
def __init__(
self,
config: SeekerConfig,
linkacceptor: LinkAcceptor) -> None:
self.config = config
self.linkacceptor = linkacceptor

def parse(self, html: str) -> List[str]:
parser = LinkHtmlParser(self.linkacceptor)
parser.feed(html)
def parse(self, resp: UrlFetchResponse) -> List[str]:
parser = LinkHtmlParser(resp, self.config, self.linkacceptor)
parser.parse()
return parser.links


class DefaultLinkParserFactory(LinkParserFactory):
def get_link_parser(self, linkacceptor: LinkAcceptor) -> LinkParser:
return DefaultLinkParser(linkacceptor)
def get_link_parser(
self,
config: SeekerConfig,
linkacceptor: LinkAcceptor) -> LinkParser:
return DefaultLinkParser(config, linkacceptor)


class LinkHtmlParser(HTMLParser):
def __init__(self, linkacceptor: LinkAcceptor):
def __init__(
self,
resp: UrlFetchResponse,
config: SeekerConfig,
linkacceptor: LinkAcceptor):
self.resp = resp
self.config = config
self.linkacceptor = linkacceptor
self.links: List[str] = list()
super().__init__()
@@ -51,11 +67,17 @@ def handle_starttag(
'''Override parent method and check tag for our attributes'''
for attr in attrs:
# ('href', 'http://google.com')
if attr[0] in search_attrs:
if attr[0] in self.config.search_attrs:
url = attr[1]
if url:
if self.config.resolvebeforefilter:
url = urljoin(self.resp.urltarget.url, url)
if self.linkacceptor.accepts(url):
logger.debug(f'Accepting url: {url}')
self.links.append(url)
else:
logger.debug(f'Skipping url: {url}')

def parse(self) -> None:
if self.resp.html:
super().feed(self.resp.html)