diff --git a/external_resources/api.py b/external_resources/api.py index a37670cb1..e7bedb299 100644 --- a/external_resources/api.py +++ b/external_resources/api.py @@ -8,6 +8,8 @@ from external_resources.constants import ( RESOURCE_BROKEN_STATUS_END, RESOURCE_BROKEN_STATUS_START, + USER_AGENT_STRING, + USER_AGENT_TIMEOUT, ) from external_resources.exceptions import CheckFailedError from websites.models import WebsiteContent @@ -26,15 +28,10 @@ def is_url_broken(url: str) -> tuple[bool, Optional[int]]: response = requests.head( url, allow_redirects=True, - timeout=30, + timeout=USER_AGENT_TIMEOUT, headers={ "Accept": "*/*", - "User-Agent": ( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/114.0.0.0 " - "Safari/537.36" - ), + "User-Agent": USER_AGENT_STRING, }, ) except Exception as ex: diff --git a/external_resources/constants.py b/external_resources/constants.py index 0f72673d6..ad5bdac2d 100644 --- a/external_resources/constants.py +++ b/external_resources/constants.py @@ -25,3 +25,18 @@ # constants for Celery task EXTERNAL_RESOURCE_TASK_RATE_LIMIT = "100/s" EXTERNAL_RESOURCE_TASK_PRIORITY = 4 # Lowest priority from range (0 - 4) + +# constants for user agent +USER_AGENT_STRING = ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/114.0.0.0 " + "Safari/537.36" +) + +USER_AGENT_TIMEOUT = 30 + +# metadata fields +METADATA_IS_BROKEN = "is_broken" +METADATA_URL_STATUS_CODE = "url_status_code" +METADATA_BACKUP_URL_STATUS_CODE = "backup_url_status_code" diff --git a/external_resources/tasks.py b/external_resources/tasks.py index b9cdfefac..87e435215 100644 --- a/external_resources/tasks.py +++ b/external_resources/tasks.py @@ -10,6 +10,9 @@ from external_resources.constants import ( EXTERNAL_RESOURCE_TASK_PRIORITY, EXTERNAL_RESOURCE_TASK_RATE_LIMIT, + METADATA_BACKUP_URL_STATUS_CODE, + METADATA_IS_BROKEN, + METADATA_URL_STATUS_CODE, RESOURCE_UNCHECKED_STATUSES, ) from external_resources.exceptions import CheckFailedError @@ -58,6 +61,10 @@ def check_external_resources(resources: list[int]): log.debug(ex) state.status = ExternalResourceState.Status.CHECK_FAILED else: + # Update the metadata of the resource with the status codes + resource.metadata[METADATA_URL_STATUS_CODE] = url_status + resource.metadata[METADATA_BACKUP_URL_STATUS_CODE] = backup_url_status + resource.save() # Status and flag should be updated if codes are not in ignored cases if ( url_status not in RESOURCE_UNCHECKED_STATUSES @@ -73,8 +80,8 @@ def check_external_resources(resources: list[int]): # Either external_url or backup_url is valid. state.status = ExternalResourceState.Status.VALID - if resource.metadata.get("is_broken") != is_broken: - resource.metadata["is_broken"] = is_broken + if resource.metadata.get(METADATA_IS_BROKEN) != is_broken: + resource.metadata[METADATA_IS_BROKEN] = is_broken resource.save() finally: state.last_checked = timezone.now()