Skip to content

Commit

Permalink
Save status codes for external resource link checking (#2310)
Browse files (browse the repository at this point in the history)
* Save status codes for external resource link checking

* Refactor to use constants for metadata fields
  • Loading branch information
pt2302 committed Sep 18, 2024
1 parent 655e287 commit 9b50303
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 9 deletions.
11 changes: 4 additions & 7 deletions external_resources/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from external_resources.constants import (
RESOURCE_BROKEN_STATUS_END,
RESOURCE_BROKEN_STATUS_START,
USER_AGENT_STRING,
USER_AGENT_TIMEOUT,
)
from external_resources.exceptions import CheckFailedError
from websites.models import WebsiteContent
Expand All @@ -26,15 +28,10 @@ def is_url_broken(url: str) -> tuple[bool, Optional[int]]:
response = requests.head(
url,
allow_redirects=True,
timeout=30,
timeout=USER_AGENT_TIMEOUT,
headers={
"Accept": "*/*",
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/114.0.0.0 "
"Safari/537.36"
),
"User-Agent": USER_AGENT_STRING,
},
)
except Exception as ex:
Expand Down
15 changes: 15 additions & 0 deletions external_resources/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,18 @@
# Celery task settings
EXTERNAL_RESOURCE_TASK_RATE_LIMIT = "100/s"
EXTERNAL_RESOURCE_TASK_PRIORITY = 4  # 4 is the lowest priority in the 0 - 4 range

# Settings for the HTTP request issued when checking a URL.
# The user agent is assembled from its space-separated product tokens.
USER_AGENT_STRING = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/114.0.0.0",
        "Safari/537.36",
    )
)
# Passed as the `timeout` argument of the link-check request
# (presumably seconds, per the requests library convention).
USER_AGENT_TIMEOUT = 30

# Keys stored in a resource's metadata
METADATA_IS_BROKEN = "is_broken"
METADATA_URL_STATUS_CODE = "url_status_code"
METADATA_BACKUP_URL_STATUS_CODE = "backup_url_status_code"
11 changes: 9 additions & 2 deletions external_resources/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
from external_resources.constants import (
EXTERNAL_RESOURCE_TASK_PRIORITY,
EXTERNAL_RESOURCE_TASK_RATE_LIMIT,
METADATA_BACKUP_URL_STATUS_CODE,
METADATA_IS_BROKEN,
METADATA_URL_STATUS_CODE,
RESOURCE_UNCHECKED_STATUSES,
)
from external_resources.exceptions import CheckFailedError
Expand Down Expand Up @@ -58,6 +61,10 @@ def check_external_resources(resources: list[int]):
log.debug(ex)
state.status = ExternalResourceState.Status.CHECK_FAILED
else:
# Update the metadata of the resource with the status codes
resource.metadata[METADATA_URL_STATUS_CODE] = url_status
resource.metadata[METADATA_BACKUP_URL_STATUS_CODE] = backup_url_status
resource.save()
# Status and flag should be updated if codes are not in ignored cases
if (
url_status not in RESOURCE_UNCHECKED_STATUSES
Expand All @@ -73,8 +80,8 @@ def check_external_resources(resources: list[int]):
# Either external_url or backup_url is valid.
state.status = ExternalResourceState.Status.VALID

if resource.metadata.get("is_broken") != is_broken:
resource.metadata["is_broken"] = is_broken
if resource.metadata.get(METADATA_IS_BROKEN) != is_broken:
resource.metadata[METADATA_IS_BROKEN] = is_broken
resource.save()
finally:
state.last_checked = timezone.now()
Expand Down

0 comments on commit 9b50303

Please sign in to comment.