Skip to content

Commit

Permalink
Add declare incident escalation step
Browse files Browse the repository at this point in the history
  • Loading branch information
matiasb committed Oct 4, 2024
1 parent ac7dc97 commit 3af879a
Show file tree
Hide file tree
Showing 23 changed files with 959 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@
from apps.alerts.models.escalation_policy import EscalationPolicy
from apps.alerts.tasks import (
custom_webhook_result,
declare_incident,
notify_all_task,
notify_group_task,
notify_user_task,
resolve_by_last_step_task,
)
from apps.alerts.utils import is_declare_incident_step_enabled
from apps.schedules.ical_utils import list_users_to_notify_from_ical
from apps.user_management.models import User

Expand Down Expand Up @@ -136,6 +138,7 @@ def execute(self, alert_group: "AlertGroup", reason) -> StepExecutionResultData:
EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: self._escalation_step_notify_if_num_alerts_in_time_window,
EscalationPolicy.STEP_NOTIFY_MULTIPLE_USERS: self._escalation_step_notify_multiple_users,
EscalationPolicy.STEP_NOTIFY_MULTIPLE_USERS_IMPORTANT: self._escalation_step_notify_multiple_users,
EscalationPolicy.STEP_DECLARE_INCIDENT: self._escalation_step_declare_incident,
None: self._escalation_step_not_configured,
}
result = action_map[self.step](alert_group, reason)
Expand Down Expand Up @@ -410,6 +413,32 @@ def _escalation_step_notify_team_members(self, alert_group: "AlertGroup", reason

self._execute_tasks(tasks)

def _escalation_step_declare_incident(self, alert_group: "AlertGroup", _reason: str) -> None:
    """
    Run the "Declare Incident" escalation step for ``alert_group``.

    When the step is enabled for the organization owning the alert group's channel, a single
    ``declare_incident`` task signature is built and passed to ``self._execute_tasks``.
    Otherwise an escalation-failed log record is saved and nothing is scheduled.

    ``_reason`` is accepted for signature parity with the other step handlers and is unused.
    """
    organization = alert_group.channel.organization

    if is_declare_incident_step_enabled(organization=organization):
        incident_signature = declare_incident.signature(
            args=(alert_group.pk,),
            kwargs={"escalation_policy_pk": self.id, "severity": self.severity},
            immutable=True,
        )
        self._execute_tasks([incident_signature])
        return

    # Step is disabled: leave a trace in the alert group log instead of skipping silently.
    failure_record = AlertGroupLogRecord(
        type=AlertGroupLogRecord.TYPE_ESCALATION_FAILED,
        alert_group=alert_group,
        reason="Declare Incident step is not enabled",
        escalation_policy=self.escalation_policy,
        escalation_error_code=AlertGroupLogRecord.ERROR_ESCALATION_DECLARE_INCIDENT_STEP_IS_NOT_ENABLED,
        escalation_policy_step=self.step,
    )
    failure_record.save()

def _escalation_step_notify_if_time(self, alert_group: "AlertGroup", _reason: str) -> StepExecutionResultData:
eta = None

Expand Down
30 changes: 30 additions & 0 deletions engine/apps/alerts/migrations/0060_relatedincident.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Generated by Django 4.2.15 on 2024-10-04 16:38

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    # Creates the table for the RelatedIncident model.

    # Depends on the migrations of both apps whose models are referenced below
    # (user_management.organization, alerts.channelfilter, alerts.alertgroup).
    dependencies = [
        ('user_management', '0022_alter_team_unique_together'),
        ('alerts', '0059_escalationpolicy_severity_and_more'),
    ]

    operations = [
        migrations.CreateModel(
            name='RelatedIncident',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                # Indexed incident identifier, stored as a string of up to 50 characters.
                ('incident_id', models.CharField(db_index=True, max_length=50)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('is_active', models.BooleanField(default=True)),
                # Many-to-many link to alert groups; reachable from AlertGroup as `related_incidents`.
                ('attached_alert_groups', models.ManyToManyField(related_name='related_incidents', to='alerts.alertgroup')),
                # Nullable: deleting the channel filter keeps the row and nulls this reference.
                ('channel_filter', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='related_incidents', to='alerts.channelfilter')),
                # Deleting the organization deletes its related incidents.
                ('organization', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='related_incidents', to='user_management.organization')),
            ],
            options={
                # An incident id may appear only once per organization.
                'unique_together': {('organization', 'incident_id')},
            },
        ),
    ]
1 change: 1 addition & 0 deletions engine/apps/alerts/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from .grafana_alerting_contact_point import GrafanaAlertingContactPoint # noqa: F401
from .invitation import Invitation # noqa: F401
from .maintainable_object import MaintainableObject # noqa: F401
from .related_incident import RelatedIncident # noqa: F401
from .resolution_note import ResolutionNote, ResolutionNoteSlackMessage # noqa: F401
from .user_has_notification import UserHasNotification # noqa: F401
from .user_notification_bundle import BundledNotification, UserNotificationBundle # noqa: F401
2 changes: 2 additions & 0 deletions engine/apps/alerts/models/alert_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
AlertGroupLogRecord,
AlertReceiveChannel,
BundledNotification,
RelatedIncident,
ResolutionNote,
ResolutionNoteSlackMessage,
)
Expand Down Expand Up @@ -193,6 +194,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
acknowledged_by_user: typing.Optional["User"]
alerts: "RelatedManager['Alert']"
bundled_notifications: "RelatedManager['BundledNotification']"
related_incidents: "RelatedManager['RelatedIncident']"
dependent_alert_groups: "RelatedManager['AlertGroup']"
channel: "AlertReceiveChannel"
log_records: "RelatedManager['AlertGroupLogRecord']"
Expand Down
55 changes: 52 additions & 3 deletions engine/apps/alerts/models/alert_group_log_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,24 @@

from apps.alerts import tasks
from apps.alerts.constants import ActionSource
from apps.alerts.incident_appearance.renderers.constants import DEFAULT_BACKUP_TITLE
from apps.alerts.utils import render_relative_timeline
from apps.slack.slack_formatter import SlackFormatter
from common.utils import clean_markup

if typing.TYPE_CHECKING:
from apps.alerts.models import AlertGroup, CustomButton, EscalationPolicy, Invitation
from apps.user_management.models import User
from apps.user_management.models import Organization, User

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


class RelatedIncidentData(typing.TypedDict):
incident_link: typing.Optional[str]
incident_title: str


class AlertGroupLogRecord(models.Model):
alert_group: "AlertGroup"
author: typing.Optional["User"]
Expand Down Expand Up @@ -161,7 +167,9 @@ class AlertGroupLogRecord(models.Model):
ERROR_ESCALATION_TRIGGER_CUSTOM_WEBHOOK_ERROR,
ERROR_ESCALATION_NOTIFY_TEAM_MEMBERS_STEP_IS_NOT_CONFIGURED,
ERROR_ESCALATION_TRIGGER_WEBHOOK_IS_DISABLED,
) = range(20)
ERROR_ESCALATION_DECLARE_INCIDENT_STEP_IS_NOT_ENABLED,
ERROR_ESCALATION_INCIDENT_COULD_NOT_BE_DECLARED,
) = range(22)

type = models.IntegerField(choices=TYPE_CHOICES)

Expand Down Expand Up @@ -225,7 +233,14 @@ class AlertGroupLogRecord(models.Model):
escalation_policy_step = models.IntegerField(null=True, default=None)
step_specific_info = JSONField(null=True, default=None)

STEP_SPECIFIC_INFO_KEYS = ["schedule_name", "custom_button_name", "usergroup_handle", "source_integration_name"]
STEP_SPECIFIC_INFO_KEYS = [
"schedule_name",
"custom_button_name",
"usergroup_handle",
"source_integration_name",
"incident_link",
"incident_title",
]

def _make_log_line_link(self, url, title, html=False, for_slack=False, substitute_with_tag=False):
if html and url:
Expand All @@ -244,6 +259,7 @@ def render_log_line_json(self):
author = self.author.short(organization) if self.author is not None else None
escalation_chain = self.alert_group.channel_filter.escalation_chain if self.alert_group.channel_filter else None
step_info = self.get_step_specific_info()
related_incident = self.render_incident_data_from_step_info(organization, step_info)
escalation_chain_data = (
{
"pk": escalation_chain.public_primary_key,
Expand Down Expand Up @@ -280,6 +296,7 @@ def render_log_line_json(self):
"type": self.type,
"created_at": created_at,
"author": author,
"incident": related_incident,
"escalation_chain": escalation_chain_data,
"schedule": schedule,
"webhook": webhook,
Expand Down Expand Up @@ -425,6 +442,14 @@ def rendered_log_line_action(self, for_slack=False, html=False, substitute_with_
result += f'triggered step "Notify on-call from Schedule {schedule_text}{important_text}"'
elif escalation_policy_step == EscalationPolicy.STEP_REPEAT_ESCALATION_N_TIMES:
result += "escalation started from the beginning"
elif escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT:
organization = self.alert_group.channel.organization
incident_data = self.render_incident_data_from_step_info(organization, step_specific_info)
incident_link = incident_data["incident_link"]
incident_title = incident_data["incident_title"]
tag = "related_incident" if substitute_with_tag else False
incident_text = self._make_log_line_link(incident_link, incident_title, html, for_slack, tag)
result += self.reason + f": {incident_text}"
else:
result += f'triggered step "{EscalationPolicy.get_step_display_name(escalation_policy_step)}"'
elif self.type == AlertGroupLogRecord.TYPE_SILENCE:
Expand Down Expand Up @@ -640,8 +665,32 @@ def rendered_log_line_action(self, for_slack=False, html=False, substitute_with_
result += f"failed to notify User Group{usergroup_handle_text} in Slack"
elif self.escalation_error_code == AlertGroupLogRecord.ERROR_ESCALATION_TRIGGER_WEBHOOK_IS_DISABLED:
result += 'skipped escalation step "Trigger Outgoing Webhook" because it is disabled'
elif (
self.escalation_error_code == AlertGroupLogRecord.ERROR_ESCALATION_DECLARE_INCIDENT_STEP_IS_NOT_ENABLED
):
result += 'skipped escalation step "Declare Incident": step is not enabled'
elif self.escalation_error_code == AlertGroupLogRecord.ERROR_ESCALATION_INCIDENT_COULD_NOT_BE_DECLARED:
result += "failed to declare an Incident"
if self.reason:
result += f": {self.reason}"
return result

def render_incident_data_from_step_info(
    self, organization: "Organization", step_specific_info: dict
) -> RelatedIncidentData | None:
    """
    Turn the incident fields stored in ``step_specific_info`` into link/title data.

    Returns ``None`` when ``step_specific_info`` is empty or lacks either the
    ``"incident_title"`` or the ``"incident_id"`` key. Otherwise returns a dict with:

    - ``"incident_link"``: URL built by ``get_incident_url`` for the stored incident id,
      or ``None`` when the stored id is falsy;
    - ``"incident_title"``: the stored title, or ``DEFAULT_BACKUP_TITLE`` when it is falsy.
    """
    # Imported locally, as in the original code (presumably to avoid a circular import).
    from apps.alerts.models.related_incident import get_incident_url

    if not step_specific_info:
        return None
    if "incident_title" not in step_specific_info or "incident_id" not in step_specific_info:
        return None

    stored_id = step_specific_info["incident_id"]
    link = get_incident_url(organization, stored_id) if stored_id else None
    title = step_specific_info["incident_title"] or DEFAULT_BACKUP_TITLE
    return {"incident_link": link, "incident_title": title}

def get_step_specific_info(self):
step_specific_info = None
# in some cases step_specific_info was saved with using json.dumps
Expand Down
3 changes: 3 additions & 0 deletions engine/apps/alerts/models/escalation_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ class EscalationPolicy(OrderedModel):
STEP_NOTIFY_IF_TIME,
STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW,
STEP_REPEAT_ESCALATION_N_TIMES,
STEP_DECLARE_INCIDENT,
]
# Steps can be stored in db while interacting with internal api
# Includes important versions of default steps
Expand Down Expand Up @@ -218,6 +219,7 @@ class EscalationPolicy(OrderedModel):
STEP_NOTIFY_IF_TIME,
STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW,
STEP_REPEAT_ESCALATION_N_TIMES,
STEP_DECLARE_INCIDENT,
]

PUBLIC_STEP_CHOICES_MAP = {
Expand All @@ -239,6 +241,7 @@ class EscalationPolicy(OrderedModel):
STEP_NOTIFY_IF_TIME: "notify_if_time_from_to",
STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: "notify_if_num_alerts_in_window",
STEP_REPEAT_ESCALATION_N_TIMES: "repeat_escalation",
STEP_DECLARE_INCIDENT: "declare_incident",
}

public_primary_key = models.CharField(
Expand Down
48 changes: 48 additions & 0 deletions engine/apps/alerts/models/related_incident.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import typing
from urllib.parse import urljoin

from django.db import models

from common.constants.plugin_ids import PluginID

if typing.TYPE_CHECKING:
from django.db.models.manager import RelatedManager

from apps.alerts.models import AlertGroup, ChannelFilter
from apps.user_management.models import Organization


def get_incident_url(organization: "Organization", incident_id) -> str:
    """
    Build the URL of an incident page under the organization's Grafana URL.

    Args:
        organization: object exposing ``grafana_url``, the base URL to resolve against.
        incident_id: identifier interpolated as the last path segment.

    Returns:
        ``<grafana_url>/a/<incident plugin id>/incidents/<incident_id>``.
    """
    base_url = organization.grafana_url
    # urljoin() resolves a relative path against the *directory* of the base URL, so a base
    # like "https://host/grafana" (no trailing slash) would silently lose "/grafana".
    # Normalising the trailing slash keeps the sub-path; bases that already end with "/"
    # or have no path produce exactly the same result as before.
    if base_url and not base_url.endswith("/"):
        base_url = f"{base_url}/"
    return urljoin(base_url, f"a/{PluginID.INCIDENT}/incidents/{incident_id}")


class RelatedIncident(models.Model):
    """
    An incident, identified by ``incident_id`` within an organization, together with the
    alert groups attached to it.
    """

    # Type hints for attributes Django resolves at runtime.
    attached_alert_groups: "RelatedManager['AlertGroup']"
    channel_filter: typing.Optional["ChannelFilter"]
    organization: "Organization"

    # Indexed incident identifier; unique per organization (see Meta.unique_together).
    incident_id = models.CharField(db_index=True, max_length=50)
    # Owning organization; deleting it deletes its related incidents.
    organization = models.ForeignKey(
        "user_management.Organization",
        on_delete=models.CASCADE,
        related_name="related_incidents",
    )
    # Optional channel filter; set to NULL if the referenced channel filter is deleted.
    channel_filter = models.ForeignKey(
        "alerts.ChannelFilter",
        on_delete=models.SET_NULL,
        null=True,
        related_name="related_incidents",
    )
    # Set automatically when the row is first created.
    created_at = models.DateTimeField(auto_now_add=True)
    # Defaults to True. NOTE(review): the code that flips this flag is not visible here.
    is_active = models.BooleanField(default=True)

    # Alert groups attached to this incident; reverse accessor is AlertGroup.related_incidents.
    attached_alert_groups = models.ManyToManyField(
        "alerts.AlertGroup",
        related_name="related_incidents",
    )

    class Meta:
        unique_together = ("organization", "incident_id")

    def get_incident_link(self) -> str:
        """Return the URL for this incident, built by ``get_incident_url`` from its organization and id."""
        return get_incident_url(self.organization, self.incident_id)
1 change: 1 addition & 0 deletions engine/apps/alerts/tasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
)
from .check_escalation_finished import check_escalation_finished_task # noqa: F401
from .custom_webhook_result import custom_webhook_result # noqa: F401
from .declare_incident import declare_incident # noqa: F401
from .delete_alert_group import delete_alert_group # noqa: F401
from .delete_alert_group import finish_delete_alert_group # noqa: F401
from .delete_alert_group import send_alert_group_signal_for_delete # noqa: F401
Expand Down
Loading

0 comments on commit 3af879a

Please sign in to comment.