Skip to content

Commit

Permalink
rasdaemon: add mc_event trigger
Browse files Browse the repository at this point in the history
Some users want to execute a custom script when a RAS event occurs, so add
a trigger for mc_event.

Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
  • Loading branch information
Ruidong Tian committed Jan 22, 2024
1 parent 0e82389 commit 10466ea
Show file tree
Hide file tree
Showing 9 changed files with 221 additions and 5 deletions.
8 changes: 4 additions & 4 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ all-local: $(SYSTEMD_SERVICES)

sbin_PROGRAMS = rasdaemon
rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \
bitfield.c
bitfield.c trigger.c
if WITH_SQLITE3
rasdaemon_SOURCES += ras-record.c
endif
Expand Down Expand Up @@ -89,7 +89,7 @@ include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \
ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h
ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h trigger.h

# This rule can't be called with more than one Makefile job (like make -j8)
# I can't figure out a way to fix that
Expand All @@ -116,6 +116,6 @@ upload:
# custom target
install-data-local:
$(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d"
if WITH_MEMORY_CE_PFA
$(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/triggers"
$(install_sh) @abs_srcdir@/misc/mc_event_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/mc_event_trigger"
$(install_sh) @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon"
endif
24 changes: 24 additions & 0 deletions misc/mc_event_trigger
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/sh
# This shell script can be executed by rasdaemon in daemon mode when an
# mc_event occurs. The environment variables carry the information
# reported by the tracepoint.
#
# environment:
# PATH Command search path handed over by rasdaemon
# TIMESTAMP Timestamp when error occurred
# COUNT Number of errors of the same type
# TYPE Error type, either Corrected or Uncorrected
# MESSAGE Error message
# LABEL Label of the affected DIMM(s)
# MC_INDEX DIMM identifier from DMI/SMBIOS if available
# TOP_LAYER Top layer of the error
# MIDDLE_LAYER Middle layer of the error
# LOWER_LAYER Lower layer of the error
# ADDRESS Error address
# GRAIN Minimum granularity for an error report, in bytes
# SYNDROME Syndrome of the error (or 0 if unknown or if the syndrome is not applicable)
# DRIVER_DETAIL Other driver-specific detail about the error
#

# Source an optional site-specific extension if it exists and is executable.
# NOTE(review): the path is relative, so it is looked up in the working
# directory this script is started from - confirm that this is the trigger
# directory and not the daemon's working directory.
[ -x ./mc_event_trigger.local ] && . ./mc_event_trigger.local

# Always report success to the caller.
exit 0
19 changes: 18 additions & 1 deletion misc/rasdaemon.env
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,21 @@ CPU_CE_THRESHOLD="18"
CPU_ISOLATION_CYCLE="24h"

# Prevent excessive isolation from causing an avalanche effect
CPU_ISOLATION_LIMIT="10"
CPU_ISOLATION_LIMIT="10"

# Event Trigger

# Event trigger will be executed when the specified event occurs.
# Only support mc_event now.
#
# Execute triggers in this directory
# TRIGGER_DIR=/etc/ras/triggers
TRIGGER_DIR=

# Execute these triggers when an mc_event occurs. A trigger is not
# executed if it is not specified.
# MC_CE_TRIGGER=mc_event_trigger
# MC_UE_TRIGGER=mc_event_trigger
MC_CE_TRIGGER=
MC_UE_TRIGGER=

1 change: 1 addition & 0 deletions misc/rasdaemon.spec.in
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ rm INSTALL %{buildroot}/usr/include/*.h
%{_mandir}/*/*
%{_unitdir}/*.service
%{_sysconfdir}/ras/dimm_labels.d
%{_sysconfdir}/ras/triggers/mc_event_trigger
%config(noreplace) %{_sysconfdir}/sysconfig/%{name}

%changelog
Expand Down
16 changes: 16 additions & 0 deletions ras-events.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#include "ras-logger.h"
#include "ras-page-isolation.h"
#include "ras-cpu-isolation.h"
#include "trigger.h"

/*
* Polling time, if read() doesn't block. Currently, trace_pipe_raw never
Expand All @@ -61,6 +62,9 @@
#endif

extern char* choices_disable;
/*
 * Table mapping a trace event name to the routine that configures the
 * user-defined trigger for that event. Only mc_event has a trigger.
 *
 * The storage-class specifier comes first: placing it after the qualifier
 * ("const static") is an obsolescent form that compilers warn about.
 */
static const struct event_trigger event_triggers[] = {
	{ .name = "mc_event", .setup = &mc_event_trigger_setup },
};

static int get_debugfs_dir(char *tracing_dir, size_t len)
{
Expand Down Expand Up @@ -275,6 +279,16 @@ int toggle_ras_mc_event(int enable)
return rc;
}

static void setup_event_trigger(char *event)
{
struct event_trigger trigger;
for (int i = 0; i < ARRAY_SIZE(event_triggers); i++) {
trigger = event_triggers[i];
if (!strcmp(event, trigger.name))
trigger.setup();
}
}

#if LINUX_VERSION_CODE < KERNEL_VERSION(5,18,0)
/*
* Set kernel filter. libtrace doesn't provide an API for setting filters
Expand Down Expand Up @@ -870,6 +884,8 @@ static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent,
return EINVAL;
}

setup_event_trigger(event);

log(ALL, LOG_INFO, "Enabled event %s:%s\n", group, event);

return 0;
Expand Down
82 changes: 82 additions & 0 deletions ras-mc-handler.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,93 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <traceevent/kbuffer.h>
#include <assert.h>
#include "ras-mc-handler.h"
#include "ras-record.h"
#include "ras-logger.h"
#include "ras-page-isolation.h"
#include "ras-report.h"
#include "trigger.h"

#define MAX_ENV 30
static char *mc_ce_trigger;
static char *mc_ue_trigger;

/*
 * Look up one mc_event trigger in the environment.
 *
 * @var:  name of the environment variable holding the trigger name
 * @kind: short tag ("ce" or "ue") used in the log messages
 *
 * Returns the trigger name, or NULL when the variable is unset or empty
 * (no trigger configured, nothing is logged) or when trigger_check()
 * rejects the trigger (an error is logged).
 */
static char *mc_trigger_from_env(const char *var, const char *kind)
{
	char *trigger = getenv(var);

	/* Not configured: this is the default and not an error. */
	if (!trigger || !strcmp(trigger, ""))
		return NULL;

	if (trigger_check(trigger) < 0) {
		log(SYSLOG, LOG_ERR, "Cannot access mc_event %s trigger `%s`\n",
		    kind, trigger);
		return NULL;
	}

	log(SYSLOG, LOG_INFO, "Setup mc_event %s trigger `%s`\n",
	    kind, trigger);

	return trigger;
}

/*
 * Read MC_CE_TRIGGER and MC_UE_TRIGGER from the environment and remember
 * the triggers in mc_ce_trigger / mc_ue_trigger.
 *
 * A trigger that is unset, empty or fails trigger_check() is stored as NULL
 * so that a NULL pointer is never handed to a "%s" conversion and an
 * inaccessible trigger is not kept around for later execution.
 */
void mc_event_trigger_setup(void)
{
	mc_ce_trigger = mc_trigger_from_env("MC_CE_TRIGGER", "ce");
	mc_ue_trigger = mc_trigger_from_env("MC_UE_TRIGGER", "ue");
}

/*
 * Export the fields of @ev as environment variables and run @mc_trigger
 * through run_trigger() with "mc_event" as reporter.
 *
 * @ev:         decoded memory controller event
 * @mc_trigger: trigger name; NULL or "" means nothing is run
 *
 * The slot index is advanced only after asprintf() succeeded. On failure
 * the content of the target pointer is undefined, so that slot must not be
 * passed to free() by the cleanup loop.
 */
static void run_mc_trigger(struct ras_mc_event *ev, const char *mc_trigger)
{
	char *env[MAX_ENV];
	int ei = 0;
	int i;

	if (!mc_trigger || !strcmp(mc_trigger, ""))
		return;

	if (asprintf(&env[ei], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "TIMESTAMP=%s", ev->timestamp) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "COUNT=%d", ev->error_count) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "TYPE=%s", ev->error_type) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "MESSAGE=%s", ev->msg) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "LABEL=%s", ev->label) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "MC_INDEX=%d", ev->mc_index) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "TOP_LAYER=%d", ev->top_layer) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "MIDDLE_LAYER=%d", ev->middle_layer) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "LOWER_LAYER=%d", ev->lower_layer) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "ADDRESS=%llx", ev->address) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "GRAIN=%lld", ev->grain) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "SYNDROME=%llx", ev->syndrome) < 0)
		goto out;
	ei++;
	if (asprintf(&env[ei], "DRIVER_DETAIL=%s", ev->driver_detail) < 0)
		goto out;
	ei++;

	/* Check the bound before the terminator is stored, not after. */
	assert(ei < MAX_ENV);
	env[ei] = NULL;

	run_trigger(mc_trigger, NULL, env, "mc_event");

out:
	/* Only the first ei slots hold strings returned by asprintf(). */
	for (i = 0; i < ei; i++)
		free(env[i]);
}


int ras_mc_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context)
Expand Down Expand Up @@ -195,6 +271,12 @@ int ras_mc_event_handler(struct trace_seq *s,
ras_report_mc_event(ras, &ev);
#endif

if (!strcmp(ev.error_type, "Corrected"))
run_mc_trigger(&ev, mc_ce_trigger);

if (!strcmp(ev.error_type, "Uncorrected"))
run_mc_trigger(&ev, mc_ue_trigger);

return 0;

parse_error:
Expand Down
2 changes: 2 additions & 0 deletions ras-mc-handler.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#include "ras-events.h"
#include <traceevent/event-parse.h>

/*
 * Read the MC_CE_TRIGGER and MC_UE_TRIGGER environment variables and record
 * the trigger programs used for corrected and uncorrected mc_event errors.
 */
void mc_event_trigger_setup(void);

int ras_mc_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context);
Expand Down
61 changes: 61 additions & 0 deletions trigger.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/wait.h>
#include "ras-logger.h"
#include "trigger.h"

/*
 * Run a trigger program in a child process and wait for it to finish.
 *
 * @trigger:  trigger name; looked up below $TRIGGER_DIR when that variable
 *            is set, otherwise used as the path itself (the same rule
 *            trigger_check() applies)
 * @argv:     argument vector for the trigger, or NULL to pass only the
 *            trigger path as argv[0]
 * @env:      NULL-terminated environment for the trigger
 * @reporter: name of the caller, used for logging only
 *
 * A non-zero exit status or a terminating signal of the trigger is logged.
 * The path buffer is released on every return path of the parent.
 */
void run_trigger(const char *trigger, char *argv[], char **env, const char* reporter)
{
	pid_t child;
	char *path;
	char *default_argv[2];
	int status;
	const char *trigger_dir = getenv("TRIGGER_DIR");

	log(SYSLOG, LOG_INFO, "Running trigger `%s' (reporter: %s)\n", trigger, reporter);

	/* Never format a NULL directory through "%s". */
	if (trigger_dir) {
		if (asprintf(&path, "%s/%s", trigger_dir, trigger) < 0)
			return;
	} else {
		if (asprintf(&path, "%s", trigger) < 0)
			return;
	}

	/* A NULL argv is not portable for execve(); supply argv[0]. */
	if (!argv) {
		default_argv[0] = path;
		default_argv[1] = NULL;
		argv = default_argv;
	}

	child = fork();
	if (child < 0) {
		log(SYSLOG, LOG_ERR, "Cannot create process for trigger");
		goto out;
	}

	if (child == 0) {
		execve(path, argv, env);
		/* Only reached when execve() failed. */
		_exit(127);
	}

	/* status is only meaningful when waitpid() succeeded. */
	if (waitpid(child, &status, 0) < 0) {
		log(SYSLOG, LOG_ERR, "Cannot wait for trigger %s\n", trigger);
		goto out;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status)) {
		log(SYSLOG, LOG_INFO, "Trigger %s exited with status %d",
		    trigger, WEXITSTATUS(status));
	} else if (WIFSIGNALED(status)) {
		log(SYSLOG, LOG_INFO, "Trigger %s killed by signal %d",
		    trigger, WTERMSIG(status));
	}

out:
	free(path);
}

/*
 * Check that the trigger program @s can be read and executed.
 *
 * When TRIGGER_DIR is set in the environment, @s is looked up below that
 * directory; otherwise @s is used as the path itself.
 *
 * Returns the result of access(2) on that path, or -1 when the combined
 * path cannot be allocated.
 */
int trigger_check(char *s)
{
	const char *dir = getenv("TRIGGER_DIR");
	char *full = NULL;
	int ret;

	if (!dir)
		return access(s, R_OK|X_OK);

	if (asprintf(&full, "%s/%s", dir, s) < 0)
		return -1;

	ret = access(full, R_OK|X_OK);
	free(full);

	return ret;
}
13 changes: 13 additions & 0 deletions trigger.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#ifndef __TRIGGER_H__
#define __TRIGGER_H__

/* Associates a trace event name with the routine that sets up its trigger. */
struct event_trigger {
/* Event name this entry applies to. */
const char *name;
/* Setup hook; takes no arguments and returns nothing. */
void (*setup)(void);
};

/*
 * Check that trigger @s is readable and executable, looking it up below
 * $TRIGGER_DIR when that variable is set. Returns the access(2) result, or
 * -1 if the path cannot be built.
 */
int trigger_check(char *s);
/*
 * Fork, execute the trigger named @trigger with arguments @argv and
 * environment @env, wait for it and log an abnormal exit. @reporter is
 * only used in the log message.
 */
void run_trigger(const char *trigger, char *argv[], char **env, const char* reporter);


#endif

0 comments on commit 10466ea

Please sign in to comment.