Skip to content

Commit

Permalink
Still figuring out how to handle sampling of threads after waiting on…
Browse files Browse the repository at this point in the history
… gvl
  • Loading branch information
ivoanjo committed Sep 3, 2024
1 parent 2887c0f commit 5f953f1
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -921,7 +921,6 @@ static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused) {

state->during_sample = true;

// Trigger sampling using the Collectors::ThreadState; rescue against any exceptions that happen during sampling
safely_call(thread_context_collector_sample_after_gc, state->thread_context_collector_instance, state->self_instance);

state->during_sample = false;
Expand Down Expand Up @@ -1294,13 +1293,26 @@ static void on_gvl_event(rb_event_flag_t event_id, const rb_internal_thread_even
thread_context_collector_on_gvl_waiting(current_thread);
} else if (event_id == RUBY_INTERNAL_THREAD_EVENT_RESUMED) { /* running/runnable */
bool should_sample = thread_context_collector_on_gvl_running(current_thread);
if (should_sample) rb_postponed_job_trigger(after_gvl_running_from_postponed_job_handle);

if (should_sample) {
// should_sample is only true if a thread belongs to the main Ractor, so we're good to go
rb_postponed_job_trigger(after_gvl_running_from_postponed_job_handle);
}
} else {
// This is a very delicate time and it's hard for us to raise an exception so let's at least complain to stderr
fprintf(stderr, "[ddtrace] Unexpected value in on_gvl_event (%d)\n", event_id);
}
}

static void after_gvl_running_from_postponed_job(DDTRACE_UNUSED void *_unused) {
fprintf(stderr, "After gvl running!\n");
struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state; // Read from global variable, see "sampler global state safety" note above

// This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
if (state == NULL) return;

state->during_sample = true;

safely_call(thread_context_collector_sample_after_gvl_running, state->thread_context_collector_instance, state->self_instance);

state->during_sample = false;
}
67 changes: 62 additions & 5 deletions ext/datadog_profiling_native_extension/collectors_thread_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@
// This is used as a placeholder to mark threads that are allowed to be profiled (enabled)
// (e.g. to avoid trying to gvl profile threads that are not from the main Ractor)
// and for which there's no data yet
#define GVL_WAITING_ENABLED_EMPTY UINTPTR_MAX
#define GVL_WAITING_ENABLED_EMPTY INTPTR_MAX

static ID at_active_span_id; // id of :@active_span in Ruby
static ID at_active_trace_id; // id of :@active_trace in Ruby
Expand Down Expand Up @@ -536,6 +536,13 @@ void update_metrics_and_sample(
thread_context->gc_tracking.cpu_time_at_start_ns,
IS_NOT_WALL_TIME
);

// TODO: This feels like a hack here -- is there a better place to do this?
intptr_t gvl_waiting_at = (intptr_t) rb_thread_local_variable_get(thread_being_sampled, per_thread_gvl_waiting_timestamp_key);
if (gvl_waiting_at != 0 && gvl_waiting_at != GVL_WAITING_ENABLED_EMPTY) {
// TODO: Was here
}

long wall_time_elapsed_ns = update_time_since_previous_sample(
&thread_context->wall_time_at_previous_sample_ns,
current_monotonic_wall_time_ns,
Expand Down Expand Up @@ -1507,11 +1514,11 @@ void thread_context_collector_on_gvl_waiting(VALUE thread) {
//
// Also, this function can get called on the non-main Ractor. We deal with this by checking if the value in the context
// is non-zero, since only `initialize_context` ever sets the value from 0 to non-zero for threads it sees.
uintptr_t thread_being_profiled = (uintptr_t) rb_internal_thread_specific_get(thread, per_thread_gvl_waiting_timestamp_key);
intptr_t thread_being_profiled = (intptr_t) rb_internal_thread_specific_get(thread, per_thread_gvl_waiting_timestamp_key);
if (!thread_being_profiled) return;

long current_monotonic_wall_time_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE);
if (current_monotonic_wall_time_ns <= 0 && ((unsigned long) current_monotonic_wall_time_ns) > UINTPTR_MAX) return;
if (current_monotonic_wall_time_ns <= 0 || current_monotonic_wall_time_ns > GVL_WAITING_ENABLED_EMPTY) return;

rb_internal_thread_specific_set(thread, per_thread_gvl_waiting_timestamp_key, (void *) current_monotonic_wall_time_ns);
}
Expand All @@ -1521,12 +1528,62 @@ void thread_context_collector_on_gvl_waiting(VALUE thread) {
// This function can get called from outside the GVL and even on non-main Ractors
__attribute__((warn_unused_result))
bool thread_context_collector_on_gvl_running(VALUE thread) {
uintptr_t gvl_waiting_at = (uintptr_t) rb_internal_thread_specific_get(thread, per_thread_gvl_waiting_timestamp_key);
intptr_t gvl_waiting_at = (intptr_t) rb_internal_thread_specific_get(thread, per_thread_gvl_waiting_timestamp_key);

// Thread was not being profiled / not waiting on gvl
if (gvl_waiting_at == 0 || gvl_waiting_at == GVL_WAITING_ENABLED_EMPTY) return false;

// @ivoanjo: I'm not sure if this can happen -- It means we should've sampled already but didn't
if (gvl_waiting_at < 0) return true;

long waiting_for_gvl_duration_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE) - gvl_waiting_at;

return waiting_for_gvl_duration_ns >= WAITING_FOR_GVL_THRESHOLD_NS;
bool should_sample = waiting_for_gvl_duration_ns >= WAITING_FOR_GVL_THRESHOLD_NS;

if (should_sample) {
// We flip the gvl_waiting_at to negative to mark that the thread is now running
intptr_t gvl_waiting_at_is_now_running = -gvl_waiting_at;

rb_internal_thread_specific_set(thread, per_thread_gvl_waiting_timestamp_key, (void *) gvl_waiting_at_is_now_running);
} else {
// We decided not to sample. Let's mark the thread back to being profiled, but having no data yet.
rb_internal_thread_specific_set(thread, per_thread_gvl_waiting_timestamp_key, (void *) GVL_WAITING_ENABLED_EMPTY);
}

return should_sample;
}

VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance) {
struct thread_context_collector_state *state;
TypedData_Get_Struct(self_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);

if (!state->timeline_enabled) rb_raise(rb_eRuntimeError, "gvl profiling requires timeline to be enabled");

VALUE current_thread = rb_thread_current();
struct per_thread_context *thread_context = get_or_create_context_for(thread, state);

intptr_t gvl_waiting_at = (intptr_t) rb_internal_thread_specific_get(current_thread, per_thread_gvl_waiting_timestamp_key);

if (gvl_waiting_at >= 0) {
// @ivoanjo: I'm not sure if this can ever happen. This means that we're not on the same thread
// that ran `thread_context_collector_on_gvl_running` and made the decision to sample.
// We do nothing in this case.
return Qnil;
}

// The timestamp is encoded as a negative value in thread_context_collector_on_gvl_running
long gvl_waiting_wall_time_ns = -gvl_waiting_at;

// Wrong
update_metrics_and_sample(
state,
/* thread_being_sampled: */ thread,
/* stack_from_thread: */ thread,
thread_context,
thread_context->sampling_buffer,
FIXME,
FIXME
);

return Qnil; // To allow this to be called from rb_rescue2
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ __attribute__((warn_unused_result)) bool thread_context_collector_on_gc_finish(V
VALUE enforce_thread_context_collector_instance(VALUE object);
void thread_context_collector_on_gvl_waiting(VALUE thread);
__attribute__((warn_unused_result)) bool thread_context_collector_on_gvl_running(VALUE thread);
VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance);

0 comments on commit 5f953f1

Please sign in to comment.