Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PROF-10588] Support GVL profiling preview on Ruby 3.2 #3939

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -804,6 +804,10 @@ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) {

if (state->gvl_profiling_enabled) {
#ifndef NO_GVL_INSTRUMENTATION
#ifdef USE_GVL_PROFILING_3_2_WORKAROUNDS
gvl_profiling_state_thread_tracking_workaround();
#endif

state->gvl_profiling_hook = rb_internal_thread_add_event_hook(
on_gvl_event,
(
Expand Down Expand Up @@ -1304,7 +1308,7 @@ static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self) {
//
// In fact, the `target_thread` that this event is about may not even be the current thread. (So be careful with thread locals that
// are not directly tied to the `target_thread` object and the like)
VALUE target_thread = event_data->thread;
gvl_profiling_thread target_thread = thread_from_event(event_data);

if (event_id == RUBY_INTERNAL_THREAD_EVENT_READY) { /* waiting for gvl */
thread_context_collector_on_gvl_waiting(target_thread);
Expand All @@ -1315,11 +1319,19 @@ static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self) {
// it tags threads it's tracking, so if a thread is tagged then by definition we know that thread belongs to the main
// Ractor. Thus, if we really really wanted to access the state, we could do it after making sure we're on the correct Ractor.

#ifdef USE_GVL_PROFILING_3_2_WORKAROUNDS
target_thread = gvl_profiling_state_maybe_initialize();
#endif

bool should_sample = thread_context_collector_on_gvl_running(target_thread);

if (should_sample) {
// should_sample is only true if a thread belongs to the main Ractor, so we're good to go
rb_postponed_job_trigger(after_gvl_running_from_postponed_job_handle);
#ifndef NO_POSTPONED_TRIGGER
rb_postponed_job_trigger(after_gvl_running_from_postponed_job_handle);
#else
rb_postponed_job_register_one(0, after_gvl_running_from_postponed_job, NULL);
#endif
}
} else {
// This is a very delicate time and it's hard for us to raise an exception so let's at least complain to stderr
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#include <ruby.h>
#include <ruby/thread.h>

#include "collectors_thread_context.h"
#include "clock_id.h"
Expand Down Expand Up @@ -80,7 +79,7 @@
// This is used as a placeholder to mark threads that are allowed to be profiled (enabled)
// (e.g. to avoid trying to gvl profile threads that are not from the main Ractor)
// and for which there's no data yet
#define GVL_WAITING_ENABLED_EMPTY INTPTR_MAX
#define GVL_WAITING_ENABLED_EMPTY RUBY_FIXNUM_MAX

static ID at_active_span_id; // id of :@active_span in Ruby
static ID at_active_trace_id; // id of :@active_trace in Ruby
Expand All @@ -92,10 +91,6 @@ static ID at_otel_values_id; // id of :@otel_values in Ruby
static ID at_parent_span_id_id; // id of :@parent_span_id in Ruby
static ID at_datadog_trace_id; // id of :@datadog_trace in Ruby

#ifndef NO_GVL_INSTRUMENTATION
static rb_internal_thread_specific_key_t per_thread_gvl_waiting_timestamp_key;
#endif

// This is used by `thread_context_collector_on_gvl_running`. Because when that method gets called we're not sure if
// it's safe to access the state of the thread context collector, we store this setting as a global value. This does
// mean this setting is shared among all thread context collectors, and thus it's "last writer wins".
Expand Down Expand Up @@ -320,7 +315,7 @@ void collectors_thread_context_init(VALUE profiling_module) {

#ifndef NO_GVL_INSTRUMENTATION
// This will raise if Ruby already ran out of thread-local keys
per_thread_gvl_waiting_timestamp_key = rb_internal_thread_specific_key_create();
gvl_profiling_init();
#endif

gc_profiling_init();
Expand Down Expand Up @@ -1019,7 +1014,7 @@ static void initialize_context(VALUE thread, struct per_thread_context *thread_c
// (Setting this is potentially a race, but what we want is to avoid _stale_ data, so
// if this gets set concurrently with context initialization, then such a value will belong
// to the current profiler instance, so that's OK)
rb_internal_thread_specific_set(thread, per_thread_gvl_waiting_timestamp_key, (void *) GVL_WAITING_ENABLED_EMPTY);
gvl_profiling_state_thread_object_set(thread, GVL_WAITING_ENABLED_EMPTY);
#endif
}

Expand Down Expand Up @@ -1083,7 +1078,7 @@ static int per_thread_context_as_ruby_hash(st_data_t key_thread, st_data_t value
ID2SYM(rb_intern("gc_tracking.wall_time_at_start_ns")), /* => */ LONG2NUM(thread_context->gc_tracking.wall_time_at_start_ns),

#ifndef NO_GVL_INSTRUMENTATION
ID2SYM(rb_intern("gvl_waiting_at")), /* => */ LONG2NUM((intptr_t) rb_internal_thread_specific_get(thread, per_thread_gvl_waiting_timestamp_key)),
ID2SYM(rb_intern("gvl_waiting_at")), /* => */ LONG2NUM(gvl_profiling_state_thread_object_get(thread)),
#endif
};
for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(context_as_hash, arguments[i], arguments[i+1]);
Expand Down Expand Up @@ -1558,7 +1553,7 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self

#ifndef NO_GVL_INSTRUMENTATION
// This function can get called from outside the GVL and even on non-main Ractors
void thread_context_collector_on_gvl_waiting(VALUE thread) {
void thread_context_collector_on_gvl_waiting(gvl_profiling_thread thread) {
// Because this function gets called from a thread that is NOT holding the GVL, we avoid touching the
// per-thread context directly.
//
Expand All @@ -1567,19 +1562,19 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self
//
// Also, this function can get called on the non-main Ractor. We deal with this by checking if the value in the context
// is non-zero, since only `initialize_context` ever sets the value from 0 to non-zero for threads it sees.
intptr_t thread_being_profiled = (intptr_t) rb_internal_thread_specific_get(thread, per_thread_gvl_waiting_timestamp_key);
intptr_t thread_being_profiled = gvl_profiling_state_get(thread);
if (!thread_being_profiled) return;

long current_monotonic_wall_time_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE);
if (current_monotonic_wall_time_ns <= 0 || current_monotonic_wall_time_ns > GVL_WAITING_ENABLED_EMPTY) return;

rb_internal_thread_specific_set(thread, per_thread_gvl_waiting_timestamp_key, (void *) current_monotonic_wall_time_ns);
gvl_profiling_state_set(thread, current_monotonic_wall_time_ns);
}

// This function can get called from outside the GVL and even on non-main Ractors
__attribute__((warn_unused_result))
bool thread_context_collector_on_gvl_running_with_threshold(VALUE thread, uint32_t waiting_for_gvl_threshold_ns) {
intptr_t gvl_waiting_at = (intptr_t) rb_internal_thread_specific_get(thread, per_thread_gvl_waiting_timestamp_key);
bool thread_context_collector_on_gvl_running_with_threshold(gvl_profiling_thread thread, uint32_t waiting_for_gvl_threshold_ns) {
intptr_t gvl_waiting_at = gvl_profiling_state_get(thread);

// Thread was not being profiled / not waiting on gvl
if (gvl_waiting_at == 0 || gvl_waiting_at == GVL_WAITING_ENABLED_EMPTY) return false;
Expand All @@ -1595,17 +1590,17 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self
// We flip the gvl_waiting_at to negative to mark that the thread is now running and no longer waiting
intptr_t gvl_waiting_at_is_now_running = -gvl_waiting_at;

rb_internal_thread_specific_set(thread, per_thread_gvl_waiting_timestamp_key, (void *) gvl_waiting_at_is_now_running);
gvl_profiling_state_set(thread, gvl_waiting_at_is_now_running);
} else {
// We decided not to sample. Let's mark the thread back to the initial "enabled but empty" state
rb_internal_thread_specific_set(thread, per_thread_gvl_waiting_timestamp_key, (void *) GVL_WAITING_ENABLED_EMPTY);
gvl_profiling_state_set(thread, GVL_WAITING_ENABLED_EMPTY);
}

return should_sample;
}

__attribute__((warn_unused_result))
bool thread_context_collector_on_gvl_running(VALUE thread) {
bool thread_context_collector_on_gvl_running(gvl_profiling_thread thread) {
return thread_context_collector_on_gvl_running_with_threshold(thread, global_waiting_for_gvl_threshold_ns);
}

Expand Down Expand Up @@ -1643,9 +1638,7 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self

if (!state->timeline_enabled) rb_raise(rb_eRuntimeError, "GVL profiling requires timeline to be enabled");

struct per_thread_context *thread_context = get_or_create_context_for(current_thread, state);

intptr_t gvl_waiting_at = (intptr_t) rb_internal_thread_specific_get(current_thread, per_thread_gvl_waiting_timestamp_key);
intptr_t gvl_waiting_at = gvl_profiling_state_thread_object_get(current_thread);

if (gvl_waiting_at >= 0) {
// @ivoanjo: I'm not sure if this can ever happen. This means that we're not on the same thread
Expand All @@ -1655,11 +1648,15 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self
return Qfalse;
}

struct per_thread_context *thread_context = get_or_create_context_for(current_thread, state);

// We don't actually account for cpu-time during Waiting for GVL. BUT, we may chose to push an
// extra sample to represent the period prior to Waiting for GVL. To support that, we retrieve the current
// cpu-time of the thread and let `update_metrics_and_sample` decide what to do with it.
long cpu_time_for_thread = cpu_time_now_ns(thread_context);

// TODO: Should we update the dynamic sampling rate overhead tracking with this sample as well?

update_metrics_and_sample(
state,
/* thread_being_sampled: */ current_thread,
Expand Down Expand Up @@ -1688,7 +1685,7 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self
sampling_buffer* sampling_buffer,
long current_cpu_time_ns
) {
intptr_t gvl_waiting_at = (intptr_t) rb_internal_thread_specific_get(thread_being_sampled, per_thread_gvl_waiting_timestamp_key);
intptr_t gvl_waiting_at = gvl_profiling_state_thread_object_get(thread_being_sampled);

bool is_gvl_waiting_state = gvl_waiting_at != 0 && gvl_waiting_at != GVL_WAITING_ENABLED_EMPTY;

Expand Down Expand Up @@ -1738,7 +1735,7 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self

if (gvl_waiting_at < 0) {
// Negative means the waiting for GVL just ended, so we clear the state, so next samples no longer represent waiting
rb_internal_thread_specific_set(thread_being_sampled, per_thread_gvl_waiting_timestamp_key, (void *) GVL_WAITING_ENABLED_EMPTY);
gvl_profiling_state_thread_object_set(thread_being_sampled, GVL_WAITING_ENABLED_EMPTY);
}

long gvl_waiting_started_wall_time_ns = labs(gvl_waiting_at);
Expand Down Expand Up @@ -1779,21 +1776,21 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self
static VALUE _native_on_gvl_waiting(DDTRACE_UNUSED VALUE self, VALUE thread) {
ENFORCE_THREAD(thread);

thread_context_collector_on_gvl_waiting(thread);
thread_context_collector_on_gvl_waiting(thread_from_thread_object(thread));
return Qnil;
}

static VALUE _native_gvl_waiting_at_for(DDTRACE_UNUSED VALUE self, VALUE thread) {
ENFORCE_THREAD(thread);

intptr_t gvl_waiting_at = (intptr_t) rb_internal_thread_specific_get(thread, per_thread_gvl_waiting_timestamp_key);
intptr_t gvl_waiting_at = gvl_profiling_state_thread_object_get(thread);
return LONG2NUM(gvl_waiting_at);
}

static VALUE _native_on_gvl_running(DDTRACE_UNUSED VALUE self, VALUE thread) {
ENFORCE_THREAD(thread);

return thread_context_collector_on_gvl_running(thread) ? Qtrue : Qfalse;
return thread_context_collector_on_gvl_running(thread_from_thread_object(thread)) ? Qtrue : Qfalse;
}

static VALUE _native_sample_after_gvl_running(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
#include <ruby.h>
#include <stdbool.h>

#include "gvl_profiling_helper.h"

void thread_context_collector_sample(
VALUE self_instance,
long current_monotonic_wall_time_ns,
Expand All @@ -14,6 +16,9 @@ VALUE thread_context_collector_sample_after_gc(VALUE self_instance);
void thread_context_collector_on_gc_start(VALUE self_instance);
__attribute__((warn_unused_result)) bool thread_context_collector_on_gc_finish(VALUE self_instance);
VALUE enforce_thread_context_collector_instance(VALUE object);
void thread_context_collector_on_gvl_waiting(VALUE thread);
__attribute__((warn_unused_result)) bool thread_context_collector_on_gvl_running(VALUE thread);
VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance);

#ifndef NO_GVL_INSTRUMENTATION
void thread_context_collector_on_gvl_waiting(gvl_profiling_thread thread);
__attribute__((warn_unused_result)) bool thread_context_collector_on_gvl_running(gvl_profiling_thread thread);
VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance);
#endif
10 changes: 6 additions & 4 deletions ext/datadog_profiling_native_extension/extconf.rb
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,6 @@ def skip_building_extension!(reason)

have_func "malloc_stats"

# On older Rubies, there was no GVL instrumentation API and APIs created to support it
# TODO: We can probably support Ruby 3.2 as well here, but we haven't done that work yet
$defs << "-DNO_GVL_INSTRUMENTATION" if RUBY_VERSION < "3.3"

# On older Rubies, rb_postponed_job_preregister/rb_postponed_job_trigger did not exist
$defs << "-DNO_POSTPONED_TRIGGER" if RUBY_VERSION < "3.3"

Expand All @@ -147,6 +143,12 @@ def skip_building_extension!(reason)
# On older Rubies, some of the Ractor internal APIs were directly accessible
$defs << "-DUSE_RACTOR_INTERNAL_APIS_DIRECTLY" if RUBY_VERSION < "3.3"

# On older Rubies, there was no GVL instrumentation API and APIs created to support it
$defs << "-DNO_GVL_INSTRUMENTATION" if RUBY_VERSION < "3.2"

# Supporting GVL instrumentation on 3.2 needs some workarounds
$defs << "-DUSE_GVL_PROFILING_3_2_WORKAROUNDS" if RUBY_VERSION.start_with?("3.2")

# On older Rubies, there was no struct rb_native_thread. See private_vm_api_acccess.c for details.
$defs << "-DNO_RB_NATIVE_THREAD" if RUBY_VERSION < "3.2"

Expand Down
50 changes: 50 additions & 0 deletions ext/datadog_profiling_native_extension/gvl_profiling_helper.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#include <ruby.h>
#include <ruby/thread.h>
#include "gvl_profiling_helper.h"

#if !defined(NO_GVL_INSTRUMENTATION) && !defined(USE_GVL_PROFILING_3_2_WORKAROUNDS) // Ruby 3.3+
// Key for the per-thread "waiting for GVL" timestamp kept in Ruby's internal
// thread-specific storage (rb_internal_thread_specific_get/set).
// Defined here; declared `extern` in gvl_profiling_helper.h so the inline
// accessors in that header can reach it.
rb_internal_thread_specific_key_t gvl_waiting_tls_key;

// One-time setup: allocates the thread-specific key used by GVL profiling.
// NOTE: rb_internal_thread_specific_key_create raises if the VM has run out of
// thread-local keys, so this must be called where raising is acceptable
// (it is invoked from collectors_thread_context_init).
void gvl_profiling_init(void) {
  gvl_waiting_tls_key = rb_internal_thread_specific_key_create();
}

#endif

#ifdef USE_GVL_PROFILING_3_2_WORKAROUNDS // Ruby 3.2
__thread gvl_profiling_thread gvl_waiting_tls;
static bool gvl_profiling_state_thread_tracking_workaround_installed = false;

// Thread-start event hook: resets this native thread's profiling thread-local so
// that state left behind by a previous Ruby thread which ran on the same (reused)
// native thread can never be observed. See the workaround rationale below.
static void on_thread_start(
  DDTRACE_UNUSED rb_event_flag_t _ignored_event_flag,
  DDTRACE_UNUSED const rb_internal_thread_event_data_t *_ignored_event_data,
  DDTRACE_UNUSED void *_ignored_callback_arg
) {
  gvl_profiling_thread cleared_state = {.thread = NULL};
  gvl_waiting_tls = cleared_state;
}

// Hack: We're using the gvl_waiting_tls native thread-local to store per-thread information. Unfortunately, Ruby puts a big hole
// in our plan because it reuses native threads -- specifically, in Ruby 3.2, native threads are still 1:1 to Ruby
// threads (M:N wasn't a thing yet) BUT once a Ruby thread dies, the VM will keep the native thread around for a
// bit, and if another Ruby thread starts right after, Ruby will reuse the native thread, rather than create a new one.
//
// This will mean that the new Ruby thread will still have the same native thread-local data that we set on the
// old thread. For the purposes of our tracking, where we're keeping a pointer to the current thread object in
// thread-local storage **this is disastrous** since it means we'll be pointing at the wrong thread (and its
// memory may have been freed or reused since!)
//
// To work around this issue, once GVL profiling is enabled, we install an event hook on thread start
// events that clears the thread-local data. This guarantees that there will be no stale data -- any existing
// data will be cleared at thread start.
//
// Note that once installed, this event hook becomes permanent -- stopping the profiler does not stop this event
// hook, unlike all others. This is because we can't afford to miss any thread start events while the
// profiler is stopped (e.g. during reconfiguration) as that would mean stale data once the profiler starts again.
// Installs, at most once, the thread-start hook that clears stale native
// thread-local state (see comment above for why Ruby 3.2 needs this).
// The hook is intentionally permanent: it is never removed, even when the
// profiler stops, so no thread-start event can ever be missed.
void gvl_profiling_state_thread_tracking_workaround(void) {
  if (!gvl_profiling_state_thread_tracking_workaround_installed) {
    rb_internal_thread_add_event_hook(on_thread_start, RUBY_INTERNAL_THREAD_EVENT_STARTED, NULL);
    gvl_profiling_state_thread_tracking_workaround_installed = true;
  }
}
#endif
75 changes: 75 additions & 0 deletions ext/datadog_profiling_native_extension/gvl_profiling_helper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#pragma once

// This helper is used by the Datadog::Profiling::Collectors::ThreadContext to store data used when profiling the GVL.
// It's tested through that class' interfaces.
// ---

#include "extconf.h"

#if !defined(NO_GVL_INSTRUMENTATION) && !defined(USE_GVL_PROFILING_3_2_WORKAROUNDS) // Ruby 3.3+
#include <ruby.h>
#include <ruby/thread.h>
#include "datadog_ruby_common.h"

// On 3.3+ a profiled thread is represented directly by its Ruby Thread VALUE.
typedef struct { VALUE thread; } gvl_profiling_thread;
// Thread-specific storage key for the per-thread GVL-waiting timestamp;
// defined in gvl_profiling_helper.c.
extern rb_internal_thread_specific_key_t gvl_waiting_tls_key;

void gvl_profiling_init(void);

// Wraps a Ruby Thread VALUE into the opaque handle type used by the rest of
// the GVL profiling code.
static inline gvl_profiling_thread thread_from_thread_object(VALUE thread) {
  return (gvl_profiling_thread) {.thread = thread};
}

// On 3.3+ the internal thread event carries the target thread directly.
static inline gvl_profiling_thread thread_from_event(const rb_internal_thread_event_data_t *event_data) {
  return thread_from_thread_object(event_data->thread);
}

// Reads the per-thread GVL-waiting timestamp. The value is an intptr_t smuggled
// through the void* thread-specific slot, not a real pointer.
static inline intptr_t gvl_profiling_state_get(gvl_profiling_thread thread) {
  return (intptr_t) rb_internal_thread_specific_get(thread.thread, gvl_waiting_tls_key);
}

// Writes the per-thread GVL-waiting timestamp (intptr_t stored as a void*).
static inline void gvl_profiling_state_set(gvl_profiling_thread thread, intptr_t value) {
  rb_internal_thread_specific_set(thread.thread, gvl_waiting_tls_key, (void *) value);
}
#endif

#ifdef USE_GVL_PROFILING_3_2_WORKAROUNDS // Ruby 3.2
// On 3.2 a profiled thread is represented by an opaque VM-internal pointer
// (obtained via private VM APIs), not a Ruby Thread VALUE.
typedef struct { void *thread; } gvl_profiling_thread;
// Native (per-OS-thread) thread-local holding this thread's handle; defined in
// gvl_profiling_helper.c. Cleared on thread start by the workaround hook to
// avoid stale data when Ruby reuses native threads.
extern __thread gvl_profiling_thread gvl_waiting_tls;

// No key allocation needed on 3.2 -- state lives in the native thread-local.
static inline void gvl_profiling_init(void) { }

// This header gets included in private_vm_api_access.c which can't include datadog_ruby_common.h so we replicate this
// helper here
// NOTE(review): if datadog_ruby_common.h also defines DDTRACE_UNUSED, this
// unconditional redefinition may warrant an #ifndef guard -- confirm.
#ifdef __GNUC__
#define DDTRACE_UNUSED __attribute__((unused))
#else
#define DDTRACE_UNUSED
#endif

// NOTE: This is a hack that relies on the knowledge that on Ruby 3.2 the
// RUBY_INTERNAL_THREAD_EVENT_READY and RUBY_INTERNAL_THREAD_EVENT_RESUMED events always get called on the thread they
// are about. Thus, we can use our thread local storage hack to get this data, even though the event doesn't include it.
static inline gvl_profiling_thread thread_from_event(DDTRACE_UNUSED const void *event_data) {
  return gvl_waiting_tls;
}

void gvl_profiling_state_thread_tracking_workaround(void);
gvl_profiling_thread gvl_profiling_state_maybe_initialize(void);

// Implementing these on Ruby 3.2 requires access to private VM things, so the following methods are
// implemented in `private_vm_api_access.c`
gvl_profiling_thread thread_from_thread_object(VALUE thread);
intptr_t gvl_profiling_state_get(gvl_profiling_thread thread);
void gvl_profiling_state_set(gvl_profiling_thread thread, intptr_t value);
#endif

#ifndef NO_GVL_INSTRUMENTATION // For all Rubies supporting GVL profiling (3.2+)
// Convenience wrappers for call sites that hold a Ruby Thread VALUE rather than
// a gvl_profiling_thread handle: convert first, then delegate to the
// version-specific state accessors above.
static inline intptr_t gvl_profiling_state_thread_object_get(VALUE thread) {
  gvl_profiling_thread handle = thread_from_thread_object(thread);
  return gvl_profiling_state_get(handle);
}

static inline void gvl_profiling_state_thread_object_set(VALUE thread, intptr_t value) {
  gvl_profiling_thread handle = thread_from_thread_object(thread);
  gvl_profiling_state_set(handle, value);
}
#endif
Loading
Loading