From 568db8e956e7d54a88f1eec30f2ab5dd7e3672e7 Mon Sep 17 00:00:00 2001
From: Ivo Anjo <ivo.anjo@datadoghq.com>
Date: Fri, 20 Sep 2024 13:45:47 +0100
Subject: [PATCH 1/6] Refactor and abstract API needed for storing GVL
 profiling info in threads

This is a first step towards supporting GVL profiling in Ruby 3.2:
for Ruby 3.2 we'll do an entirely different implementation of this
interface, as a replacement for `rb_internal_thread_specific_*` which
were only added on 3.3.
---
 .../collectors_cpu_and_wall_time_worker.c     |  2 +-
 .../collectors_thread_context.c               | 41 ++++++++----------
 .../collectors_thread_context.h               |  6 ++-
 .../gvl_profiling_helper.c                    | 14 ++++++
 .../gvl_profiling_helper.h                    | 43 +++++++++++++++++++
 .../collectors/thread_context_spec.rb         |  4 +-
 6 files changed, 82 insertions(+), 28 deletions(-)
 create mode 100644 ext/datadog_profiling_native_extension/gvl_profiling_helper.c
 create mode 100644 ext/datadog_profiling_native_extension/gvl_profiling_helper.h

diff --git a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
index 591354a906b..5511c7c6bde 100644
--- a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
+++ b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
@@ -1304,7 +1304,7 @@ static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self) {
     //
     // In fact, the `target_thread` that this event is about may not even be the current thread. (So be careful with thread locals that
     // are not directly tied to the `target_thread` object and the like)
-    VALUE target_thread = event_data->thread;
+    gvl_profiling_thread target_thread = thread_from_event(event_data);
 
     if (event_id == RUBY_INTERNAL_THREAD_EVENT_READY) { /* waiting for gvl */
       thread_context_collector_on_gvl_waiting(target_thread);
diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c
index a71aa29d0c2..b4eb5a9b256 100644
--- a/ext/datadog_profiling_native_extension/collectors_thread_context.c
+++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c
@@ -1,5 +1,4 @@
 #include <ruby.h>
-#include <ruby/thread.h>
 
 #include "collectors_thread_context.h"
 #include "clock_id.h"
@@ -80,7 +79,7 @@
 // This is used as a placeholder to mark threads that are allowed to be profiled (enabled)
 // (e.g. to avoid trying to gvl profile threads that are not from the main Ractor)
 // and for which there's no data yet
-#define GVL_WAITING_ENABLED_EMPTY INTPTR_MAX
+#define GVL_WAITING_ENABLED_EMPTY RUBY_FIXNUM_MAX
 
 static ID at_active_span_id;  // id of :@active_span in Ruby
 static ID at_active_trace_id; // id of :@active_trace in Ruby
@@ -92,10 +91,6 @@ static ID at_otel_values_id;  // id of :@otel_values in Ruby
 static ID at_parent_span_id_id; // id of :@parent_span_id in Ruby
 static ID at_datadog_trace_id;  // id of :@datadog_trace in Ruby
 
-#ifndef NO_GVL_INSTRUMENTATION
-static rb_internal_thread_specific_key_t per_thread_gvl_waiting_timestamp_key;
-#endif
-
 // This is used by `thread_context_collector_on_gvl_running`. Because when that method gets called we're not sure if
 // it's safe to access the state of the thread context collector, we store this setting as a global value. This does
 // mean this setting is shared among all thread context collectors, and thus it's "last writer wins".
@@ -320,7 +315,7 @@ void collectors_thread_context_init(VALUE profiling_module) {
 
   #ifndef NO_GVL_INSTRUMENTATION
     // This will raise if Ruby already ran out of thread-local keys
-    per_thread_gvl_waiting_timestamp_key = rb_internal_thread_specific_key_create();
+    gvl_profiling_init();
   #endif
 
   gc_profiling_init();
@@ -1019,7 +1014,7 @@ static void initialize_context(VALUE thread, struct per_thread_context *thread_c
     // (Setting this is potentially a race, but what we want is to avoid _stale_ data, so
     // if this gets set concurrently with context initialization, then such a value will belong
     // to the current profiler instance, so that's OK)
-    rb_internal_thread_specific_set(thread, per_thread_gvl_waiting_timestamp_key, (void *) GVL_WAITING_ENABLED_EMPTY);
+    gvl_profiling_state_thread_object_set(thread, GVL_WAITING_ENABLED_EMPTY);
   #endif
 }
 
@@ -1083,7 +1078,7 @@ static int per_thread_context_as_ruby_hash(st_data_t key_thread, st_data_t value
     ID2SYM(rb_intern("gc_tracking.wall_time_at_start_ns")),  /* => */ LONG2NUM(thread_context->gc_tracking.wall_time_at_start_ns),
 
     #ifndef NO_GVL_INSTRUMENTATION
-      ID2SYM(rb_intern("gvl_waiting_at")), /* => */ LONG2NUM((intptr_t) rb_internal_thread_specific_get(thread, per_thread_gvl_waiting_timestamp_key)),
+      ID2SYM(rb_intern("gvl_waiting_at")), /* => */ LONG2NUM(gvl_profiling_state_thread_object_get(thread)),
     #endif
   };
   for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(context_as_hash, arguments[i], arguments[i+1]);
@@ -1558,7 +1553,7 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self
 
 #ifndef NO_GVL_INSTRUMENTATION
   // This function can get called from outside the GVL and even on non-main Ractors
-  void thread_context_collector_on_gvl_waiting(VALUE thread) {
+  void thread_context_collector_on_gvl_waiting(gvl_profiling_thread thread) {
     // Because this function gets called from a thread that is NOT holding the GVL, we avoid touching the
     // per-thread context directly.
     //
@@ -1567,19 +1562,19 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self
     //
     // Also, this function can get called on the non-main Ractor. We deal with this by checking if the value in the context
     // is non-zero, since only `initialize_context` ever sets the value from 0 to non-zero for threads it sees.
-    intptr_t thread_being_profiled = (intptr_t) rb_internal_thread_specific_get(thread, per_thread_gvl_waiting_timestamp_key);
+    intptr_t thread_being_profiled = gvl_profiling_state_get(thread);
     if (!thread_being_profiled) return;
 
     long current_monotonic_wall_time_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE);
     if (current_monotonic_wall_time_ns <= 0 || current_monotonic_wall_time_ns > GVL_WAITING_ENABLED_EMPTY) return;
 
-    rb_internal_thread_specific_set(thread, per_thread_gvl_waiting_timestamp_key, (void *) current_monotonic_wall_time_ns);
+    gvl_profiling_state_set(thread, current_monotonic_wall_time_ns);
   }
 
   // This function can get called from outside the GVL and even on non-main Ractors
   __attribute__((warn_unused_result))
-  bool thread_context_collector_on_gvl_running_with_threshold(VALUE thread, uint32_t waiting_for_gvl_threshold_ns) {
-    intptr_t gvl_waiting_at = (intptr_t) rb_internal_thread_specific_get(thread, per_thread_gvl_waiting_timestamp_key);
+  bool thread_context_collector_on_gvl_running_with_threshold(gvl_profiling_thread thread, uint32_t waiting_for_gvl_threshold_ns) {
+    intptr_t gvl_waiting_at = gvl_profiling_state_get(thread);
 
     // Thread was not being profiled / not waiting on gvl
     if (gvl_waiting_at == 0 || gvl_waiting_at == GVL_WAITING_ENABLED_EMPTY) return false;
@@ -1595,17 +1590,17 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self
       // We flip the gvl_waiting_at to negative to mark that the thread is now running and no longer waiting
       intptr_t gvl_waiting_at_is_now_running = -gvl_waiting_at;
 
-      rb_internal_thread_specific_set(thread, per_thread_gvl_waiting_timestamp_key, (void *) gvl_waiting_at_is_now_running);
+      gvl_profiling_state_set(thread, gvl_waiting_at_is_now_running);
     } else {
       // We decided not to sample. Let's mark the thread back to the initial "enabled but empty" state
-      rb_internal_thread_specific_set(thread, per_thread_gvl_waiting_timestamp_key, (void *) GVL_WAITING_ENABLED_EMPTY);
+      gvl_profiling_state_set(thread, GVL_WAITING_ENABLED_EMPTY);
     }
 
     return should_sample;
   }
 
   __attribute__((warn_unused_result))
-  bool thread_context_collector_on_gvl_running(VALUE thread) {
+  bool thread_context_collector_on_gvl_running(gvl_profiling_thread thread) {
     return thread_context_collector_on_gvl_running_with_threshold(thread, global_waiting_for_gvl_threshold_ns);
   }
 
@@ -1645,7 +1640,7 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self
 
     struct per_thread_context *thread_context = get_or_create_context_for(current_thread, state);
 
-    intptr_t gvl_waiting_at = (intptr_t) rb_internal_thread_specific_get(current_thread, per_thread_gvl_waiting_timestamp_key);
+    intptr_t gvl_waiting_at = gvl_profiling_state_thread_object_get(current_thread);
 
     if (gvl_waiting_at >= 0) {
       // @ivoanjo: I'm not sure if this can ever happen. This means that we're not on the same thread
@@ -1688,7 +1683,7 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self
     sampling_buffer* sampling_buffer,
     long current_cpu_time_ns
   ) {
-    intptr_t gvl_waiting_at = (intptr_t) rb_internal_thread_specific_get(thread_being_sampled, per_thread_gvl_waiting_timestamp_key);
+    intptr_t gvl_waiting_at = gvl_profiling_state_thread_object_get(thread_being_sampled);
 
     bool is_gvl_waiting_state = gvl_waiting_at != 0 && gvl_waiting_at != GVL_WAITING_ENABLED_EMPTY;
 
@@ -1738,7 +1733,7 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self
 
     if (gvl_waiting_at < 0) {
       // Negative means the waiting for GVL just ended, so we clear the state, so next samples no longer represent waiting
-      rb_internal_thread_specific_set(thread_being_sampled, per_thread_gvl_waiting_timestamp_key, (void *) GVL_WAITING_ENABLED_EMPTY);
+      gvl_profiling_state_thread_object_set(thread_being_sampled, GVL_WAITING_ENABLED_EMPTY);
     }
 
     long gvl_waiting_started_wall_time_ns = labs(gvl_waiting_at);
@@ -1779,21 +1774,21 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self
   static VALUE _native_on_gvl_waiting(DDTRACE_UNUSED VALUE self, VALUE thread) {
     ENFORCE_THREAD(thread);
 
-    thread_context_collector_on_gvl_waiting(thread);
+    thread_context_collector_on_gvl_waiting(thread_from_thread_object(thread));
     return Qnil;
   }
 
   static VALUE _native_gvl_waiting_at_for(DDTRACE_UNUSED VALUE self, VALUE thread) {
     ENFORCE_THREAD(thread);
 
-    intptr_t gvl_waiting_at = (intptr_t) rb_internal_thread_specific_get(thread, per_thread_gvl_waiting_timestamp_key);
+    intptr_t gvl_waiting_at = gvl_profiling_state_thread_object_get(thread);
     return LONG2NUM(gvl_waiting_at);
   }
 
   static VALUE _native_on_gvl_running(DDTRACE_UNUSED VALUE self, VALUE thread) {
     ENFORCE_THREAD(thread);
 
-    return thread_context_collector_on_gvl_running(thread) ? Qtrue : Qfalse;
+    return thread_context_collector_on_gvl_running(thread_from_thread_object(thread)) ? Qtrue : Qfalse;
   }
 
   static VALUE _native_sample_after_gvl_running(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread) {
diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.h b/ext/datadog_profiling_native_extension/collectors_thread_context.h
index 52ce051c74b..1e34a5450d8 100644
--- a/ext/datadog_profiling_native_extension/collectors_thread_context.h
+++ b/ext/datadog_profiling_native_extension/collectors_thread_context.h
@@ -3,6 +3,8 @@
 #include <ruby.h>
 #include <stdbool.h>
 
+#include "gvl_profiling_helper.h"
+
 void thread_context_collector_sample(
   VALUE self_instance,
   long current_monotonic_wall_time_ns,
@@ -14,6 +16,6 @@ VALUE thread_context_collector_sample_after_gc(VALUE self_instance);
 void thread_context_collector_on_gc_start(VALUE self_instance);
 __attribute__((warn_unused_result)) bool thread_context_collector_on_gc_finish(VALUE self_instance);
 VALUE enforce_thread_context_collector_instance(VALUE object);
-void thread_context_collector_on_gvl_waiting(VALUE thread);
-__attribute__((warn_unused_result)) bool thread_context_collector_on_gvl_running(VALUE thread);
+void thread_context_collector_on_gvl_waiting(gvl_profiling_thread thread);
+__attribute__((warn_unused_result)) bool thread_context_collector_on_gvl_running(gvl_profiling_thread thread);
 VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance);
diff --git a/ext/datadog_profiling_native_extension/gvl_profiling_helper.c b/ext/datadog_profiling_native_extension/gvl_profiling_helper.c
new file mode 100644
index 00000000000..7e1f992a86e
--- /dev/null
+++ b/ext/datadog_profiling_native_extension/gvl_profiling_helper.c
@@ -0,0 +1,14 @@
+#include <ruby.h>
+#include <ruby/thread.h>
+#include "gvl_profiling_helper.h"
+
+#if !defined(NO_GVL_INSTRUMENTATION) // Ruby 3.3+
+
+rb_internal_thread_specific_key_t gvl_waiting_tls_key;
+
+void gvl_profiling_init(void) {
+  gvl_waiting_tls_key = rb_internal_thread_specific_key_create();
+}
+
+#endif
+
diff --git a/ext/datadog_profiling_native_extension/gvl_profiling_helper.h b/ext/datadog_profiling_native_extension/gvl_profiling_helper.h
new file mode 100644
index 00000000000..3764c352cd7
--- /dev/null
+++ b/ext/datadog_profiling_native_extension/gvl_profiling_helper.h
@@ -0,0 +1,43 @@
+#pragma once
+
+// This helper is used by the Datadog::Profiling::Collectors::ThreadContext to store data used when profiling the GVL.
+// It's tested through that class' interfaces.
+// ---
+
+#include "extconf.h"
+
+#if !defined(NO_GVL_INSTRUMENTATION) // Ruby 3.3+
+
+#include <ruby.h>
+#include <ruby/thread.h>
+#include "datadog_ruby_common.h"
+
+typedef struct { VALUE thread; } gvl_profiling_thread;
+extern rb_internal_thread_specific_key_t gvl_waiting_tls_key;
+
+void gvl_profiling_init(void);
+
+static inline gvl_profiling_thread thread_from_thread_object(VALUE thread) {
+  return (gvl_profiling_thread) {.thread = thread};
+}
+
+static inline gvl_profiling_thread thread_from_event(const rb_internal_thread_event_data_t *event_data) {
+  return thread_from_thread_object(event_data->thread);
+}
+
+static inline intptr_t gvl_profiling_state_get(gvl_profiling_thread thread) {
+  return (intptr_t) rb_internal_thread_specific_get(thread.thread, gvl_waiting_tls_key);
+}
+
+static inline void gvl_profiling_state_set(gvl_profiling_thread thread, intptr_t value) {
+  rb_internal_thread_specific_set(thread.thread, gvl_waiting_tls_key, (void *) value);
+}
+
+static inline intptr_t gvl_profiling_state_thread_object_get(VALUE thread) {
+  return gvl_profiling_state_get(thread_from_thread_object(thread));
+}
+
+static inline void gvl_profiling_state_thread_object_set(VALUE thread, intptr_t value) {
+  gvl_profiling_state_set(thread_from_thread_object(thread), value);
+}
+#endif
diff --git a/spec/datadog/profiling/collectors/thread_context_spec.rb b/spec/datadog/profiling/collectors/thread_context_spec.rb
index 424cc9d1265..e36dfc08cea 100644
--- a/spec/datadog/profiling/collectors/thread_context_spec.rb
+++ b/spec/datadog/profiling/collectors/thread_context_spec.rb
@@ -40,9 +40,9 @@
   let(:endpoint_collection_enabled) { true }
   let(:timeline_enabled) { false }
   let(:allocation_type_enabled) { true }
-  # This mirrors the use of INTPTR_MAX for GVL_WAITING_ENABLED_EMPTY in the native code; it may need adjusting if we ever
+  # This mirrors the use of RUBY_FIXNUM_MAX for GVL_WAITING_ENABLED_EMPTY in the native code; it may need adjusting if we ever
   # want to support more platforms
-  let(:gvl_waiting_enabled_empty_magic_value) { 2**63 - 1 }
+  let(:gvl_waiting_enabled_empty_magic_value) { 2**62 - 1 }
   let(:waiting_for_gvl_threshold_ns) { 222_333_444 }
 
   subject(:cpu_and_wall_time_collector) do

From 98f968b27d4926daed66e9cd60ed19c6c21b9306 Mon Sep 17 00:00:00 2001
From: Ivo Anjo <ivo.anjo@datadoghq.com>
Date: Fri, 20 Sep 2024 16:43:56 +0100
Subject: [PATCH 2/6] [PROF-10588] Support GVL profiling preview on Ruby 3.2

**What does this PR do?**

This PR builds atop #3929 that added support for GVL profiling for
Ruby 3.3+ and makes GVL profiling also work for Ruby 3.2 .

Supporting GVL profiling on Ruby 3.2 needed special additional work.
That's because while in Ruby 3.2 we have the GVL instrumentation API
giving us the events we need to profile the GVL, we're missing:

1. Getting the Ruby thread `VALUE` as an argument in GVL instrumentation
   API events
2. The `rb_internal_thread_specific` API that allows us to attach in
   a thread-safe way data to Ruby thread objects

Both 1 and 2 were only introduced in Ruby 3.3, and our implementation
of GVL profiling relies/relied on them.

This PR... reimplements 1 & 2 in an alternative way, allowing us to
keep our existing design for 3.3+, while also supporting the older
Ruby version.

I've split it into two commits:
 i. Abstracting access and management of 1 & 2 into a new set of files
    (`gvl_profiling_helper.c`/`gvl_profiling_helper.h`). These new files
    are zero-overhead abstractions for most situations.
ii. Implementing 1 & 2 for Ruby 3.2.

**Motivation:**

We believe GVL profiling is quite an important observability feature
for Ruby, and thus we want to support it on all Ruby versions that
provide the GVL instrumentation API.

**Additional Notes:**

To solve 1, we're using native level thread-locals (GCC's
`__thread`) to keep a pointer to the underlying Ruby `rb_thread_t`
structure.

This is more complex than than "just keep it on a thread-local"
because:

a) Ruby reuses native threads. When a Ruby thread dies, Ruby keeps the
   underlying native thread around for a bit, and if another Ruby
   thread is born very quickly after the previous one, Ruby
   will reuse the native thread and attach it to the new Ruby thread.

   To avoid incorrectly reusing the thread-locals, we install an event
   hook on Ruby thread start, and make sure to clean any native
   thread-locals when a new thread stats.

b) Some of the GVL instrumentation API events are emitted while the
   thread does not have the GVL and so we need to be careful when we can
   and cannot read VM information.

   Thus, we only initialize the thread-local during the
   `RUBY_INTERNAL_THREAD_EVENT_RESUMED` which is emitted while the
   thread owns the GVL.

c) Since we don't get the current thread in events, we need to get a
   bit... creative. Thus, what we do is in
   `RUBY_INTERNAL_THREAD_EVENT_RESUMED`, because we know the current
   thread MUST own the GVL, we read from the internal Ruby VM state
   which thread is the GVL owner to find the info we need.

With a + b + c together we are able to keep a pointer to the underlying
`rb_thread_t` up-to-date in a native thread local, thus replacing the
need to get a `VALUE thread` as an argument.

To solve 2, we rely on an important observation: there's a
`VALUE stat_insn_usage` field inside `rb_thread_t` that's unused and
seems to have effectively been forgotten about.

There's nowhere in the VM code that's writing or reading it (other
than marking it for GC), and not even git history reveals a time
where this field was used. I could not find any other references to
this field anywhere else. Thus, we make use of this field to store
the information we need, as a replacement for
`rb_internal_thread_specific`.

Since presumably Ruby 3.2 will never see this field either removed or used
during its remaining maintenance release period this should work fine,
and we have a nice clean solution for 3.3+.

**How to test the change?**

Happily, with the changes on this PR, the existing test coverage
we added for GVL profiling on 3.3 is also green on 3.2! :)
---
 .../collectors_cpu_and_wall_time_worker.c     | 14 +++-
 .../extconf.rb                                | 10 ++-
 .../gvl_profiling_helper.c                    | 48 +++++++++--
 .../gvl_profiling_helper.h                    | 82 +++++++++++++------
 .../private_vm_api_access.c                   | 51 ++++++++++++
 lib/datadog/profiling/component.rb            |  4 +-
 .../cpu_and_wall_time_worker_spec.rb          |  2 +-
 .../collectors/thread_context_spec.rb         |  2 +-
 spec/datadog/profiling/component_spec.rb      |  8 +-
 spec/datadog/profiling/spec_helper.rb         |  4 +-
 10 files changed, 179 insertions(+), 46 deletions(-)

diff --git a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
index 5511c7c6bde..c1b872bf7ef 100644
--- a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
+++ b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
@@ -804,6 +804,10 @@ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) {
 
   if (state->gvl_profiling_enabled) {
     #ifndef NO_GVL_INSTRUMENTATION
+      #ifdef USE_GVL_PROFILING_3_2_WORKAROUNDS
+        gvl_profiling_state_thread_tracking_workaround();
+      #endif
+
       state->gvl_profiling_hook = rb_internal_thread_add_event_hook(
         on_gvl_event,
         (
@@ -1315,11 +1319,19 @@ static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self) {
       // it tags threads it's tracking, so if a thread is tagged then by definition we know that thread belongs to the main
       // Ractor. Thus, if we really really wanted to access the state, we could do it after making sure we're on the correct Ractor.
 
+      #ifdef USE_GVL_PROFILING_3_2_WORKAROUNDS
+        target_thread = gvl_profiling_state_maybe_initialize();
+      #endif
+
       bool should_sample = thread_context_collector_on_gvl_running(target_thread);
 
       if (should_sample) {
         // should_sample is only true if a thread belongs to the main Ractor, so we're good to go
-        rb_postponed_job_trigger(after_gvl_running_from_postponed_job_handle);
+        #ifndef NO_POSTPONED_TRIGGER
+          rb_postponed_job_trigger(after_gvl_running_from_postponed_job_handle);
+        #else
+          rb_postponed_job_register_one(0, after_gvl_running_from_postponed_job, NULL);
+        #endif
       }
     } else {
       // This is a very delicate time and it's hard for us to raise an exception so let's at least complain to stderr
diff --git a/ext/datadog_profiling_native_extension/extconf.rb b/ext/datadog_profiling_native_extension/extconf.rb
index 09ac833bd1d..5f83c6dad87 100644
--- a/ext/datadog_profiling_native_extension/extconf.rb
+++ b/ext/datadog_profiling_native_extension/extconf.rb
@@ -131,10 +131,6 @@ def skip_building_extension!(reason)
 
 have_func "malloc_stats"
 
-# On older Rubies, there was no GVL instrumentation API and APIs created to support it
-# TODO: We can probably support Ruby 3.2 as well here, but we haven't done that work yet
-$defs << "-DNO_GVL_INSTRUMENTATION" if RUBY_VERSION < "3.3"
-
 # On older Rubies, rb_postponed_job_preregister/rb_postponed_job_trigger did not exist
 $defs << "-DNO_POSTPONED_TRIGGER" if RUBY_VERSION < "3.3"
 
@@ -147,6 +143,12 @@ def skip_building_extension!(reason)
 # On older Rubies, some of the Ractor internal APIs were directly accessible
 $defs << "-DUSE_RACTOR_INTERNAL_APIS_DIRECTLY" if RUBY_VERSION < "3.3"
 
+# On older Rubies, there was no GVL instrumentation API and APIs created to support it
+$defs << "-DNO_GVL_INSTRUMENTATION" if RUBY_VERSION < "3.2"
+
+# Supporting GVL instrumentation on 3.2 needs some workarounds
+$defs << "-DUSE_GVL_PROFILING_3_2_WORKAROUNDS" if RUBY_VERSION.start_with?("3.2")
+
 # On older Rubies, there was no struct rb_native_thread. See private_vm_api_acccess.c for details.
 $defs << "-DNO_RB_NATIVE_THREAD" if RUBY_VERSION < "3.2"
 
diff --git a/ext/datadog_profiling_native_extension/gvl_profiling_helper.c b/ext/datadog_profiling_native_extension/gvl_profiling_helper.c
index 7e1f992a86e..e07acb49854 100644
--- a/ext/datadog_profiling_native_extension/gvl_profiling_helper.c
+++ b/ext/datadog_profiling_native_extension/gvl_profiling_helper.c
@@ -2,13 +2,49 @@
 #include <ruby/thread.h>
 #include "gvl_profiling_helper.h"
 
-#if !defined(NO_GVL_INSTRUMENTATION) // Ruby 3.3+
+#if !defined(NO_GVL_INSTRUMENTATION) && !defined(USE_GVL_PROFILING_3_2_WORKAROUNDS) // Ruby 3.3+
+  rb_internal_thread_specific_key_t gvl_waiting_tls_key;
 
-rb_internal_thread_specific_key_t gvl_waiting_tls_key;
-
-void gvl_profiling_init(void) {
-  gvl_waiting_tls_key = rb_internal_thread_specific_key_create();
-}
+  void gvl_profiling_init(void) {
+    gvl_waiting_tls_key = rb_internal_thread_specific_key_create();
+  }
 
 #endif
 
+#ifdef USE_GVL_PROFILING_3_2_WORKAROUNDS // Ruby 3.2
+  __thread gvl_profiling_thread gvl_waiting_tls;
+  static bool gvl_profiling_state_thread_tracking_workaround_installed = false;
+
+  static void on_thread_start(
+    DDTRACE_UNUSED rb_event_flag_t _unused1,
+    DDTRACE_UNUSED const rb_internal_thread_event_data_t *_unused2,
+    DDTRACE_UNUSED void *_unused3
+  ) {
+    gvl_waiting_tls = (gvl_profiling_thread) {.thread = NULL};
+  }
+
+  // Hack: We're using the gvl_waiting_tls native thread-local to store per-thread information. Unfortunately, Ruby puts a big hole
+  // in our plan because it reuses native threads -- specifically, in Ruby 3.2, native threads are still 1:1 to Ruby
+  // threads (M:N wasn't a thing yet) BUT once a Ruby thread dies, the VM will keep the native thread around for a
+  // bit, and if another Ruby thread starts right after, Ruby will reuse the native thread, rather than create a new one.
+  //
+  // This will mean that the new Ruby thread will still have the same native thread-local data that we set on the
+  // old thread. For the purposes of our tracking, where we're keeping a pointer to the current thread object in
+  // thread-local storage **this is disastrous** since it means we'll be pointing at the wrong thread (and its
+  // memory may have been freed or reused since!)
+  //
+  // To work around this issue, once GVL profiling is enabled, we install an event hook on thread start
+  // events that clears the thread-local data. This guarantees that there will be no stale data -- any existing
+  // data will be cleared at thread start.
+  //
+  // Note that once installed, this event hook becomes permanent -- stopping the profiler does not stop this event
+  // hook, unlike all others. This is because we can't afford to miss any thread start events while the
+  // profiler is stopped (e.g. during reconfiguration) as that would mean stale data once the profiler starts again.
+  void gvl_profiling_state_thread_tracking_workaround(void) {
+    if (gvl_profiling_state_thread_tracking_workaround_installed) return;
+
+    rb_internal_thread_add_event_hook(on_thread_start, RUBY_INTERNAL_THREAD_EVENT_STARTED, NULL);
+
+    gvl_profiling_state_thread_tracking_workaround_installed = true;
+  }
+#endif
diff --git a/ext/datadog_profiling_native_extension/gvl_profiling_helper.h b/ext/datadog_profiling_native_extension/gvl_profiling_helper.h
index 3764c352cd7..071a45151d2 100644
--- a/ext/datadog_profiling_native_extension/gvl_profiling_helper.h
+++ b/ext/datadog_profiling_native_extension/gvl_profiling_helper.h
@@ -6,38 +6,70 @@
 
 #include "extconf.h"
 
-#if !defined(NO_GVL_INSTRUMENTATION) // Ruby 3.3+
+#if !defined(NO_GVL_INSTRUMENTATION) && !defined(USE_GVL_PROFILING_3_2_WORKAROUNDS) // Ruby 3.3+
+  #include <ruby.h>
+  #include <ruby/thread.h>
+  #include "datadog_ruby_common.h"
 
-#include <ruby.h>
-#include <ruby/thread.h>
-#include "datadog_ruby_common.h"
+  typedef struct { VALUE thread; } gvl_profiling_thread;
+  extern rb_internal_thread_specific_key_t gvl_waiting_tls_key;
 
-typedef struct { VALUE thread; } gvl_profiling_thread;
-extern rb_internal_thread_specific_key_t gvl_waiting_tls_key;
+  void gvl_profiling_init(void);
 
-void gvl_profiling_init(void);
+  static inline gvl_profiling_thread thread_from_thread_object(VALUE thread) {
+    return (gvl_profiling_thread) {.thread = thread};
+  }
 
-static inline gvl_profiling_thread thread_from_thread_object(VALUE thread) {
-  return (gvl_profiling_thread) {.thread = thread};
-}
+  static inline gvl_profiling_thread thread_from_event(const rb_internal_thread_event_data_t *event_data) {
+    return thread_from_thread_object(event_data->thread);
+  }
 
-static inline gvl_profiling_thread thread_from_event(const rb_internal_thread_event_data_t *event_data) {
-  return thread_from_thread_object(event_data->thread);
-}
+  static inline intptr_t gvl_profiling_state_get(gvl_profiling_thread thread) {
+    return (intptr_t) rb_internal_thread_specific_get(thread.thread, gvl_waiting_tls_key);
+  }
 
-static inline intptr_t gvl_profiling_state_get(gvl_profiling_thread thread) {
-  return (intptr_t) rb_internal_thread_specific_get(thread.thread, gvl_waiting_tls_key);
-}
+  static inline void gvl_profiling_state_set(gvl_profiling_thread thread, intptr_t value) {
+    rb_internal_thread_specific_set(thread.thread, gvl_waiting_tls_key, (void *) value);
+  }
+#endif
+
+#ifdef USE_GVL_PROFILING_3_2_WORKAROUNDS // Ruby 3.2
+  typedef struct { void *thread; } gvl_profiling_thread;
+  extern __thread gvl_profiling_thread gvl_waiting_tls;
+
+  static inline void gvl_profiling_init(void) { }
+
+  // This header gets included in private_vm_access.c which can't include datadog_ruby_common.h so we replicate this
+  // helper here
+  #ifdef __GNUC__
+    #define DDTRACE_UNUSED  __attribute__((unused))
+  #else
+    #define DDTRACE_UNUSED
+  #endif
 
-static inline void gvl_profiling_state_set(gvl_profiling_thread thread, intptr_t value) {
-  rb_internal_thread_specific_set(thread.thread, gvl_waiting_tls_key, (void *) value);
-}
+  // NOTE: This is a hack that relies on the knowledge that on Ruby 3.2 the
+  // RUBY_INTERNAL_THREAD_EVENT_READY and RUBY_INTERNAL_THREAD_EVENT_RESUMED events always get called on the thread they
+  // are about. Thus, we can use our thread local storage hack to get this data, even though the event doesn't include it.
+  static inline gvl_profiling_thread thread_from_event(DDTRACE_UNUSED const void *event_data) {
+    return gvl_waiting_tls;
+  }
+
+  void gvl_profiling_state_thread_tracking_workaround(void);
+  gvl_profiling_thread gvl_profiling_state_maybe_initialize(void);
+
+  // Implementing these on Ruby 3.2 requires access to private VM things, so the following methods are
+  // implemented in `private_vm_api_access.c`
+  gvl_profiling_thread thread_from_thread_object(VALUE thread);
+  intptr_t gvl_profiling_state_get(gvl_profiling_thread thread);
+  void gvl_profiling_state_set(gvl_profiling_thread thread, intptr_t value);
+#endif
 
-static inline intptr_t gvl_profiling_state_thread_object_get(VALUE thread) {
-  return gvl_profiling_state_get(thread_from_thread_object(thread));
-}
+#ifndef NO_GVL_INSTRUMENTATION // For all Rubies supporting GVL profiling (3.2+)
+  static inline intptr_t gvl_profiling_state_thread_object_get(VALUE thread) {
+    return gvl_profiling_state_get(thread_from_thread_object(thread));
+  }
 
-static inline void gvl_profiling_state_thread_object_set(VALUE thread, intptr_t value) {
-  gvl_profiling_state_set(thread_from_thread_object(thread), value);
-}
+  static inline void gvl_profiling_state_thread_object_set(VALUE thread, intptr_t value) {
+    gvl_profiling_state_set(thread_from_thread_object(thread), value);
+  }
 #endif
diff --git a/ext/datadog_profiling_native_extension/private_vm_api_access.c b/ext/datadog_profiling_native_extension/private_vm_api_access.c
index ea6b4c3b6e3..72d8877aea7 100644
--- a/ext/datadog_profiling_native_extension/private_vm_api_access.c
+++ b/ext/datadog_profiling_native_extension/private_vm_api_access.c
@@ -755,3 +755,54 @@ static inline int ddtrace_imemo_type(VALUE imemo) {
     return GET_VM()->objspace;
   }
 #endif
+
+#ifdef USE_GVL_PROFILING_3_2_WORKAROUNDS // Ruby 3.2
+  #include "gvl_profiling_helper.h"
+
+  gvl_profiling_thread thread_from_thread_object(VALUE thread) {
+    return (gvl_profiling_thread) {.thread = thread_struct_from_object(thread)};
+  }
+
+  // Hack: In Ruby 3.3+ we attach gvl profiling state to Ruby threads using the
+  // rb_internal_thread_specific_* APIs. These APIs did not exist on Ruby 3.2. On Ruby 3.2 we instead store the
+  // needed data inside the `rb_thread_t` structure, specifically in `stat_insn_usage` as a Ruby FIXNUM.
+  //
+  // Why `stat_insn_usage`? We needed some per-thread storage, and while looking at the Ruby VM sources I noticed
+  // that `stat_insn_usage` has been in `rb_thread_t` for a long time, but is not used anywhere in the VM
+  // code. There's a comment attached to it "/* statistics data for profiler */" but other than marking this
+  // field for GC, I could not find any place in the VM commit history or on GitHub where this has ever been used.
+  //
+  // Thus, since this hack is only for 3.2, which presumably will never see this field either removed or used
+  // during its remaining maintenance release period we... kinda take it for our own usage. It's ugly, I know...
+  intptr_t gvl_profiling_state_get(gvl_profiling_thread thread) {
+    if (thread.thread == NULL) return 0;
+
+    VALUE current_value = ((rb_thread_t *)thread.thread)->stat_insn_usage;
+    intptr_t result = current_value == Qnil ? 0 : FIX2LONG(current_value);
+    return result;
+  }
+
+  void gvl_profiling_state_set(gvl_profiling_thread thread, intptr_t value) {
+    if (thread.thread == NULL) return;
+    ((rb_thread_t *)thread.thread)->stat_insn_usage = LONG2FIX(value);
+  }
+
+  // Because Ruby 3.2 does not give us the current thread when calling the RUBY_INTERNAL_THREAD_EVENT_READY and
+  // RUBY_INTERNAL_THREAD_EVENT_RESUMED APIs, we need to figure out this info ourselves.
+  //
+  // Specifically, this method was created to be called from a RUBY_INTERNAL_THREAD_EVENT_RESUMED callback --
+  // when it's triggered, we know the thread the code gets executed on is holding the GVL, so we use this
+  // opportunity to initialize our thread-local value.
+  gvl_profiling_thread gvl_profiling_state_maybe_initialize(void) {
+    gvl_profiling_thread current_thread = gvl_waiting_tls;
+
+    if (current_thread.thread == NULL) {
+      // threads.sched.running is the thread currently holding the GVL, which when this gets executed is the
+      // current thread!
+      current_thread = (gvl_profiling_thread) {.thread = (void *) rb_current_ractor()->threads.sched.running};
+      gvl_waiting_tls = current_thread;
+    }
+
+    return current_thread;
+  }
+#endif
diff --git a/lib/datadog/profiling/component.rb b/lib/datadog/profiling/component.rb
index d99d5d9e42a..98848024caf 100644
--- a/lib/datadog/profiling/component.rb
+++ b/lib/datadog/profiling/component.rb
@@ -443,9 +443,9 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
       end
 
       private_class_method def self.enable_gvl_profiling?(settings)
-        if RUBY_VERSION < "3.3"
+        if RUBY_VERSION < "3.2"
           if settings.profiling.advanced.preview_gvl_enabled
-            Datadog.logger.warn("GVL profiling is currently not supported in Ruby < 3.3 and will not be enabled.")
+            Datadog.logger.warn("GVL profiling is currently not supported in Ruby < 3.2 and will not be enabled.")
           end
 
           return false
diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
index bd51b87dd27..449bc688daa 100644
--- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
+++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
@@ -149,7 +149,7 @@
     end
 
     context "when gvl_profiling_enabled is true on an unsupported Ruby" do
-      before { skip "Behavior does not apply to current Ruby version" if RUBY_VERSION >= "3.3." }
+      before { skip "Behavior does not apply to current Ruby version" if RUBY_VERSION >= "3.2." }
 
       let(:gvl_profiling_enabled) { true }
 
diff --git a/spec/datadog/profiling/collectors/thread_context_spec.rb b/spec/datadog/profiling/collectors/thread_context_spec.rb
index e36dfc08cea..20fbfb600fd 100644
--- a/spec/datadog/profiling/collectors/thread_context_spec.rb
+++ b/spec/datadog/profiling/collectors/thread_context_spec.rb
@@ -1833,7 +1833,7 @@ def sample_and_check(expected_state:)
         end
 
         context "on legacy Rubies" do
-          before { skip "Behavior does not apply to current Ruby version" if RUBY_VERSION >= "3.3." }
+          before { skip "Behavior does not apply to current Ruby version" if RUBY_VERSION >= "3.2." }
 
           it "is not set" do
             per_thread_context.each do |_thread, context|
diff --git a/spec/datadog/profiling/component_spec.rb b/spec/datadog/profiling/component_spec.rb
index 83b11f0c7ed..926345be5e1 100644
--- a/spec/datadog/profiling/component_spec.rb
+++ b/spec/datadog/profiling/component_spec.rb
@@ -598,8 +598,8 @@
           settings.profiling.advanced.gc_enabled = false
         end
 
-        context "on Ruby < 3.3" do
-          before { skip "Behavior does not apply to current Ruby version" if RUBY_VERSION >= "3.3." }
+        context "on Ruby < 3.2" do
+          before { skip "Behavior does not apply to current Ruby version" if RUBY_VERSION >= "3.2." }
 
           it "does not enable GVL profiling" do
             expect(Datadog::Profiling::Collectors::CpuAndWallTimeWorker)
@@ -615,8 +615,8 @@
           end
         end
 
-        context "on Ruby >= 3.3" do
-          before { skip "Behavior does not apply to current Ruby version" if RUBY_VERSION < "3.3." }
+        context "on Ruby >= 3.2" do
+          before { skip "Behavior does not apply to current Ruby version" if RUBY_VERSION < "3.2." }
 
           context "when timeline is enabled" do
             before { settings.profiling.advanced.timeline_enabled = true }
diff --git a/spec/datadog/profiling/spec_helper.rb b/spec/datadog/profiling/spec_helper.rb
index f8131391114..b3d7a3a97dd 100644
--- a/spec/datadog/profiling/spec_helper.rb
+++ b/spec/datadog/profiling/spec_helper.rb
@@ -123,8 +123,8 @@ def self.maybe_fix_label_range(key, value)
   end
 
   def skip_if_gvl_profiling_not_supported(testcase)
-    if RUBY_VERSION < "3.3."
-      testcase.skip "GVL profiling is only supported on Ruby >= 3.3"
+    if RUBY_VERSION < "3.2."
+      testcase.skip "GVL profiling is only supported on Ruby >= 3.2"
     end
   end
 end

From fb340425665a7a1c418a23b6eda2b66edbba7e7a Mon Sep 17 00:00:00 2001
From: Ivo Anjo <ivo.anjo@datadoghq.com>
Date: Mon, 23 Sep 2024 11:18:56 +0100
Subject: [PATCH 3/6] Fix rubocop lint

---
 spec/datadog/profiling/collectors/thread_context_spec.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spec/datadog/profiling/collectors/thread_context_spec.rb b/spec/datadog/profiling/collectors/thread_context_spec.rb
index 20fbfb600fd..ebbc02d27cc 100644
--- a/spec/datadog/profiling/collectors/thread_context_spec.rb
+++ b/spec/datadog/profiling/collectors/thread_context_spec.rb
@@ -40,8 +40,8 @@
   let(:endpoint_collection_enabled) { true }
   let(:timeline_enabled) { false }
   let(:allocation_type_enabled) { true }
-  # This mirrors the use of RUBY_FIXNUM_MAX for GVL_WAITING_ENABLED_EMPTY in the native code; it may need adjusting if we ever
-  # want to support more platforms
+  # This mirrors the use of RUBY_FIXNUM_MAX for GVL_WAITING_ENABLED_EMPTY in the native code; it may need adjusting if we
+  # ever want to support more platforms
   let(:gvl_waiting_enabled_empty_magic_value) { 2**62 - 1 }
   let(:waiting_for_gvl_threshold_ns) { 222_333_444 }
 

From f2bc8e91e7c451d1786af28cfff8c75c44381830 Mon Sep 17 00:00:00 2001
From: Ivo Anjo <ivo.anjo@datadoghq.com>
Date: Mon, 23 Sep 2024 11:20:52 +0100
Subject: [PATCH 4/6] Fix broken profiler build on Ruby < 3.2

---
 .../collectors_thread_context.h                          | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.h b/ext/datadog_profiling_native_extension/collectors_thread_context.h
index 1e34a5450d8..1204e4fe97f 100644
--- a/ext/datadog_profiling_native_extension/collectors_thread_context.h
+++ b/ext/datadog_profiling_native_extension/collectors_thread_context.h
@@ -16,6 +16,9 @@ VALUE thread_context_collector_sample_after_gc(VALUE self_instance);
 void thread_context_collector_on_gc_start(VALUE self_instance);
 __attribute__((warn_unused_result)) bool thread_context_collector_on_gc_finish(VALUE self_instance);
 VALUE enforce_thread_context_collector_instance(VALUE object);
-void thread_context_collector_on_gvl_waiting(gvl_profiling_thread thread);
-__attribute__((warn_unused_result)) bool thread_context_collector_on_gvl_running(gvl_profiling_thread thread);
-VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance);
+
+#ifndef NO_GVL_INSTRUMENTATION
+  void thread_context_collector_on_gvl_waiting(gvl_profiling_thread thread);
+  __attribute__((warn_unused_result)) bool thread_context_collector_on_gvl_running(gvl_profiling_thread thread);
+  VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance);
+#endif

From f04aab834c6e3d98522705b722bec85d512b1073 Mon Sep 17 00:00:00 2001
From: Ivo Anjo <ivo.anjo@datadoghq.com>
Date: Mon, 23 Sep 2024 11:27:35 +0100
Subject: [PATCH 5/6] Minor: Delay getting thread context until checking the
 GVL profiling state

This is a "just in case" change --
`thread_context_collector_sample_after_gvl_running_with_thread`
_should_ always be called with a thread that has finished Waiting for
the GVL.

...but just in case it ended up being called by the wrong thread, let's
not touch the context until we're sure we're in the right thread.
---
 .../collectors_thread_context.c                               | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c
index b4eb5a9b256..600be7a2232 100644
--- a/ext/datadog_profiling_native_extension/collectors_thread_context.c
+++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c
@@ -1638,8 +1638,6 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self
 
     if (!state->timeline_enabled) rb_raise(rb_eRuntimeError, "GVL profiling requires timeline to be enabled");
 
-    struct per_thread_context *thread_context = get_or_create_context_for(current_thread, state);
-
     intptr_t gvl_waiting_at = gvl_profiling_state_thread_object_get(current_thread);
 
     if (gvl_waiting_at >= 0) {
@@ -1650,6 +1648,8 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self
       return Qfalse;
     }
 
+    struct per_thread_context *thread_context = get_or_create_context_for(current_thread, state);
+
     // We don't actually account for cpu-time during Waiting for GVL. BUT, we may chose to push an
     // extra sample to represent the period prior to Waiting for GVL. To support that, we retrieve the current
     // cpu-time of the thread and let `update_metrics_and_sample` decide what to do with it.

From a010650fb3e7108faeca41edc572ad0c2dde3cb3 Mon Sep 17 00:00:00 2001
From: Ivo Anjo <ivo.anjo@datadoghq.com>
Date: Mon, 23 Sep 2024 11:30:22 +0100
Subject: [PATCH 6/6] Minor: Add TODO about tracking overhead

---
 .../collectors_thread_context.c                                 | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c
index 600be7a2232..0f74e6d9a36 100644
--- a/ext/datadog_profiling_native_extension/collectors_thread_context.c
+++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c
@@ -1655,6 +1655,8 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self
     // cpu-time of the thread and let `update_metrics_and_sample` decide what to do with it.
     long cpu_time_for_thread = cpu_time_now_ns(thread_context);
 
+    // TODO: Should we update the dynamic sampling rate overhead tracking with this sample as well?
+
     update_metrics_and_sample(
       state,
       /* thread_being_sampled: */ current_thread,