DataDog · ivoanjo · Nov 10, 2022 · Nov 9, 2022 · Nov 9, 2022 · Nov 9, 2022
@@ -85,6 +85,7 @@ static VALUE _native_initialize(
 static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
 static VALUE _native_sampling_loop(VALUE self, VALUE instance);
 static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance);
+static VALUE stop(VALUE self_instance, VALUE optional_exception);
 static void install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *));
 static void remove_sigprof_signal_handler(void);
 static void block_sigprof_signal_handler_from_running_in_current_thread(void);
@@ -205,11 +206,25 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
   struct cpu_and_wall_time_worker_state *state;
   TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
 
-  if (active_sampler_owner_thread != Qnil && is_thread_alive(active_sampler_owner_thread)) {
-    rb_raise(
-      rb_eRuntimeError,
-      "Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread"
-    );
+  if (active_sampler_owner_thread != Qnil) {
+    if (is_thread_alive(active_sampler_owner_thread)) {
+      rb_raise(
+        rb_eRuntimeError,
+        "Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread"
+      );
+    } else {
+      // The previously active thread seems to have died without cleaning up after itself.
+      // In this case, we can still go ahead and start the profiler BUT we make sure to disable any existing GC tracepoint
+      // first as:
+      // a) If this is a new instance of the CpuAndWallTimeWorker, we don't want the tracepoint from the old instance
+      //    being kept around
+      // b) If this is the same instance of the CpuAndWallTimeWorker if we call enable on a tracepoint that is already
+      //    enabled, it will start firing more than once, see https://bugs.ruby-lang.org/issues/19114 for details.
+
+      struct cpu_and_wall_time_worker_state *old_state;
+      TypedData_Get_Struct(active_sampler_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, old_state);
+      rb_tracepoint_disable(old_state->gc_tracepoint);
+    }
   }
 
   // This write to a global is thread-safe BECAUSE we're still holding on to the global VM lock at this point
@@ -243,10 +258,18 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
 }
 
 static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance) {
+  return stop(self_instance, /* optional_exception: */ Qnil);
+}
+
+static VALUE stop(VALUE self_instance, VALUE optional_exception) {
   struct cpu_and_wall_time_worker_state *state;
   TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
 
   state->should_run = false;
+  state->failure_exception = optional_exception;
+
+  // Disable the GC tracepoint as soon as possible, so the VM doesn't keep on calling it
+  rb_tracepoint_disable(state->gc_tracepoint);
 
   return Qtrue;
 }
@@ -368,18 +391,7 @@ static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused) {
   safely_call(cpu_and_wall_time_collector_sample, state->cpu_and_wall_time_collector_instance, instance);
 }
 
-static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) {
-  struct cpu_and_wall_time_worker_state *state;
-  TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
-
-  state->should_run = false;
-  state->failure_exception = exception;
-
-  // Disable the GC tracepoint as soon as possible, so the VM doesn't keep on calling it
-  rb_tracepoint_disable(state->gc_tracepoint);
-
-  return Qnil;
-}
+static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) { return stop(self_instance, exception); }
 
 // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
 // It SHOULD NOT be used for other purposes.

@@ -30,7 +30,7 @@ def initialize(
 
         def start
           @start_stop_mutex.synchronize do
-            return if @worker_thread
+            return if @worker_thread && @worker_thread.alive?
 
             Datadog.logger.debug { "Starting thread for: #{self}" }
             @worker_thread = Thread.new do

@@ -178,6 +178,52 @@
         current_thread_gc_samples.inject(0) { |sum, sample| sum + sample.fetch(:values).fetch(:'cpu-samples') }
       ).to be >= invoke_gc_times
     end
+
+    context 'when the background thread dies without cleaning up (after Ruby forks)' do
+      it 'allows the CpuAndWallTimeWorker to be restarted' do
+        start
+
+        expect_in_fork do
+          cpu_and_wall_time_worker.start
+          wait_until_running
+        end
+      end
+
+      it 'allows a different instance of the CpuAndWallTimeWorker to be started' do
+        start
+
+        expect_in_fork do
+          another_instance = described_class.new(
+            recorder: Datadog::Profiling::StackRecorder.new,
+            max_frames: 400,
+            tracer: nil,
+            gc_profiling_enabled: gc_profiling_enabled,
+          )
+          another_instance.start
+
+          try_wait_until(backoff: 0.01) { described_class::Testing._native_is_running?(another_instance) }
+        end
+      end
+
+      it 'disables the existing gc_tracepoint before starting another CpuAndWallTimeWorker' do
+        start
+
+        expect_in_fork do
+          another_instance = described_class.new(
+            recorder: Datadog::Profiling::StackRecorder.new,
+            max_frames: 400,
+            tracer: nil,
+            gc_profiling_enabled: gc_profiling_enabled,
+          )
+          another_instance.start
+
+          try_wait_until(backoff: 0.01) { described_class::Testing._native_is_running?(another_instance) }
+
+          expect(described_class::Testing._native_gc_tracepoint(cpu_and_wall_time_worker)).to_not be_enabled
+          expect(described_class::Testing._native_gc_tracepoint(another_instance)).to be_enabled
+        end
+      end
+    end
   end
 
   describe 'Ractor safety' do