diff --git a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c index 83b05b916d5..599b7747427 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +++ b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c @@ -85,6 +85,7 @@ static VALUE _native_initialize( static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr); static VALUE _native_sampling_loop(VALUE self, VALUE instance); static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance); +static VALUE stop(VALUE self_instance, VALUE optional_exception); static void install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *)); static void remove_sigprof_signal_handler(void); static void block_sigprof_signal_handler_from_running_in_current_thread(void); @@ -205,11 +206,25 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) { struct cpu_and_wall_time_worker_state *state; TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state); - if (active_sampler_owner_thread != Qnil && is_thread_alive(active_sampler_owner_thread)) { - rb_raise( - rb_eRuntimeError, - "Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread" - ); + if (active_sampler_owner_thread != Qnil) { + if (is_thread_alive(active_sampler_owner_thread)) { + rb_raise( + rb_eRuntimeError, + "Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread" + ); + } else { + // The previously active thread seems to have died without cleaning up after itself. + // In this case, we can still go ahead and start the profiler BUT we make sure to disable any existing GC tracepoint + // first as: + // a) If this is a new instance of the CpuAndWallTimeWorker, we don't want the tracepoint from the old instance + // being kept around + // b) If this is the same instance of the CpuAndWallTimeWorker if we call enable on a tracepoint that is already + // enabled, it will start firing more than once, see https://bugs.ruby-lang.org/issues/19114 for details. + + struct cpu_and_wall_time_worker_state *old_state; + TypedData_Get_Struct(active_sampler_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, old_state); + rb_tracepoint_disable(old_state->gc_tracepoint); + } } // This write to a global is thread-safe BECAUSE we're still holding on to the global VM lock at this point @@ -243,10 +258,18 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) { } static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance) { + return stop(self_instance, /* optional_exception: */ Qnil); +} + +static VALUE stop(VALUE self_instance, VALUE optional_exception) { struct cpu_and_wall_time_worker_state *state; TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state); state->should_run = false; + state->failure_exception = optional_exception; + + // Disable the GC tracepoint as soon as possible, so the VM doesn't keep on calling it + rb_tracepoint_disable(state->gc_tracepoint); return Qtrue; } @@ -368,18 +391,7 @@ static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused) { safely_call(cpu_and_wall_time_collector_sample, state->cpu_and_wall_time_collector_instance, instance); } -static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) { - struct cpu_and_wall_time_worker_state *state; - TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state); - - state->should_run = false; - state->failure_exception = exception; - - // Disable the GC tracepoint as soon as possible, so the VM doesn't keep on calling it - rb_tracepoint_disable(state->gc_tracepoint); - - return Qnil; -} +static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) { return stop(self_instance, exception); } // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec. // It SHOULD NOT be used for other purposes. diff --git a/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb b/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb index c3ad9581030..5d95700d4fb 100644 --- a/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb +++ b/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb @@ -30,7 +30,7 @@ def initialize( def start @start_stop_mutex.synchronize do - return if @worker_thread + return if @worker_thread && @worker_thread.alive? Datadog.logger.debug { "Starting thread for: #{self}" } @worker_thread = Thread.new do diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index fb5150367a4..c67dea31f70 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -178,6 +178,52 @@ current_thread_gc_samples.inject(0) { |sum, sample| sum + sample.fetch(:values).fetch(:'cpu-samples') } ).to be >= invoke_gc_times end + + context 'when the background thread dies without cleaning up (after Ruby forks)' do + it 'allows the CpuAndWallTimeWorker to be restarted' do + start + + expect_in_fork do + cpu_and_wall_time_worker.start + wait_until_running + end + end + + it 'allows a different instance of the CpuAndWallTimeWorker to be started' do + start + + expect_in_fork do + another_instance = described_class.new( + recorder: Datadog::Profiling::StackRecorder.new, + max_frames: 400, + tracer: nil, + gc_profiling_enabled: gc_profiling_enabled, + ) + another_instance.start + + try_wait_until(backoff: 0.01) { described_class::Testing._native_is_running?(another_instance) } + end + end + + it 'disables the existing gc_tracepoint before starting another CpuAndWallTimeWorker' do + start + + expect_in_fork do + another_instance = described_class.new( + recorder: Datadog::Profiling::StackRecorder.new, + max_frames: 400, + tracer: nil, + gc_profiling_enabled: gc_profiling_enabled, + ) + another_instance.start + + try_wait_until(backoff: 0.01) { described_class::Testing._native_is_running?(another_instance) } + + expect(described_class::Testing._native_gc_tracepoint(cpu_and_wall_time_worker)).to_not be_enabled + expect(described_class::Testing._native_gc_tracepoint(another_instance)).to be_enabled + end + end + end end describe 'Ractor safety' do