From 82a955882e4f8ed350b617726ae17a2b1e87de55 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 10 Nov 2022 09:51:18 +0000 Subject: [PATCH 1/6] Refactor hacky `exporter.flush` after process fork into `exporter.clear` Calling `exporter.flush` after fork to clear any existing profiling data works, but it's rather heavy-handed and can generate a few confusing log messages: ``` DEBUG (dd-trace-rb/lib/datadog/profiling/scheduler.rb:73:in `after_fork') Flushing exporter in child process #after_fork and discarding data DEBUG (dd-trace-rb/lib/datadog/profiling/stack_recorder.rb:25:in `serialize') Encoded profile covering 2022-11-10T09:42:22Z to 2022-11-10T09:42:22Z DEBUG (dd-trace-rb/lib/datadog/profiling/exporter.rb:52:in `flush') Skipped exporting profiling events as profile duration is below minimum ``` To improve this, I've cleared a new `Exporter#clear` that is still (currently) implemented using `flush`, but now the exporter knows what's going on more explicitly, and so it can implement `clear` in a better way. --- lib/datadog/profiling/exporter.rb | 7 ++++++ lib/datadog/profiling/scheduler.rb | 2 +- spec/datadog/profiling/exporter_spec.rb | 30 +++++++++++++++++++++++- spec/datadog/profiling/scheduler_spec.rb | 2 +- 4 files changed, 38 insertions(+), 3 deletions(-) diff --git a/lib/datadog/profiling/exporter.rb b/lib/datadog/profiling/exporter.rb index 831efe3dfa3..16fc16715b9 100644 --- a/lib/datadog/profiling/exporter.rb +++ b/lib/datadog/profiling/exporter.rb @@ -70,6 +70,13 @@ def can_flush? !duration_below_threshold?(last_flush_finish_at || created_at, time_provider.now.utc) end + def clear + # TODO: This is a really heavy-handed way of clearing the buffer + flush + + nil + end + private def duration_below_threshold?(start, finish) diff --git a/lib/datadog/profiling/scheduler.rb b/lib/datadog/profiling/scheduler.rb index 0933d438826..fafcdcc002a 100644 --- a/lib/datadog/profiling/scheduler.rb +++ b/lib/datadog/profiling/scheduler.rb @@ -70,7 +70,7 @@ def after_fork # Clear any existing profiling state. # We don't want the child process to report profiling data from its parent. Datadog.logger.debug('Flushing exporter in child process #after_fork and discarding data') - exporter.flush + exporter.clear end # Configure Workers::IntervalLoop to not report immediately when scheduler starts diff --git a/spec/datadog/profiling/exporter_spec.rb b/spec/datadog/profiling/exporter_spec.rb index f07c1df7d25..8187d608eb5 100644 --- a/spec/datadog/profiling/exporter_spec.rb +++ b/spec/datadog/profiling/exporter_spec.rb @@ -74,6 +74,18 @@ end end + describe '#clear' do + subject(:clear) { exporter.clear } + + it { is_expected.to be nil } + + it 'triggers pprof_recorder serialization' do + expect(pprof_recorder).to receive(:serialize) + + clear + end + end + describe '#can_flush?' do let(:time_provider) { class_double(Time) } let(:created_at) { start - 60 } @@ -102,7 +114,23 @@ end end - context 'when exporter has never flushed' do + context 'when exporter has been cleared before' do + before { exporter.clear } + + context 'when less than 1s has elapsed since last clear' do + before { expect(time_provider).to receive(:now).and_return(finish + 0.99).once } + + it { is_expected.to be false } + end + + context 'when 1s or more has elapsed since last clear' do + before { expect(time_provider).to receive(:now).and_return(finish + 1).once } + + it { is_expected.to be true } + end + end + + context 'when exporter has never flushed or cleared' do context 'when less than 1s has elapsed since exporter was created' do before { expect(time_provider).to receive(:now).and_return(created_at + 0.99).once } diff --git a/spec/datadog/profiling/scheduler_spec.rb b/spec/datadog/profiling/scheduler_spec.rb index d414d5ae0e7..09313e49910 100644 --- a/spec/datadog/profiling/scheduler_spec.rb +++ b/spec/datadog/profiling/scheduler_spec.rb @@ -90,7 +90,7 @@ subject(:after_fork) { scheduler.after_fork } it 'clears the buffer' do - expect(exporter).to receive(:flush) + expect(exporter).to receive(:clear) after_fork end end From 35e0bbe6d98f98234c95fdbd8131910ca2587af6 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 10 Nov 2022 09:58:41 +0000 Subject: [PATCH 2/6] Improve `Exporter#clear` by extracting only the operations actually needed The previous `flush` call was a placeholder, and we can now improve it. --- lib/datadog/profiling/exporter.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/datadog/profiling/exporter.rb b/lib/datadog/profiling/exporter.rb index 16fc16715b9..a47e78e56fc 100644 --- a/lib/datadog/profiling/exporter.rb +++ b/lib/datadog/profiling/exporter.rb @@ -72,7 +72,8 @@ def can_flush? def clear # TODO: This is a really heavy-handed way of clearing the buffer - flush + _, finish, _ = pprof_recorder.serialize + @last_flush_finish_at = finish nil end From ad10f75bb06d2358982051f8a8204af0d288d151 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 10 Nov 2022 10:38:07 +0000 Subject: [PATCH 3/6] Add `StackRecorder#clear` This will be used by the `Exporter` when needing to clear data without serializing it. --- .../stack_recorder.c | 16 ++ lib/datadog/profiling/stack_recorder.rb | 18 +++ spec/datadog/profiling/stack_recorder_spec.rb | 140 +++++++++++++----- 3 files changed, 140 insertions(+), 34 deletions(-) diff --git a/ext/ddtrace_profiling_native_extension/stack_recorder.c b/ext/ddtrace_profiling_native_extension/stack_recorder.c index 62082f070e3..4f7bf7e234d 100644 --- a/ext/ddtrace_profiling_native_extension/stack_recorder.c +++ b/ext/ddtrace_profiling_native_extension/stack_recorder.c @@ -174,6 +174,7 @@ static VALUE _native_is_slot_one_mutex_locked(DDTRACE_UNUSED VALUE _self, VALUE static VALUE _native_is_slot_two_mutex_locked(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance); static VALUE test_slot_mutex_state(VALUE recorder_instance, int slot); static ddog_Timespec time_now(); +static VALUE _native_clear(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance); void stack_recorder_init(VALUE profiling_module) { stack_recorder_class = rb_define_class_under(profiling_module, "StackRecorder", rb_cObject); @@ -191,6 +192,7 @@ void stack_recorder_init(VALUE profiling_module) { rb_define_alloc_func(stack_recorder_class, _native_new); rb_define_singleton_method(stack_recorder_class, "_native_serialize", _native_serialize, 1); + rb_define_singleton_method(stack_recorder_class, "_native_clear", _native_clear, 1); rb_define_singleton_method(testing_module, "_native_active_slot", _native_active_slot, 1); rb_define_singleton_method(testing_module, "_native_slot_one_mutex_locked?", _native_is_slot_one_mutex_locked, 1); rb_define_singleton_method(testing_module, "_native_slot_two_mutex_locked?", _native_is_slot_two_mutex_locked, 1); @@ -436,3 +438,17 @@ static ddog_Timespec time_now() { return (ddog_Timespec) {.seconds = current_time.tv_sec, .nanoseconds = (uint32_t) current_time.tv_nsec}; } + +static VALUE _native_clear(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance) { + struct stack_recorder_state *state; + TypedData_Get_Struct(recorder_instance, struct stack_recorder_state, &stack_recorder_typed_data, state); + + ddog_Timespec finish_timestamp = time_now(); + + ddog_Profile *profile = serializer_flip_active_and_inactive_slots(state, finish_timestamp); + if (!ddog_Profile_reset(profile, NULL /* start_time is optional */ )) { + return rb_ary_new_from_args(2, error_symbol, rb_str_new_cstr("Failed to reset profile")); + } + + return rb_ary_new_from_args(2, ok_symbol, ruby_time_from(finish_timestamp)); +} diff --git a/lib/datadog/profiling/stack_recorder.rb b/lib/datadog/profiling/stack_recorder.rb index 42b965759f8..9f1a542460f 100644 --- a/lib/datadog/profiling/stack_recorder.rb +++ b/lib/datadog/profiling/stack_recorder.rb @@ -34,6 +34,24 @@ def serialize end end + def clear + status, result = @no_concurrent_synchronize_mutex.synchronize { self.class._native_clear(self) } + + if status == :ok + finish_timestamp = result + + Datadog.logger.debug { "Cleared profile at #{finish_timestamp}" } + + finish_timestamp + else + error_message = result + + Datadog.logger.error("Failed to clear profiling data: #{error_message}") + + nil + end + end + # Used only for Ruby 2.2 which doesn't have the native `rb_time_timespec_new` API; called from native code def self.ruby_time_from(timespec_seconds, timespec_nanoseconds) Time.at(0).utc + timespec_seconds + (timespec_nanoseconds.to_r / 1_000_000_000) diff --git a/spec/datadog/profiling/stack_recorder_spec.rb b/spec/datadog/profiling/stack_recorder_spec.rb index b69d261e7dc..8d0fa3ad3c1 100644 --- a/spec/datadog/profiling/stack_recorder_spec.rb +++ b/spec/datadog/profiling/stack_recorder_spec.rb @@ -39,6 +39,41 @@ def slot_two_mutex_locked? end end + shared_examples_for 'locking behavior' do |operation| + context 'when slot one was the active slot' do + it 'sets slot two as the active slot' do + expect { stack_recorder.public_send(operation) }.to change { active_slot }.from(1).to(2) + end + + it 'locks the slot one mutex and keeps it locked' do + expect { stack_recorder.public_send(operation) }.to change { slot_one_mutex_locked? }.from(false).to(true) + end + + it 'unlocks the slot two mutex and keeps it unlocked' do + expect { stack_recorder.public_send(operation) }.to change { slot_two_mutex_locked? }.from(true).to(false) + end + end + + context 'when slot two was the active slot' do + before do + # Trigger operation once, so that active slots get flipped + stack_recorder.public_send(operation) + end + + it 'sets slot one as the active slot' do + expect { stack_recorder.public_send(operation) }.to change { active_slot }.from(2).to(1) + end + + it 'unlocks the slot one mutex and keeps it unlocked' do + expect { stack_recorder.public_send(operation) }.to change { slot_one_mutex_locked? }.from(true).to(false) + end + + it 'locks the slow two mutex and keeps it locked' do + expect { stack_recorder.public_send(operation) }.to change { slot_two_mutex_locked? }.from(false).to(true) + end + end + end + describe '#serialize' do subject(:serialize) { stack_recorder.serialize } @@ -61,40 +96,7 @@ def slot_two_mutex_locked? expect(message).to include finish.iso8601 end - describe 'locking behavior' do - context 'when slot one was the active slot' do - it 'sets slot two as the active slot' do - expect { serialize }.to change { active_slot }.from(1).to(2) - end - - it 'locks the slot one mutex and keeps it locked' do - expect { serialize }.to change { slot_one_mutex_locked? }.from(false).to(true) - end - - it 'unlocks the slot two mutex and keeps it unlocked' do - expect { serialize }.to change { slot_two_mutex_locked? }.from(true).to(false) - end - end - - context 'when slot two was the active slot' do - before do - # Trigger serialization once, so that active slots get flipped - stack_recorder.serialize - end - - it 'sets slot one as the active slot' do - expect { serialize }.to change { active_slot }.from(2).to(1) - end - - it 'unlocks the slot one mutex and keeps it unlocked' do - expect { serialize }.to change { slot_one_mutex_locked? }.from(true).to(false) - end - - it 'locks the slow two mutex and keeps it locked' do - expect { serialize }.to change { slot_two_mutex_locked? }.from(false).to(true) - end - end - end + include_examples 'locking behavior', :serialize context 'when the profile is empty' do it 'uses the current time as the start and finish time' do @@ -219,4 +221,74 @@ def sample_types_from(decoded_profile) end end end + + describe '#clear' do + subject(:clear) { stack_recorder.clear } + + it 'debug logs that clear was invoked' do + message = nil + + expect(Datadog.logger).to receive(:debug) do |&message_block| + message = message_block.call + end + + clear + + expect(message).to match(/Cleared profile/) + end + + include_examples 'locking behavior', :clear + + it 'uses the current time as the finish time' do + before_clear = Time.now.utc + finish = clear + after_clear = Time.now.utc + + expect(finish).to be_between(before_clear, after_clear) + end + + context 'when profile has a sample' do + let(:collectors_stack) { Datadog::Profiling::Collectors::Stack.new } + + let(:metric_values) { { 'cpu-time' => 123, 'cpu-samples' => 456, 'wall-time' => 789 } } + let(:labels) { { 'label_a' => 'value_a', 'label_b' => 'value_b' }.to_a } + + it 'makes the next calls to serialize return no data' do + # Add some data + Datadog::Profiling::Collectors::Stack::Testing + ._native_sample(Thread.current, stack_recorder, metric_values, labels, 400, false) + + # Sanity check: validate that data is there, to avoid the test passing because of other issues + sanity_check_samples = samples_from_pprof(stack_recorder.serialize.last) + expect(sanity_check_samples.size).to be 1 + + # Add some data, again + Datadog::Profiling::Collectors::Stack::Testing + ._native_sample(Thread.current, stack_recorder, metric_values, labels, 400, false) + + clear + + # Test twice in a row to validate that both profile slots are empty + expect(samples_from_pprof(stack_recorder.serialize.last)).to be_empty + expect(samples_from_pprof(stack_recorder.serialize.last)).to be_empty + end + end + + context 'when there is a failure during serialization' do + before do + allow(Datadog.logger).to receive(:error) + + # Real failures in serialization are hard to trigger, so we're using a mock failure instead + expect(described_class).to receive(:_native_clear).and_return([:error, 'test error message']) + end + + it { is_expected.to be nil } + + it 'logs an error message' do + expect(Datadog.logger).to receive(:error).with(/test error message/) + + clear + end + end + end end From d3698430e9876a1c30a2c7bce08e152ef6bf7a28 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 10 Nov 2022 10:47:28 +0000 Subject: [PATCH 4/6] Use `StackRecorder#clear` in `Exporter` As the `StackRecorder` implements `#clear` natively, no need to simulate it with a call to `#serialize`. I decided not to bother updating the `OldRecorder`, so for that class we'll still use the old behavior. --- lib/datadog/profiling/exporter.rb | 9 ++++--- spec/datadog/profiling/exporter_spec.rb | 32 ++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/lib/datadog/profiling/exporter.rb b/lib/datadog/profiling/exporter.rb index a47e78e56fc..b84c1ae8104 100644 --- a/lib/datadog/profiling/exporter.rb +++ b/lib/datadog/profiling/exporter.rb @@ -71,9 +71,12 @@ def can_flush? end def clear - # TODO: This is a really heavy-handed way of clearing the buffer - _, finish, _ = pprof_recorder.serialize - @last_flush_finish_at = finish + if pprof_recorder.respond_to?(:clear) + @last_flush_finish_at = pprof_recorder.clear + else # TODO: Remove this when the OldRecorder is retired and we can assume all recorders implement #clear + _, finish, = pprof_recorder.serialize + @last_flush_finish_at = finish + end nil end diff --git a/spec/datadog/profiling/exporter_spec.rb b/spec/datadog/profiling/exporter_spec.rb index 8187d608eb5..141008e51b7 100644 --- a/spec/datadog/profiling/exporter_spec.rb +++ b/spec/datadog/profiling/exporter_spec.rb @@ -79,10 +79,36 @@ it { is_expected.to be nil } - it 'triggers pprof_recorder serialization' do - expect(pprof_recorder).to receive(:serialize) + context 'when pprof_recorder does not support clear' do + let(:pprof_recorder) { instance_double(Datadog::Profiling::OldRecorder, serialize: pprof_recorder_serialize) } - clear + it 'triggers pprof_recorder serialization' do + expect(pprof_recorder).to receive(:serialize) + + clear + end + + it 'sets the last_flush_finish_at to the result of serialize' do + clear + + expect(exporter.send(:last_flush_finish_at)).to be finish + end + end + + context 'when pprof_recorder supports clear' do + let(:pprof_recorder) { instance_double(Datadog::Profiling::StackRecorder, clear: finish) } + + it 'triggers pprof_recorder clear' do + expect(pprof_recorder).to receive(:clear) + + clear + end + + it 'sets the last_flush_finish_at to the result of clear' do + clear + + expect(exporter.send(:last_flush_finish_at)).to be finish + end end end From 1f2fdf5b8d45fb43dfbd1fff21a66923036a6255 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Fri, 11 Nov 2022 10:54:29 +0000 Subject: [PATCH 5/6] Improve spec descriptions after PR feedback --- spec/datadog/profiling/stack_recorder_spec.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spec/datadog/profiling/stack_recorder_spec.rb b/spec/datadog/profiling/stack_recorder_spec.rb index 8d0fa3ad3c1..363d81dd63d 100644 --- a/spec/datadog/profiling/stack_recorder_spec.rb +++ b/spec/datadog/profiling/stack_recorder_spec.rb @@ -45,11 +45,11 @@ def slot_two_mutex_locked? expect { stack_recorder.public_send(operation) }.to change { active_slot }.from(1).to(2) end - it 'locks the slot one mutex and keeps it locked' do + it 'locks the slot one mutex' do expect { stack_recorder.public_send(operation) }.to change { slot_one_mutex_locked? }.from(false).to(true) end - it 'unlocks the slot two mutex and keeps it unlocked' do + it 'unlocks the slot two mutex' do expect { stack_recorder.public_send(operation) }.to change { slot_two_mutex_locked? }.from(true).to(false) end end @@ -64,11 +64,11 @@ def slot_two_mutex_locked? expect { stack_recorder.public_send(operation) }.to change { active_slot }.from(2).to(1) end - it 'unlocks the slot one mutex and keeps it unlocked' do + it 'unlocks the slot one mutex' do expect { stack_recorder.public_send(operation) }.to change { slot_one_mutex_locked? }.from(true).to(false) end - it 'locks the slow two mutex and keeps it locked' do + it 'locks the slot two mutex' do expect { stack_recorder.public_send(operation) }.to change { slot_two_mutex_locked? }.from(false).to(true) end end From 96acfb253f84fc78778f6ea62a26a5a591a6dc17 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Fri, 11 Nov 2022 10:57:39 +0000 Subject: [PATCH 6/6] Add note about slot flipping inside clear --- ext/ddtrace_profiling_native_extension/stack_recorder.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ext/ddtrace_profiling_native_extension/stack_recorder.c b/ext/ddtrace_profiling_native_extension/stack_recorder.c index 4f7bf7e234d..53a46c6b81d 100644 --- a/ext/ddtrace_profiling_native_extension/stack_recorder.c +++ b/ext/ddtrace_profiling_native_extension/stack_recorder.c @@ -445,6 +445,10 @@ static VALUE _native_clear(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance) ddog_Timespec finish_timestamp = time_now(); + // Why flip the slots as part of clearing? This makes clear behave exactly like serialize in terms of concurrency + // properties, even though right now there probably won't be any concurrency between calling clear and adding samples + // to the profile because we do not release the global VM lock while clearing. + // See also https://github.com/DataDog/dd-trace-rb/pull/2362/files#r1019659427 for more details. ddog_Profile *profile = serializer_flip_active_and_inactive_slots(state, finish_timestamp); if (!ddog_Profile_reset(profile, NULL /* start_time is optional */ )) { return rb_ary_new_from_args(2, error_symbol, rb_str_new_cstr("Failed to reset profile"));