Skip to content

Commit

Permalink
Merge pull request #2362 from DataDog/ivoanjo/better-profile-clearing
Browse files Browse the repository at this point in the history
Improve clearing of profile data after Ruby app forks
  • Loading branch information
ivoanjo authored Nov 16, 2022
2 parents 2731a04 + 96acfb2 commit f93d1b3
Show file tree
Hide file tree
Showing 7 changed files with 212 additions and 37 deletions.
20 changes: 20 additions & 0 deletions ext/ddtrace_profiling_native_extension/stack_recorder.c
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ static VALUE _native_is_slot_one_mutex_locked(DDTRACE_UNUSED VALUE _self, VALUE
static VALUE _native_is_slot_two_mutex_locked(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance);
static VALUE test_slot_mutex_state(VALUE recorder_instance, int slot);
static ddog_Timespec time_now();
static VALUE _native_clear(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance);

void stack_recorder_init(VALUE profiling_module) {
stack_recorder_class = rb_define_class_under(profiling_module, "StackRecorder", rb_cObject);
Expand All @@ -191,6 +192,7 @@ void stack_recorder_init(VALUE profiling_module) {
rb_define_alloc_func(stack_recorder_class, _native_new);

rb_define_singleton_method(stack_recorder_class, "_native_serialize", _native_serialize, 1);
rb_define_singleton_method(stack_recorder_class, "_native_clear", _native_clear, 1);
rb_define_singleton_method(testing_module, "_native_active_slot", _native_active_slot, 1);
rb_define_singleton_method(testing_module, "_native_slot_one_mutex_locked?", _native_is_slot_one_mutex_locked, 1);
rb_define_singleton_method(testing_module, "_native_slot_two_mutex_locked?", _native_is_slot_two_mutex_locked, 1);
Expand Down Expand Up @@ -436,3 +438,21 @@ static ddog_Timespec time_now() {

return (ddog_Timespec) {.seconds = current_time.tv_sec, .nanoseconds = (uint32_t) current_time.tv_nsec};
}

static VALUE _native_clear(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance) {
struct stack_recorder_state *state;
TypedData_Get_Struct(recorder_instance, struct stack_recorder_state, &stack_recorder_typed_data, state);

ddog_Timespec finish_timestamp = time_now();

// Why flip the slots as part of clearing? This makes clear behave exactly like serialize in terms of concurrency
// properties, even though right now there probably won't be any concurrency between calling clear and adding samples
// to the profile because we do not release the global VM lock while clearing.
// See also https://github.com/DataDog/dd-trace-rb/pull/2362/files#r1019659427 for more details.
ddog_Profile *profile = serializer_flip_active_and_inactive_slots(state, finish_timestamp);
if (!ddog_Profile_reset(profile, NULL /* start_time is optional */ )) {
return rb_ary_new_from_args(2, error_symbol, rb_str_new_cstr("Failed to reset profile"));
}

return rb_ary_new_from_args(2, ok_symbol, ruby_time_from(finish_timestamp));
}
11 changes: 11 additions & 0 deletions lib/datadog/profiling/exporter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,17 @@ def can_flush?
!duration_below_threshold?(last_flush_finish_at || created_at, time_provider.now.utc)
end

def clear
if pprof_recorder.respond_to?(:clear)
@last_flush_finish_at = pprof_recorder.clear
else # TODO: Remove this when the OldRecorder is retired and we can assume all recorders implement #clear
_, finish, = pprof_recorder.serialize
@last_flush_finish_at = finish
end

nil
end

private

def duration_below_threshold?(start, finish)
Expand Down
2 changes: 1 addition & 1 deletion lib/datadog/profiling/scheduler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def after_fork
# Clear any existing profiling state.
# We don't want the child process to report profiling data from its parent.
Datadog.logger.debug('Flushing exporter in child process #after_fork and discarding data')
exporter.flush
exporter.clear
end

# Configure Workers::IntervalLoop to not report immediately when scheduler starts
Expand Down
18 changes: 18 additions & 0 deletions lib/datadog/profiling/stack_recorder.rb
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,24 @@ def serialize
end
end

def clear
status, result = @no_concurrent_synchronize_mutex.synchronize { self.class._native_clear(self) }

if status == :ok
finish_timestamp = result

Datadog.logger.debug { "Cleared profile at #{finish_timestamp}" }

finish_timestamp
else
error_message = result

Datadog.logger.error("Failed to clear profiling data: #{error_message}")

nil
end
end

# Used only for Ruby 2.2 which doesn't have the native `rb_time_timespec_new` API; called from native code
def self.ruby_time_from(timespec_seconds, timespec_nanoseconds)
Time.at(0).utc + timespec_seconds + (timespec_nanoseconds.to_r / 1_000_000_000)
Expand Down
56 changes: 55 additions & 1 deletion spec/datadog/profiling/exporter_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,44 @@
end
end

describe '#clear' do
subject(:clear) { exporter.clear }

it { is_expected.to be nil }

context 'when pprof_recorder does not support clear' do
let(:pprof_recorder) { instance_double(Datadog::Profiling::OldRecorder, serialize: pprof_recorder_serialize) }

it 'triggers pprof_recorder serialization' do
expect(pprof_recorder).to receive(:serialize)

clear
end

it 'sets the last_flush_finish_at to the result of serialize' do
clear

expect(exporter.send(:last_flush_finish_at)).to be finish
end
end

context 'when pprof_recorder supports clear' do
let(:pprof_recorder) { instance_double(Datadog::Profiling::StackRecorder, clear: finish) }

it 'triggers pprof_recorder clear' do
expect(pprof_recorder).to receive(:clear)

clear
end

it 'sets the last_flush_finish_at to the result of clear' do
clear

expect(exporter.send(:last_flush_finish_at)).to be finish
end
end
end

describe '#can_flush?' do
let(:time_provider) { class_double(Time) }
let(:created_at) { start - 60 }
Expand Down Expand Up @@ -102,7 +140,23 @@
end
end

context 'when exporter has never flushed' do
context 'when exporter has been cleared before' do
before { exporter.clear }

context 'when less than 1s has elapsed since last clear' do
before { expect(time_provider).to receive(:now).and_return(finish + 0.99).once }

it { is_expected.to be false }
end

context 'when 1s or more has elapsed since last clear' do
before { expect(time_provider).to receive(:now).and_return(finish + 1).once }

it { is_expected.to be true }
end
end

context 'when exporter has never flushed or cleared' do
context 'when less than 1s has elapsed since exporter was created' do
before { expect(time_provider).to receive(:now).and_return(created_at + 0.99).once }

Expand Down
2 changes: 1 addition & 1 deletion spec/datadog/profiling/scheduler_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@
subject(:after_fork) { scheduler.after_fork }

it 'clears the buffer' do
expect(exporter).to receive(:flush)
expect(exporter).to receive(:clear)
after_fork
end
end
Expand Down
140 changes: 106 additions & 34 deletions spec/datadog/profiling/stack_recorder_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,41 @@ def slot_two_mutex_locked?
end
end

shared_examples_for 'locking behavior' do |operation|
context 'when slot one was the active slot' do
it 'sets slot two as the active slot' do
expect { stack_recorder.public_send(operation) }.to change { active_slot }.from(1).to(2)
end

it 'locks the slot one mutex' do
expect { stack_recorder.public_send(operation) }.to change { slot_one_mutex_locked? }.from(false).to(true)
end

it 'unlocks the slot two mutex' do
expect { stack_recorder.public_send(operation) }.to change { slot_two_mutex_locked? }.from(true).to(false)
end
end

context 'when slot two was the active slot' do
before do
# Trigger operation once, so that active slots get flipped
stack_recorder.public_send(operation)
end

it 'sets slot one as the active slot' do
expect { stack_recorder.public_send(operation) }.to change { active_slot }.from(2).to(1)
end

it 'unlocks the slot one mutex' do
expect { stack_recorder.public_send(operation) }.to change { slot_one_mutex_locked? }.from(true).to(false)
end

it 'locks the slot two mutex' do
expect { stack_recorder.public_send(operation) }.to change { slot_two_mutex_locked? }.from(false).to(true)
end
end
end

describe '#serialize' do
subject(:serialize) { stack_recorder.serialize }

Expand All @@ -61,40 +96,7 @@ def slot_two_mutex_locked?
expect(message).to include finish.iso8601
end

describe 'locking behavior' do
context 'when slot one was the active slot' do
it 'sets slot two as the active slot' do
expect { serialize }.to change { active_slot }.from(1).to(2)
end

it 'locks the slot one mutex and keeps it locked' do
expect { serialize }.to change { slot_one_mutex_locked? }.from(false).to(true)
end

it 'unlocks the slot two mutex and keeps it unlocked' do
expect { serialize }.to change { slot_two_mutex_locked? }.from(true).to(false)
end
end

context 'when slot two was the active slot' do
before do
# Trigger serialization once, so that active slots get flipped
stack_recorder.serialize
end

it 'sets slot one as the active slot' do
expect { serialize }.to change { active_slot }.from(2).to(1)
end

it 'unlocks the slot one mutex and keeps it unlocked' do
expect { serialize }.to change { slot_one_mutex_locked? }.from(true).to(false)
end

it 'locks the slow two mutex and keeps it locked' do
expect { serialize }.to change { slot_two_mutex_locked? }.from(false).to(true)
end
end
end
include_examples 'locking behavior', :serialize

context 'when the profile is empty' do
it 'uses the current time as the start and finish time' do
Expand Down Expand Up @@ -219,4 +221,74 @@ def sample_types_from(decoded_profile)
end
end
end

describe '#clear' do
subject(:clear) { stack_recorder.clear }

it 'debug logs that clear was invoked' do
message = nil

expect(Datadog.logger).to receive(:debug) do |&message_block|
message = message_block.call
end

clear

expect(message).to match(/Cleared profile/)
end

include_examples 'locking behavior', :clear

it 'uses the current time as the finish time' do
before_clear = Time.now.utc
finish = clear
after_clear = Time.now.utc

expect(finish).to be_between(before_clear, after_clear)
end

context 'when profile has a sample' do
let(:collectors_stack) { Datadog::Profiling::Collectors::Stack.new }

let(:metric_values) { { 'cpu-time' => 123, 'cpu-samples' => 456, 'wall-time' => 789 } }
let(:labels) { { 'label_a' => 'value_a', 'label_b' => 'value_b' }.to_a }

it 'makes the next calls to serialize return no data' do
# Add some data
Datadog::Profiling::Collectors::Stack::Testing
._native_sample(Thread.current, stack_recorder, metric_values, labels, 400, false)

# Sanity check: validate that data is there, to avoid the test passing because of other issues
sanity_check_samples = samples_from_pprof(stack_recorder.serialize.last)
expect(sanity_check_samples.size).to be 1

# Add some data, again
Datadog::Profiling::Collectors::Stack::Testing
._native_sample(Thread.current, stack_recorder, metric_values, labels, 400, false)

clear

# Test twice in a row to validate that both profile slots are empty
expect(samples_from_pprof(stack_recorder.serialize.last)).to be_empty
expect(samples_from_pprof(stack_recorder.serialize.last)).to be_empty
end
end

context 'when there is a failure during serialization' do
before do
allow(Datadog.logger).to receive(:error)

# Real failures in serialization are hard to trigger, so we're using a mock failure instead
expect(described_class).to receive(:_native_clear).and_return([:error, 'test error message'])
end

it { is_expected.to be nil }

it 'logs an error message' do
expect(Datadog.logger).to receive(:error).with(/test error message/)

clear
end
end
end
end

0 comments on commit f93d1b3

Please sign in to comment.