From 6ba1e5fab6736cacee72e7865d6d39eb073f1109 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Mon, 19 Aug 2024 15:16:56 +0100 Subject: [PATCH 01/18] Bootstrap support for running profiling tests with ruby_memcheck The [`ruby_memcheck`](https://github.com/Shopify/ruby_memcheck) gem provides a Ruby-specific user-friendly wrapper around valgrind. In some cases, some of our tests were hanging or timing out due to how valgrind runs code in a sequential manner. I've added a new tag to be able to skip such tests. I also explored using valgrind's `fair-sched` option (see https://github.com/Shopify/ruby_memcheck/issues/51) but ran into issues with its output being incomplete, so decided to go the skip route instead. You can run the new job with `bundle exec rake spec:profiling:memcheck`. --- Gemfile | 1 + Rakefile | 31 +++++++++++++++++++ .../cpu_and_wall_time_worker_spec.rb | 7 +++-- .../profiling/native_extension_spec.rb | 2 +- .../profiling/validate_benchmarks_spec.rb | 2 +- suppressions/ruby-3.4.supp | 17 ++++++++++ 6 files changed, 55 insertions(+), 5 deletions(-) create mode 100644 suppressions/ruby-3.4.supp diff --git a/Gemfile b/Gemfile index 5bf7a0f4255..06669aec558 100644 --- a/Gemfile +++ b/Gemfile @@ -83,6 +83,7 @@ group :check do gem 'steep', '~> 1.6.0', require: false end gem 'standard', require: false + gem 'ruby_memcheck', '>= 3' if RUBY_VERSION >= '3.4.0' && RUBY_PLATFORM != 'java' end group :dev do diff --git a/Rakefile b/Rakefile index a71e9ea3259..0b0fb1a2c9a 100644 --- a/Rakefile +++ b/Rakefile @@ -6,6 +6,18 @@ require 'rspec/core/rake_task' require 'rake/extensiontask' require 'yard' require 'os' +if Gem.loaded_specs.key? 'ruby_memcheck' + require 'ruby_memcheck' + require 'ruby_memcheck/rspec/rake_task' + + RubyMemcheck.config( + # If there's an error, print the suppression for that error, to allow us to easily skip such an error if it's + # a false-positive / something in the VM we can't fix. + valgrind_generate_suppressions: true, + # This feature provides better quality data -- I couldn't get good output out of ruby_memcheck without it. + use_only_ruby_free_at_exit: true, + ) +end Dir.glob('tasks/*.rake').each { |r| import r } @@ -314,6 +326,25 @@ namespace :spec do t.rspec_opts = [*args.to_a, '-t ractors'].join(' ') end + desc 'Run spec:profiling:main tests with memory leak checking' + if Gem.loaded_specs.key?('ruby_memcheck') + RubyMemcheck::RSpec::RakeTask.new(:memcheck) do |t, args| + t.pattern = 'spec/datadog/profiling/**/*_spec.rb,spec/datadog/profiling_spec.rb' + # Some of our specs use multi-threading + busy looping, or multiple processes, or are just really really slow. + # We skip running these when running under valgrind. + # (As a reminder, by default valgrind simulates a sequential/single-threaded execution). + # + # @ivoanjo: I previously tried https://github.com/Shopify/ruby_memcheck/issues/51 but in some cases valgrind + # would give incomplete output, causing a "FATAL: Premature end of data in tag valgrindoutput line 3" error in + # ruby_memcheck. I did not figure out why exactly. + t.rspec_opts = [*args.to_a, '-t ~ractors -t ~memcheck_valgrind_skip'].join(' ') + end + else + task :memcheck do + raise 'Memcheck requires the ruby_memcheck gem to be installed' + end + end + # Make sure each profiling test suite has a dependency on compiled native extensions Rake::Task[:all].prerequisite_tasks.each { |t| t.enhance([:compile_native_extensions]) } end diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index 0acca631148..4f4f2ac8391 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -168,7 +168,7 @@ # See native bits for more details. let(:options) { {**super(), skip_idle_samples_for_testing: true} } - it "triggers sampling and records the results" do + it "triggers sampling and records the results", :memcheck_valgrind_skip do start all_samples = loop_until do @@ -182,6 +182,7 @@ it( "keeps statistics on how many samples were triggered by the background thread, " \ "as well as how many samples were requested from the VM", + :memcheck_valgrind_skip, ) do start @@ -337,7 +338,7 @@ background_thread.join end - it "is able to sample even when the main thread is sleeping" do + it "is able to sample even when the main thread is sleeping", :memcheck_valgrind_skip do background_thread ready_queue.pop @@ -546,7 +547,7 @@ # and we clamp it if it goes over the limit. # But the total amount of allocations recorded should match the number we observed, and thus we record the # remainder above the clamped value as a separate "Skipped Samples" step. - context "with a high allocation rate" do + context "with a high allocation rate", :memcheck_valgrind_skip do let(:options) { {**super(), dynamic_sampling_rate_overhead_target_percentage: 0.1} } let(:thread_that_allocates_as_fast_as_possible) { Thread.new { loop { BasicObject.new } } } diff --git a/spec/datadog/profiling/native_extension_spec.rb b/spec/datadog/profiling/native_extension_spec.rb index 3a082f624f2..f68ace2a796 100644 --- a/spec/datadog/profiling/native_extension_spec.rb +++ b/spec/datadog/profiling/native_extension_spec.rb @@ -120,7 +120,7 @@ it { is_expected.to be false } end - describe "correctness" do + describe "correctness", :memcheck_valgrind_skip do let(:ready_queue) { Queue.new } let(:background_thread) do Thread.new do diff --git a/spec/datadog/profiling/validate_benchmarks_spec.rb b/spec/datadog/profiling/validate_benchmarks_spec.rb index 9172b788b29..d7d85c029c9 100644 --- a/spec/datadog/profiling/validate_benchmarks_spec.rb +++ b/spec/datadog/profiling/validate_benchmarks_spec.rb @@ -1,6 +1,6 @@ require "datadog/profiling/spec_helper" -RSpec.describe "Profiling benchmarks" do +RSpec.describe "Profiling benchmarks", :memcheck_valgrind_skip do before { skip_if_profiling_not_supported(self) } around do |example| diff --git a/suppressions/ruby-3.4.supp b/suppressions/ruby-3.4.supp new file mode 100644 index 00000000000..0e03f7515c8 --- /dev/null +++ b/suppressions/ruby-3.4.supp @@ -0,0 +1,17 @@ +# This is a valgrind suppression configuration file. +# +# We use it together with the ruby_memcheck gem to find issues in the dd-trace-rb native extensions; in some cases +# we need to ignore potential issues as they're not something we can fix (e.g. outside our code.) +# +# See https://valgrind.org/docs/manual/manual-core.html#manual-core.suppress for details. + +{ + ruby-weak-map + Memcheck:Addr8 + fun:wmap_cmp + fun:find_table_bin_ind + fun:st_general_foreach + fun:rb_st_foreach + ... +} + From 252ce6bfd5b8209a3ae04be8325d7a5d2adc5200 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Mon, 19 Aug 2024 16:36:38 +0100 Subject: [PATCH 02/18] Avoid leaking memory in stack collector spec This memory leak is specific to the testing code -- `Datadog::Profiling::Collectors::Stack::Testing._native_sample` was missing exception handling, and thus would leak memory for the specs that were triggering exceptions. In production, all this memory is managed via Ruby objects (specifically the `Collectors::ThreadContext`) which make sure to free it when they get garbage collected. (I found this memory leak using the ruby_memleak gem.) --- .../collectors_stack.c | 56 +++++++++++++++---- 1 file changed, 45 insertions(+), 11 deletions(-) diff --git a/ext/datadog_profiling_native_extension/collectors_stack.c b/ext/datadog_profiling_native_extension/collectors_stack.c index b8c56bafb93..c5df3026956 100644 --- a/ext/datadog_profiling_native_extension/collectors_stack.c +++ b/ext/datadog_profiling_native_extension/collectors_stack.c @@ -30,6 +30,8 @@ static VALUE _native_sample( VALUE max_frames, VALUE in_gc ); +static VALUE native_sample_do(VALUE args); +static VALUE native_sample_ensure(VALUE args); static void maybe_add_placeholder_frames_omitted(VALUE thread, sampling_buffer* buffer, char *frames_omitted_message, int frames_omitted_message_size); static void record_placeholder_stack_in_native_code(VALUE recorder_instance, sample_values values, sample_labels labels); static void maybe_trim_template_random_ids(ddog_CharSlice *name_slice, ddog_CharSlice *filename_slice); @@ -51,6 +53,16 @@ void collectors_stack_init(VALUE profiling_module) { rb_global_variable(&missing_string); } +struct native_sample_args { + VALUE in_gc; + VALUE recorder_instance; + sample_values values; + sample_labels labels; + VALUE thread; + ddog_prof_Location *locations; + sampling_buffer *buffer; +}; + // This method exists only to enable testing Datadog::Profiling::Collectors::Stack behavior using RSpec. // It SHOULD NOT be used for other purposes. static VALUE _native_sample( @@ -113,25 +125,47 @@ static VALUE _native_sample( ddog_prof_Slice_Label slice_labels = {.ptr = labels, .len = labels_count}; - if (in_gc == Qtrue) { + struct native_sample_args args_struct = { + .in_gc = in_gc, + .recorder_instance = recorder_instance, + .values = values, + .labels = (sample_labels) {.labels = slice_labels, .state_label = state_label}, + .thread = thread, + .locations = locations, + .buffer = buffer, + }; + + return rb_ensure(native_sample_do, (VALUE) &args_struct, native_sample_ensure, (VALUE) &args_struct); +} + +static VALUE native_sample_do(VALUE args) { + struct native_sample_args *args_struct = (struct native_sample_args *) args; + + if (args_struct->in_gc == Qtrue) { record_placeholder_stack( - recorder_instance, - values, - (sample_labels) {.labels = slice_labels, .state_label = state_label}, + args_struct->recorder_instance, + args_struct->values, + args_struct->labels, DDOG_CHARSLICE_C("Garbage Collection") ); } else { sample_thread( - thread, - buffer, - recorder_instance, - values, - (sample_labels) {.labels = slice_labels, .state_label = state_label} + args_struct->thread, + args_struct->buffer, + args_struct->recorder_instance, + args_struct->values, + args_struct->labels ); } - ruby_xfree(locations); - sampling_buffer_free(buffer); + return Qtrue; +} + +static VALUE native_sample_ensure(VALUE args) { + struct native_sample_args *args_struct = (struct native_sample_args *) args; + + ruby_xfree(args_struct->locations); + sampling_buffer_free(args_struct->buffer); return Qtrue; } From 451a6ba3a65e8b74c7893e50eac8ad0a145a72b4 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Mon, 19 Aug 2024 16:47:55 +0100 Subject: [PATCH 03/18] Fix memory leak when Ruby process used fork While re-examining the memory management code for the per-thread contexts, I realize we were going a bit too "high-level" during `_native_reset_after_fork` and clearing the hashtable containing the per-thread contexts without also cleaning the contexts. This was a real production memory leak: every time the Ruby VM used fork, we were leaking the thread contexts for all threads active prior to the fork. I suspect we got lucky because most apps don't refork: there's a master/original process, and every new child gets created from it. The biggest exception is probably the pitchfork web server, which explicitly employs reforking. --- .../collectors_thread_context.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c index 8a449dc7a28..6c8501aeb80 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.c +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c @@ -1223,6 +1223,9 @@ static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE collector struct thread_context_collector_state *state; TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state); + // Release all context memory before clearing the existing context + st_foreach(state->hash_map_per_thread_context, hash_map_per_thread_context_free_values, 0 /* unused */); + st_clear(state->hash_map_per_thread_context); state->stats = (struct stats) {}; // Resets all stats back to zero From bf841773c316e3f4d1a2fdc8cc481950c7c0efc1 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Mon, 19 Aug 2024 17:14:44 +0100 Subject: [PATCH 04/18] Ignore any leaks reported from pkg-config, since we don't control it --- suppressions/ruby-3.4.supp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/suppressions/ruby-3.4.supp b/suppressions/ruby-3.4.supp index 0e03f7515c8..f5a82430d0c 100644 --- a/suppressions/ruby-3.4.supp +++ b/suppressions/ruby-3.4.supp @@ -15,3 +15,10 @@ ... } +{ + pkg-config-memory + Memcheck:Leak + ... + obj:/usr/bin/pkg-config + ... +} From df9b2427604c428088da6f5956a991cefd5fdda0 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Mon, 19 Aug 2024 17:23:34 +0100 Subject: [PATCH 05/18] Avoid leaking memory in stack collector spec when max_frames is invalid Since `sampling_buffer_new` validates the input, we weren't properly handling an exception from there. To avoid introducing a lot of boilerplate to solve this leak, let's instead call `check_max_frames` before doing any allocation: this is what we do in production, and it keeps `_native_sample` simple. --- ext/datadog_profiling_native_extension/collectors_stack.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ext/datadog_profiling_native_extension/collectors_stack.c b/ext/datadog_profiling_native_extension/collectors_stack.c index c5df3026956..ae9897b0b24 100644 --- a/ext/datadog_profiling_native_extension/collectors_stack.c +++ b/ext/datadog_profiling_native_extension/collectors_stack.c @@ -117,8 +117,7 @@ static VALUE _native_sample( }; } - int max_frames_requested = NUM2INT(max_frames); - if (max_frames_requested < 0) rb_raise(rb_eArgError, "Invalid max_frames: value must not be negative"); + int max_frames_requested = sampling_buffer_check_max_frames(NUM2INT(max_frames)); ddog_prof_Location *locations = ruby_xcalloc(max_frames_requested, sizeof(ddog_prof_Location)); sampling_buffer *buffer = sampling_buffer_new(max_frames_requested, locations); From 94fa2c4e62e0d9508475bcaa2b2d733e4e808274 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Tue, 20 Aug 2024 08:54:03 +0100 Subject: [PATCH 06/18] Fix exception in ruby 3.4 development builds While using a recent Ruby 3.4 development build (to hunt down memory leaks using `ruby_memcheck`), one of our tests started failing with > rb_raise(rb_eRuntimeError, "BUG: rb_native_thread* is null. Is this Ruby running with RUBY_MN_THREADS=1?"); This exception had been added as a "not sure what to do, just in case", but it looks like that even without MN threads it can happen. (I suspect we may have found a Ruby thread that's so new that it hasn't had a native thread assigned). For now let's instead return 0 -- this value is only used for displaying the thread id, so it seems better to have missing info (a zero), rather than stopping profiling. We can always circle back on this once Ruby 3.4 is stable and if we see this is happening often. --- ext/datadog_profiling_native_extension/private_vm_api_access.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/datadog_profiling_native_extension/private_vm_api_access.c b/ext/datadog_profiling_native_extension/private_vm_api_access.c index ca1e03b50c0..5dcb5d2e434 100644 --- a/ext/datadog_profiling_native_extension/private_vm_api_access.c +++ b/ext/datadog_profiling_native_extension/private_vm_api_access.c @@ -182,7 +182,7 @@ uint64_t native_thread_id_for(VALUE thread) { #if !defined(NO_THREAD_TID) && defined(RB_THREAD_T_HAS_NATIVE_ID) #ifndef NO_RB_NATIVE_THREAD struct rb_native_thread* native_thread = thread_struct_from_object(thread)->nt; - if (native_thread == NULL) rb_raise(rb_eRuntimeError, "BUG: rb_native_thread* is null. Is this Ruby running with RUBY_MN_THREADS=1?"); + if (native_thread == NULL) return 0; return native_thread->tid; #else return thread_struct_from_object(thread)->tid; From 8f35a5e29594416dc94f32affacc0f7515495f8b Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Tue, 20 Aug 2024 08:58:31 +0100 Subject: [PATCH 07/18] Add suppression for Ruby leaking thread data in a fork It looks like Ruby is leaking thread data when a fork happens. I'm not sure we can do anything about this, so added to the suppressions for now. --- suppressions/ruby-3.4.supp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/suppressions/ruby-3.4.supp b/suppressions/ruby-3.4.supp index f5a82430d0c..70e8aa2c6ae 100644 --- a/suppressions/ruby-3.4.supp +++ b/suppressions/ruby-3.4.supp @@ -5,6 +5,7 @@ # # See https://valgrind.org/docs/manual/manual-core.html#manual-core.suppress for details. +# Ruby weak maps seem to be able to cause invalid reads? { ruby-weak-map Memcheck:Addr8 @@ -15,6 +16,7 @@ ... } +# We don't care about the pkg-config external tool { pkg-config-memory Memcheck:Leak @@ -22,3 +24,17 @@ obj:/usr/bin/pkg-config ... } + +# When a Ruby process forks, it looks like Ruby doesn't clean up the memory of old threads? +{ + ruby-native-thread-memory + Memcheck:Leak + fun:calloc + fun:calloc1 + fun:rb_gc_impl_calloc + fun:native_thread_alloc + fun:native_thread_create_dedicated + fun:native_thread_create + fun:thread_create_core + ... +} From 5f48a9b1aa2c754f0d98d0aa2f4513e4b47f7108 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Tue, 20 Aug 2024 09:33:31 +0100 Subject: [PATCH 08/18] Fix profiler spec by using webrick from git on Ruby 3.4 --- Gemfile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Gemfile b/Gemfile index 06669aec558..9a5cf829d7e 100644 --- a/Gemfile +++ b/Gemfile @@ -44,7 +44,12 @@ gem 'webmock', '>= 3.10.0' gem 'rexml', '>= 3.2.7' # https://www.ruby-lang.org/en/news/2024/05/16/dos-rexml-cve-2024-35176/ -gem 'webrick', '>= 1.7.0' if RUBY_VERSION >= '3.0.0' # No longer bundled by default since Ruby 3.0 +if RUBY_VERSION.start_with?('3.4.') + # ruby 3.4 breaks stable webrick; we need this fix until a version later than 1.8.1 comes out + gem 'webrick', git: 'https://github.com/ruby/webrick.git', ref: '0c600e169bd4ae267cb5eeb6197277c848323bbe' +elsif RUBY_VERSION >= '3.0.0' # No longer bundled by default since Ruby 3.0 + gem 'webrick', '>= 1.7.0' +end gem 'yard', '~> 0.9' # NOTE: YardDoc is generated with ruby 3.2 in GitHub Actions From 92e8dd5612b324a5529925be0dc3f304975a8dbc Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Tue, 20 Aug 2024 10:08:26 +0100 Subject: [PATCH 09/18] Add another suppression for Ruby native thread leak for Ruby 3.4.0-preview1 It looks like for 3.4.0-preview1, the stack shown is slightly different, so let's add a second suppression to support it. (I opted to not generalize the existing suppression as I'm thinking we may not want to make it too generic, so as to not end up with false-positives.) --- suppressions/ruby-3.4.supp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/suppressions/ruby-3.4.supp b/suppressions/ruby-3.4.supp index 70e8aa2c6ae..76e985a595e 100644 --- a/suppressions/ruby-3.4.supp +++ b/suppressions/ruby-3.4.supp @@ -38,3 +38,18 @@ fun:thread_create_core ... } + +# When a Ruby process forks, it looks like Ruby doesn't clean up the memory of old threads? +{ + ruby-native-thread-memory-2 + Memcheck:Leak + fun:calloc + fun:calloc1 + fun:objspace_xcalloc + fun:ruby_xcalloc_body + fun:native_thread_alloc + fun:native_thread_create_dedicated + fun:native_thread_create + fun:thread_create_core + ... +} From e188e3221ebca43f2f3433d61af1ad8c8c71e3ee Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Tue, 20 Aug 2024 10:13:25 +0100 Subject: [PATCH 10/18] Add GitHub Action for testing for memory leaks in CI I've chosen to use GitHub Actions for the new CI job, rather than CircleCI. I don't have a very strong preference for either, it's just that it was easier to use the latest valgrind version by picking the latest Ubuntu image on GHA. --- .github/workflows/test-memory-leaks.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .github/workflows/test-memory-leaks.yaml diff --git a/.github/workflows/test-memory-leaks.yaml b/.github/workflows/test-memory-leaks.yaml new file mode 100644 index 00000000000..eedc80cff2d --- /dev/null +++ b/.github/workflows/test-memory-leaks.yaml @@ -0,0 +1,14 @@ +name: Test for memory leaks +on: [push] +jobs: + test-memcheck: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: ruby/setup-ruby@v1 + with: + ruby-version: 3.4.0-preview1 # TODO: Use stable version once 3.4 is out + bundler-cache: true # runs 'bundle install' and caches installed gems automatically + bundler: latest + cache-version: v1 # bump this to invalidate cache + - run: bundle exec rake compile spec:profiling:memcheck From 877abf34fdc7ec3b3ae1620a120f9814c124a860 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Tue, 20 Aug 2024 10:20:55 +0100 Subject: [PATCH 11/18] Install valgrind in CI before using it --- .github/workflows/test-memory-leaks.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test-memory-leaks.yaml b/.github/workflows/test-memory-leaks.yaml index eedc80cff2d..62885cbe4cc 100644 --- a/.github/workflows/test-memory-leaks.yaml +++ b/.github/workflows/test-memory-leaks.yaml @@ -11,4 +11,5 @@ jobs: bundler-cache: true # runs 'bundle install' and caches installed gems automatically bundler: latest cache-version: v1 # bump this to invalidate cache + - run: sudo apt install -y valgrind && valgrind --version - run: bundle exec rake compile spec:profiling:memcheck From 1688da4eaf0705f45a868784c36652f0e0dac461 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Tue, 20 Aug 2024 10:22:07 +0100 Subject: [PATCH 12/18] Confirm that suppressions folder is expected not to be packaged --- spec/datadog/release_gem_spec.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/spec/datadog/release_gem_spec.rb b/spec/datadog/release_gem_spec.rb index 0eafbe5d967..cc925538647 100644 --- a/spec/datadog/release_gem_spec.rb +++ b/spec/datadog/release_gem_spec.rb @@ -59,6 +59,7 @@ |tasks |yard |vendor/rbs + |suppressions )/ }x From a8ee4864eeb3236c2f47d7e321221ed21f59e26a Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Tue, 20 Aug 2024 10:34:29 +0100 Subject: [PATCH 13/18] Make rubocop happy --- Gemfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile b/Gemfile index 9a5cf829d7e..fae3a17685a 100644 --- a/Gemfile +++ b/Gemfile @@ -87,8 +87,8 @@ group :check do gem 'rbs', '~> 3.2.0', require: false gem 'steep', '~> 1.6.0', require: false end - gem 'standard', require: false gem 'ruby_memcheck', '>= 3' if RUBY_VERSION >= '3.4.0' && RUBY_PLATFORM != 'java' + gem 'standard', require: false end group :dev do From 9785a30a83db4505c0cae7e2210f889013e1d38e Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Tue, 20 Aug 2024 10:38:30 +0100 Subject: [PATCH 14/18] Use ubuntu 24.04 as a base image to get latest valgrind Otherwise, at least at time of writing, "ubuntu-latest" is still Ubuntu 22.04 (lol). --- .github/workflows/test-memory-leaks.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-memory-leaks.yaml b/.github/workflows/test-memory-leaks.yaml index 62885cbe4cc..9ac76c9ee22 100644 --- a/.github/workflows/test-memory-leaks.yaml +++ b/.github/workflows/test-memory-leaks.yaml @@ -2,7 +2,7 @@ name: Test for memory leaks on: [push] jobs: test-memcheck: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - uses: ruby/setup-ruby@v1 From b1dc345e81ab70bd0a52b1767b9ed0b12d1c396f Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Tue, 20 Aug 2024 10:55:24 +0100 Subject: [PATCH 15/18] Add suppression for tr tool --- suppressions/ruby-3.4.supp | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/suppressions/ruby-3.4.supp b/suppressions/ruby-3.4.supp index 76e985a595e..9910e76b275 100644 --- a/suppressions/ruby-3.4.supp +++ b/suppressions/ruby-3.4.supp @@ -16,15 +16,6 @@ ... } -# We don't care about the pkg-config external tool -{ - pkg-config-memory - Memcheck:Leak - ... - obj:/usr/bin/pkg-config - ... -} - # When a Ruby process forks, it looks like Ruby doesn't clean up the memory of old threads? { ruby-native-thread-memory @@ -53,3 +44,21 @@ fun:thread_create_core ... } + +# We don't care about the pkg-config external tool +{ + pkg-config-memory + Memcheck:Leak + ... + obj:/usr/bin/pkg-config + ... +} + +# We don't care about the tr external tool +{ + pkg-config-memory + Memcheck:Leak + ... + obj:/usr/bin/tr + ... +} From 168452ebebffb8a164e4c2f0ea56abaec7bef049 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Wed, 21 Aug 2024 15:22:22 +0100 Subject: [PATCH 16/18] Empty commit to re-trigger CI From c8f2338b201230849c6cc255c565eed06dc5d214 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 22 Aug 2024 11:05:52 +0100 Subject: [PATCH 17/18] Attempt to fix flaky spec due to timing issues This test failed in CI when running inside valgrind: ``` Failures: 1) Datadog::Profiling::Collectors::ThreadContext#sample_after_gc when there is gc information to record when timeline is enabled creates a Garbage Collection sample using the timestamp set by on_gc_finish, converted to epoch ns Failure/Error: expect(gc_sample.labels.fetch(:end_timestamp_ns)).to be_between(@time_before, @time_after) expected 1724250478003178125 to be between 1724250478003215154 and 1724250478003295874 (inclusive) # ./spec/datadog/profiling/collectors/thread_context_spec.rb:1024:in 'block (5 levels) in ' # ./spec/spec_helper.rb:231:in 'block (2 levels) in ' # ./spec/spec_helper.rb:116:in 'block (2 levels) in ' # ./vendor/bundle/ruby/3.4.0+0/gems/webmock-3.23.1/lib/webmock/rspec.rb:39:in 'block (2 levels) in ' # ./vendor/bundle/ruby/3.4.0+0/gems/rspec-wait-0.0.10/lib/rspec/wait.rb:47:in 'block (2 levels) in ' Finished in 3 minutes 54.8 seconds (files took 25.77 seconds to load) 701 examples, 1 failure, 20 pending Failed examples: rspec ./spec/datadog/profiling/collectors/thread_context_spec.rb:1021 # Datadog::Profiling::Collectors::ThreadContext#sample_after_gc when there is gc information to record when timeline is enabled creates a Garbage Collection sample using the timestamp set by on_gc_finish, converted to epoch ns Randomized with seed 49098 ``` I couldn't quite reproduce it, but given the really small delta between the expected time and target time (37029 nanoseconds before start time), I'm strongly suspecting this failure may be due to our monotonic to system timestamp conversion code. In particular, we use monotonic timestamps in the profiler, and then we convert them to epoch timestamps by comparing the difference between the monotonic clock and the system clock. There can be some drift in this comparison because we cache the difference rather than compute it every time (see comments on `time_helpers.c:monotonic_to_system_epoch_ns`). So by resetting the cache before this test, hopefully any flakyness due to the drifting will go away. --- .../collectors_thread_context.c | 15 ++++++++++++--- .../profiling/collectors/thread_context_spec.rb | 6 +++--- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c index 6c8501aeb80..0a93488e8ef 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.c +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c @@ -182,7 +182,7 @@ static VALUE _native_initialize( static VALUE _native_sample(VALUE self, VALUE collector_instance, VALUE profiler_overhead_stack_thread); static VALUE _native_on_gc_start(VALUE self, VALUE collector_instance); static VALUE _native_on_gc_finish(VALUE self, VALUE collector_instance); -static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance); +static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state); void update_metrics_and_sample( struct thread_context_collector_state *state, VALUE thread_being_sampled, @@ -261,7 +261,7 @@ void collectors_thread_context_init(VALUE profiling_module) { rb_define_singleton_method(testing_module, "_native_sample_allocation", _native_sample_allocation, 3); rb_define_singleton_method(testing_module, "_native_on_gc_start", _native_on_gc_start, 1); rb_define_singleton_method(testing_module, "_native_on_gc_finish", _native_on_gc_finish, 1); - rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 1); + rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 2); rb_define_singleton_method(testing_module, "_native_thread_list", _native_thread_list, 0); rb_define_singleton_method(testing_module, "_native_per_thread_context", _native_per_thread_context, 1); rb_define_singleton_method(testing_module, "_native_stats", _native_stats, 1); @@ -439,7 +439,16 @@ static VALUE _native_on_gc_finish(DDTRACE_UNUSED VALUE self, VALUE collector_ins // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec. // It SHOULD NOT be used for other purposes. -static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance) { +static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state) { + ENFORCE_BOOLEAN(reset_monotonic_to_system_state); + + struct thread_context_collector_state *state; + TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state); + + if (reset_monotonic_to_system_state == Qtrue) { + state->time_converter_state = (monotonic_to_system_epoch_state) MONOTONIC_TO_SYSTEM_EPOCH_INITIALIZER; + } + thread_context_collector_sample_after_gc(collector_instance); return Qtrue; } diff --git a/spec/datadog/profiling/collectors/thread_context_spec.rb b/spec/datadog/profiling/collectors/thread_context_spec.rb index a0112ae2f37..5c36608eb04 100644 --- a/spec/datadog/profiling/collectors/thread_context_spec.rb +++ b/spec/datadog/profiling/collectors/thread_context_spec.rb @@ -71,8 +71,8 @@ def on_gc_finish described_class::Testing._native_on_gc_finish(cpu_and_wall_time_collector) end - def sample_after_gc - described_class::Testing._native_sample_after_gc(cpu_and_wall_time_collector) + def sample_after_gc(reset_monotonic_to_system_state: false) + described_class::Testing._native_sample_after_gc(cpu_and_wall_time_collector, reset_monotonic_to_system_state) end def sample_allocation(weight:, new_object: Object.new) @@ -1019,7 +1019,7 @@ def self.otel_sdk_available? end it "creates a Garbage Collection sample using the timestamp set by on_gc_finish, converted to epoch ns" do - sample_after_gc + sample_after_gc(reset_monotonic_to_system_state: true) expect(gc_sample.labels.fetch(:end_timestamp_ns)).to be_between(@time_before, @time_after) end From 5bc00e1d0b11294d3ed38a008b493591b9fad9d4 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 22 Aug 2024 15:02:28 +0100 Subject: [PATCH 18/18] Update gc benchmark with new argument on test method --- benchmarks/profiler_gc.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/profiler_gc.rb b/benchmarks/profiler_gc.rb index 0bc767bcfe4..8fa1ee615a9 100644 --- a/benchmarks/profiler_gc.rb +++ b/benchmarks/profiler_gc.rb @@ -38,7 +38,7 @@ def run_benchmark x.report('profiler gc') do Datadog::Profiling::Collectors::ThreadContext::Testing._native_on_gc_start(@collector) Datadog::Profiling::Collectors::ThreadContext::Testing._native_on_gc_finish(@collector) - Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample_after_gc(@collector) + Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample_after_gc(@collector, false) end x.save! "#{File.basename(__FILE__)}-results.json" unless VALIDATE_BENCHMARK_MODE @@ -61,7 +61,7 @@ def run_benchmark estimated_gc_per_minute.times do Datadog::Profiling::Collectors::ThreadContext::Testing._native_on_gc_start(@collector) Datadog::Profiling::Collectors::ThreadContext::Testing._native_on_gc_finish(@collector) - Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample_after_gc(@collector) + Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample_after_gc(@collector, false) end @recorder.serialize