Skip to content

Commit

Permalink
Revert "Cache bounded CPU ID in register for global scratch buffers"
Browse files Browse the repository at this point in the history
This reverts commit cd681f1.
  • Loading branch information
ryantimwilson authored and jordalgo committed Oct 9, 2024
1 parent f98a23c commit 2fa7894
Show file tree
Hide file tree
Showing 34 changed files with 701 additions and 757 deletions.
24 changes: 16 additions & 8 deletions src/ast/irbuilderbpf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -487,38 +487,46 @@ CallInst *IRBuilderBPF::CreateGetStrScratchMap(int idx,
idx);
}

Value *IRBuilderBPF::CreateGetFmtStringArgsScratchBuffer(Value *cpu_id)
Value *IRBuilderBPF::CreateGetFmtStringArgsScratchBuffer(const location &loc)
{
return createScratchBuffer(
bpftrace::globalvars::GlobalVar::FMT_STRINGS_BUFFER, cpu_id);
bpftrace::globalvars::GlobalVar::FMT_STRINGS_BUFFER, loc);
}

Value *IRBuilderBPF::CreateTupleScratchBuffer(Value *cpu_id, int key)
Value *IRBuilderBPF::CreateTupleScratchBuffer(const location &loc, int key)
{
return createScratchBuffer(bpftrace::globalvars::GlobalVar::TUPLE_BUFFER,
cpu_id,
loc,
key);
}

Value *IRBuilderBPF::createScratchBuffer(
bpftrace::globalvars::GlobalVar globalvar,
Value *cpu_id,
const location &loc,
size_t key)
{
assert(cpu_id);

auto config = globalvars::get_config(globalvar);
// ValueType var[MAX_CPU_ID + 1][num_elements]
auto type = globalvars::get_type(globalvar, bpftrace_.resources);

// Get CPU ID
auto cpu_id = CreateGetCpuId(loc);
auto max = CreateLoad(getInt64Ty(),
module_.getGlobalVariable(to_string(
bpftrace::globalvars::GlobalVar::MAX_CPU_ID)));
// Verify CPU ID is between 0 and MAX_CPU_ID to ensure we pass BPF
// verifier on older 5.2 kernels
auto cmp = CreateICmp(CmpInst::ICMP_ULE, cpu_id, max, "cpuid.min.cmp");
auto select = CreateSelect(cmp, cpu_id, max, "cpuid.min.select");

// Note the 1st index is 0 because we're pointing to
// ValueType var[MAX_CPU_ID + 1][num_elements]
// 2nd/3rd/4th indexes actually index into the array
// See https://llvm.org/docs/LangRef.html#id236
return CreateGEP(GetType(type),
CreatePointerCast(module_.getGlobalVariable(config.name),
GetType(type)->getPointerTo()),
{ getInt64(0), cpu_id, getInt64(key), getInt64(0) });
{ getInt64(0), select, getInt64(key), getInt64(0) });
}

/*
Expand Down
6 changes: 3 additions & 3 deletions src/ast/irbuilderbpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,8 @@ class IRBuilderBPF : public IRBuilder<> {
CallInst *CreateGetStrScratchMap(int idx,
BasicBlock *failure_callback,
const location &loc);
Value *CreateGetFmtStringArgsScratchBuffer(Value *cpu_id);
Value *CreateTupleScratchBuffer(Value *cpu_id, int key);
Value *CreateGetFmtStringArgsScratchBuffer(const location &loc);
Value *CreateTupleScratchBuffer(const location &loc, int key);
void CreateCheckSetRecursion(const location &loc, int early_exit_ret);
void CreateUnSetRecursion(const location &loc);
CallInst *CreateHelperCall(libbpf::bpf_func_id func_id,
Expand Down Expand Up @@ -301,7 +301,7 @@ class IRBuilderBPF : public IRBuilder<> {
BasicBlock *failure_callback,
int key = 0);
Value *createScratchBuffer(globalvars::GlobalVar globalvar,
Value *cpu_id,
const location &loc,
size_t key = 0);
libbpf::bpf_func_id selectProbeReadHelper(AddrSpace as, bool str);

Expand Down
66 changes: 10 additions & 56 deletions src/ast/passes/codegen_llvm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2229,17 +2229,18 @@ void CodegenLLVM::compareStructure(SizedType &our_type, llvm::Type *llvm_type)
*/
Value *CodegenLLVM::createTuple(
const SizedType &tuple_type,
const std::vector<std::pair<llvm::Value *, const location *>> &vals)
const std::vector<std::pair<llvm::Value *, const location *>> &vals,
const location &loc)
{
auto tuple_ty = b_.GetType(tuple_type);
size_t tuple_size = datalayout().getTypeAllocSize(tuple_ty);
auto buf = b_.CreatePointerCast(
b_.CreateTupleScratchBuffer(cpu_id_stack_.top(), async_ids_.tuple()),
b_.CreateTupleScratchBuffer(loc, async_ids_.tuple()),
tuple_ty->getPointerTo());
b_.CreateMemsetBPF(buf, b_.getInt8(0), tuple_size);

for (size_t i = 0; i < vals.size(); ++i) {
auto [val, loc] = vals[i];
auto [val, vloc] = vals[i];
SizedType &type = tuple_type.GetField(i).type;

Value *dst = b_.CreateGEP(tuple_ty,
Expand All @@ -2249,7 +2250,7 @@ Value *CodegenLLVM::createTuple(
if (inBpfMemory(type))
b_.CreateMemcpyBPF(dst, val, type.GetSize());
else if (type.IsArrayTy() || type.IsRecordTy())
b_.CreateProbeRead(ctx_, dst, type, val, *loc);
b_.CreateProbeRead(ctx_, dst, type, val, *vloc);
else
b_.CreateStore(val, dst);
}
Expand Down Expand Up @@ -2303,7 +2304,7 @@ void CodegenLLVM::visit(Tuple &tuple)
scoped_dels.emplace_back(accept(elem));
vals.push_back({ expr_, &elem->loc });
}
auto buf = createTuple(tuple.type, vals);
auto buf = createTuple(tuple.type, vals, tuple.loc);

// No need to end buf lifetime since tuple allocation is done via scratch maps
// and not on stack
Expand Down Expand Up @@ -2647,37 +2648,6 @@ void CodegenLLVM::visit(AttachPoint &)
// Empty
}

// For older 5.2 kernels, we need to bound CPU ID by MAX_CPU_ID to pass
// BPF verifier on older kernels.
//
// However, for programs that do many scratch buffer lookups, this causes
// many jumps which means a complex program may be rejected by the BPF
// verifier for >= 8192 jumps (current limit).
//
// To minimize jumps, we bound the CPU ID at the beginning of each function
// and cache the value in a register. This ensures we only add one
// additional jump per function. Note we cannot cache this value in a global
// variable since memory is shared between all CPUs or a per-CPU array since
// that would be recursive.
//
// Note the only functions in scope are the ones where scratch buffer is used
// which is currently subprograms, probes and map_for_each_cb. To account for
// nesting, we use a stack to ensure we're referencing the correct instance.
Value *CodegenLLVM::generateCachedCpuId(const location &loc)
{
if (!bpftrace_.resources.needed_global_vars.count(
globalvars::GlobalVar::MAX_CPU_ID)) {
return nullptr;
}

auto cpu_id = b_.CreateGetCpuId(loc);
auto max = b_.CreateLoad(b_.getInt64Ty(),
module_->getGlobalVariable(to_string(
bpftrace::globalvars::GlobalVar::MAX_CPU_ID)));
auto cmp = b_.CreateICmp(CmpInst::ICMP_ULE, cpu_id, max);
return b_.CreateSelect(cmp, cpu_id, max);
}

void CodegenLLVM::generateProbe(Probe &probe,
const std::string &full_func_id,
const std::string &name,
Expand Down Expand Up @@ -2710,9 +2680,6 @@ void CodegenLLVM::generateProbe(Probe &probe,
getReturnValueForProbe(probe_type));
}

assert(cpu_id_stack_.empty());
cpu_id_stack_.push(generateCachedCpuId(probe.loc));

if (probe.pred) {
auto scoped_del = accept(probe.pred);
}
Expand All @@ -2723,9 +2690,6 @@ void CodegenLLVM::generateProbe(Probe &probe,

createRet();

cpu_id_stack_.pop();
assert(cpu_id_stack_.empty());

if (dummy) {
func->eraseFromParent();
return;
Expand Down Expand Up @@ -2808,18 +2772,12 @@ void CodegenLLVM::visit(Subprog &subprog)
++arg_index;
}

assert(cpu_id_stack_.empty());
cpu_id_stack_.push(generateCachedCpuId(subprog.loc));

for (Statement *stmt : subprog.stmts) {
auto scoped_del = accept(stmt);
}
if (subprog.return_type.IsVoidTy())
createRet();

cpu_id_stack_.pop();
assert(cpu_id_stack_.empty());

FunctionPassManager fpm;
FunctionAnalysisManager fam;
llvm::PassBuilder pb;
Expand Down Expand Up @@ -3483,7 +3441,7 @@ void CodegenLLVM::createFormatStringCall(Call &call,
<< " does not match LLVM offset=" << expected_offset;
}

Value *fmt_args = b_.CreateGetFmtStringArgsScratchBuffer(cpu_id_stack_.top());
Value *fmt_args = b_.CreateGetFmtStringArgsScratchBuffer(call.loc);
// The struct is not packed so we need to memset it
b_.CreateMemsetBPF(fmt_args, b_.getInt8(0), struct_size);

Expand Down Expand Up @@ -3627,7 +3585,7 @@ void CodegenLLVM::createPrintNonMapCall(Call &call, int id)
StructType *print_struct = b_.GetStructType(struct_name.str(),
elements,
true);
Value *buf = b_.CreateGetFmtStringArgsScratchBuffer(cpu_id_stack_.top());
Value *buf = b_.CreateGetFmtStringArgsScratchBuffer(call.loc);
size_t struct_size = datalayout().getTypeAllocSize(print_struct);

// Store asyncactionid:
Expand Down Expand Up @@ -4462,11 +4420,10 @@ Function *CodegenLLVM::createForEachMapCallback(const For &f, llvm::Type *ctx_t)
});
}

cpu_id_stack_.push(generateCachedCpuId(f.decl->loc));

// Create decl variable for use in this iteration of the loop
auto tuple = createTuple(f.decl->type,
{ { key, &f.decl->loc }, { val, &f.decl->loc } });
{ { key, &f.decl->loc }, { val, &f.decl->loc } },
f.decl->loc);
variables_[f.decl->ident] = VariableLLVM{ tuple, b_.GetType(f.decl->type) };

// 1. Save original locations of variables which will form part of the
Expand Down Expand Up @@ -4497,9 +4454,6 @@ Function *CodegenLLVM::createForEachMapCallback(const For &f, llvm::Type *ctx_t)
}
b_.CreateRet(b_.getInt64(0));

// Restore original cached CPU ID
cpu_id_stack_.pop();

// Restore original non-context variables
for (const auto &[ident, expr] : orig_ctx_vars) {
variables_[ident].value = expr;
Expand Down
10 changes: 2 additions & 8 deletions src/ast/passes/codegen_llvm.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include <iostream>
#include <optional>
#include <ostream>
#include <stack>
#include <tuple>

#include <llvm/IR/IRBuilder.h>
Expand Down Expand Up @@ -94,7 +93,8 @@ class CodegenLLVM : public Visitor {
const SizedType &value_type);
Value *createTuple(
const SizedType &tuple_type,
const std::vector<std::pair<llvm::Value *, const location *>> &vals);
const std::vector<std::pair<llvm::Value *, const location *>> &vals,
const location &loc);
void createTupleCopy(const SizedType &expr_type,
const SizedType &var_type,
Value *dst_val,
Expand Down Expand Up @@ -251,8 +251,6 @@ class CodegenLLVM : public Visitor {
bool canAggPerCpuMapElems(const SizedType &val_type,
const SizedType &key_type);

Value *generateCachedCpuId(const location &loc);

Node *root_ = nullptr;

BPFtrace &bpftrace_;
Expand Down Expand Up @@ -298,10 +296,6 @@ class CodegenLLVM : public Visitor {
Function *map_len_func_ = nullptr;
MDNode *loop_metadata_ = nullptr;

// Used for caching bounded CPU ID in register to avoid jumps every time
// we use global scratch buffer
std::stack<Value *> cpu_id_stack_;

size_t getStructSize(StructType *s)
{
return module_->getDataLayout().getTypeAllocSize(s);
Expand Down
54 changes: 25 additions & 29 deletions tests/codegen/llvm/avg_cast_loop.ll
Original file line number Diff line number Diff line change
Expand Up @@ -24,35 +24,31 @@ define i64 @kprobe_f_1(ptr %0) section "s_kprobe_f_1" !dbg !72 {
entry:
%avg_struct = alloca %avg_stas_val, align 8
%"@x_key" = alloca i64, align 8
%get_cpu_id = call i64 inttoptr (i64 8 to ptr)()
%1 = load i64, ptr @max_cpu_id, align 8
%2 = icmp ule i64 %get_cpu_id, %1
%3 = select i1 %2, i64 %get_cpu_id, i64 %1
call void @llvm.lifetime.start.p0(i64 -1, ptr %"@x_key")
store i64 1, ptr %"@x_key", align 8
%lookup_elem = call ptr inttoptr (i64 1 to ptr)(ptr @AT_x, ptr %"@x_key")
%lookup_cond = icmp ne ptr %lookup_elem, null
br i1 %lookup_cond, label %lookup_success, label %lookup_failure

lookup_success: ; preds = %entry
%4 = getelementptr %avg_stas_val, ptr %lookup_elem, i64 0, i32 0
%5 = load i64, ptr %4, align 8
%6 = getelementptr %avg_stas_val, ptr %lookup_elem, i64 0, i32 1
%7 = load i64, ptr %6, align 8
%8 = getelementptr %avg_stas_val, ptr %lookup_elem, i64 0, i32 0
%9 = add i64 %5, 2
store i64 %9, ptr %8, align 8
%10 = getelementptr %avg_stas_val, ptr %lookup_elem, i64 0, i32 1
%11 = add i64 1, %7
store i64 %11, ptr %10, align 8
%1 = getelementptr %avg_stas_val, ptr %lookup_elem, i64 0, i32 0
%2 = load i64, ptr %1, align 8
%3 = getelementptr %avg_stas_val, ptr %lookup_elem, i64 0, i32 1
%4 = load i64, ptr %3, align 8
%5 = getelementptr %avg_stas_val, ptr %lookup_elem, i64 0, i32 0
%6 = add i64 %2, 2
store i64 %6, ptr %5, align 8
%7 = getelementptr %avg_stas_val, ptr %lookup_elem, i64 0, i32 1
%8 = add i64 1, %4
store i64 %8, ptr %7, align 8
br label %lookup_merge

lookup_failure: ; preds = %entry
call void @llvm.lifetime.start.p0(i64 -1, ptr %avg_struct)
%12 = getelementptr %avg_stas_val, ptr %avg_struct, i64 0, i32 0
store i64 2, ptr %12, align 8
%13 = getelementptr %avg_stas_val, ptr %avg_struct, i64 0, i32 1
store i64 1, ptr %13, align 8
%9 = getelementptr %avg_stas_val, ptr %avg_struct, i64 0, i32 0
store i64 2, ptr %9, align 8
%10 = getelementptr %avg_stas_val, ptr %avg_struct, i64 0, i32 1
store i64 1, ptr %10, align 8
%update_elem = call i64 inttoptr (i64 2 to ptr)(ptr @AT_x, ptr %"@x_key", ptr %avg_struct, i64 0)
call void @llvm.lifetime.end.p0(i64 -1, ptr %avg_struct)
br label %lookup_merge
Expand Down Expand Up @@ -160,17 +156,17 @@ is_negative_merge_block: ; preds = %is_positive, %is_ne
call void @llvm.lifetime.end.p0(i64 -1, ptr %val_2)
%get_cpu_id = call i64 inttoptr (i64 8 to ptr)()
%31 = load i64, ptr @max_cpu_id, align 8
%32 = icmp ule i64 %get_cpu_id, %31
%33 = select i1 %32, i64 %get_cpu_id, i64 %31
%34 = getelementptr [1 x [1 x [16 x i8]]], ptr @tuple_buf, i64 0, i64 %33, i64 0, i64 0
call void @llvm.memset.p0.i64(ptr align 1 %34, i8 0, i64 16, i1 false)
%35 = getelementptr %int64_avg_t__tuple_t, ptr %34, i32 0, i32 0
store i64 %key, ptr %35, align 8
%36 = getelementptr %int64_avg_t__tuple_t, ptr %34, i32 0, i32 1
store i64 %30, ptr %36, align 8
%37 = getelementptr %int64_avg_t__tuple_t, ptr %34, i32 0, i32 1
%38 = load i64, ptr %37, align 8
store i64 %38, ptr %"$res", align 8
%cpuid.min.cmp = icmp ule i64 %get_cpu_id, %31
%cpuid.min.select = select i1 %cpuid.min.cmp, i64 %get_cpu_id, i64 %31
%32 = getelementptr [1 x [1 x [16 x i8]]], ptr @tuple_buf, i64 0, i64 %cpuid.min.select, i64 0, i64 0
call void @llvm.memset.p0.i64(ptr align 1 %32, i8 0, i64 16, i1 false)
%33 = getelementptr %int64_avg_t__tuple_t, ptr %32, i32 0, i32 0
store i64 %key, ptr %33, align 8
%34 = getelementptr %int64_avg_t__tuple_t, ptr %32, i32 0, i32 1
store i64 %30, ptr %34, align 8
%35 = getelementptr %int64_avg_t__tuple_t, ptr %32, i32 0, i32 1
%36 = load i64, ptr %35, align 8
store i64 %36, ptr %"$res", align 8
ret i64 0
}

Expand Down
16 changes: 8 additions & 8 deletions tests/codegen/llvm/call_cat.ll
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ entry:
%key = alloca i32, align 4
%get_cpu_id = call i64 inttoptr (i64 8 to ptr)()
%1 = load i64, ptr @max_cpu_id, align 8
%2 = icmp ule i64 %get_cpu_id, %1
%3 = select i1 %2, i64 %get_cpu_id, i64 %1
%4 = getelementptr [1 x [1 x [8 x i8]]], ptr @fmt_str_buf, i64 0, i64 %3, i64 0, i64 0
call void @llvm.memset.p0.i64(ptr align 1 %4, i8 0, i64 8, i1 false)
%5 = getelementptr %cat_t, ptr %4, i32 0, i32 0
store i64 20000, ptr %5, align 8
%ringbuf_output = call i64 inttoptr (i64 130 to ptr)(ptr @ringbuf, ptr %4, i64 8, i64 0)
%cpuid.min.cmp = icmp ule i64 %get_cpu_id, %1
%cpuid.min.select = select i1 %cpuid.min.cmp, i64 %get_cpu_id, i64 %1
%2 = getelementptr [1 x [1 x [8 x i8]]], ptr @fmt_str_buf, i64 0, i64 %cpuid.min.select, i64 0, i64 0
call void @llvm.memset.p0.i64(ptr align 1 %2, i8 0, i64 8, i1 false)
%3 = getelementptr %cat_t, ptr %2, i32 0, i32 0
store i64 20000, ptr %3, align 8
%ringbuf_output = call i64 inttoptr (i64 130 to ptr)(ptr @ringbuf, ptr %2, i64 8, i64 0)
%ringbuf_loss = icmp slt i64 %ringbuf_output, 0
br i1 %ringbuf_loss, label %event_loss_counter, label %counter_merge

Expand All @@ -42,7 +42,7 @@ counter_merge: ; preds = %lookup_merge, %entr
ret i64 0

lookup_success: ; preds = %event_loss_counter
%6 = atomicrmw add ptr %lookup_elem, i64 1 seq_cst, align 8
%4 = atomicrmw add ptr %lookup_elem, i64 1 seq_cst, align 8
br label %lookup_merge

lookup_failure: ; preds = %event_loss_counter
Expand Down
Loading

0 comments on commit 2fa7894

Please sign in to comment.