Skip to content

Commit

Permalink
Revert "Cache bounded CPU ID in register for global scratch buffers"
Browse files Browse the repository at this point in the history
This reverts commit cd681f1.
  • Loading branch information
ryantimwilson authored and jordalgo committed Oct 9, 2024
1 parent f98a23c commit 2fa7894
Show file tree
Hide file tree
Showing 34 changed files with 701 additions and 757 deletions.
24 changes: 16 additions & 8 deletions src/ast/irbuilderbpf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -487,38 +487,46 @@ CallInst *IRBuilderBPF::CreateGetStrScratchMap(int idx,
idx);
}

Value *IRBuilderBPF::CreateGetFmtStringArgsScratchBuffer(Value *cpu_id)
Value *IRBuilderBPF::CreateGetFmtStringArgsScratchBuffer(const location &loc)
{
return createScratchBuffer(
bpftrace::globalvars::GlobalVar::FMT_STRINGS_BUFFER, cpu_id);
bpftrace::globalvars::GlobalVar::FMT_STRINGS_BUFFER, loc);
}

Value *IRBuilderBPF::CreateTupleScratchBuffer(Value *cpu_id, int key)
Value *IRBuilderBPF::CreateTupleScratchBuffer(const location &loc, int key)
{
return createScratchBuffer(bpftrace::globalvars::GlobalVar::TUPLE_BUFFER,
cpu_id,
loc,
key);
}

Value *IRBuilderBPF::createScratchBuffer(
bpftrace::globalvars::GlobalVar globalvar,
Value *cpu_id,
const location &loc,
size_t key)
{
assert(cpu_id);

auto config = globalvars::get_config(globalvar);
// ValueType var[MAX_CPU_ID + 1][num_elements]
auto type = globalvars::get_type(globalvar, bpftrace_.resources);

// Get CPU ID
auto cpu_id = CreateGetCpuId(loc);
auto max = CreateLoad(getInt64Ty(),
module_.getGlobalVariable(to_string(
bpftrace::globalvars::GlobalVar::MAX_CPU_ID)));
// Verify CPU ID is between 0 and MAX_CPU_ID to ensure we pass BPF
// verifier on older 5.2 kernels
auto cmp = CreateICmp(CmpInst::ICMP_ULE, cpu_id, max, "cpuid.min.cmp");
auto select = CreateSelect(cmp, cpu_id, max, "cpuid.min.select");

// Note the 1st index is 0 because we're pointing to
// ValueType var[MAX_CPU_ID + 1][num_elements]
// 2nd/3rd/4th indexes actually index into the array
// See https://llvm.org/docs/LangRef.html#id236
return CreateGEP(GetType(type),
CreatePointerCast(module_.getGlobalVariable(config.name),
GetType(type)->getPointerTo()),
{ getInt64(0), cpu_id, getInt64(key), getInt64(0) });
{ getInt64(0), select, getInt64(key), getInt64(0) });
}

/*
Expand Down
6 changes: 3 additions & 3 deletions src/ast/irbuilderbpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,8 @@ class IRBuilderBPF : public IRBuilder<> {
CallInst *CreateGetStrScratchMap(int idx,
BasicBlock *failure_callback,
const location &loc);
Value *CreateGetFmtStringArgsScratchBuffer(Value *cpu_id);
Value *CreateTupleScratchBuffer(Value *cpu_id, int key);
Value *CreateGetFmtStringArgsScratchBuffer(const location &loc);
Value *CreateTupleScratchBuffer(const location &loc, int key);
void CreateCheckSetRecursion(const location &loc, int early_exit_ret);
void CreateUnSetRecursion(const location &loc);
CallInst *CreateHelperCall(libbpf::bpf_func_id func_id,
Expand Down Expand Up @@ -301,7 +301,7 @@ class IRBuilderBPF : public IRBuilder<> {
BasicBlock *failure_callback,
int key = 0);
Value *createScratchBuffer(globalvars::GlobalVar globalvar,
Value *cpu_id,
const location &loc,
size_t key = 0);
libbpf::bpf_func_id selectProbeReadHelper(AddrSpace as, bool str);

Expand Down
66 changes: 10 additions & 56 deletions src/ast/passes/codegen_llvm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2229,17 +2229,18 @@ void CodegenLLVM::compareStructure(SizedType &our_type, llvm::Type *llvm_type)
*/
Value *CodegenLLVM::createTuple(
const SizedType &tuple_type,
const std::vector<std::pair<llvm::Value *, const location *>> &vals)
const std::vector<std::pair<llvm::Value *, const location *>> &vals,
const location &loc)
{
auto tuple_ty = b_.GetType(tuple_type);
size_t tuple_size = datalayout().getTypeAllocSize(tuple_ty);
auto buf = b_.CreatePointerCast(
b_.CreateTupleScratchBuffer(cpu_id_stack_.top(), async_ids_.tuple()),
b_.CreateTupleScratchBuffer(loc, async_ids_.tuple()),
tuple_ty->getPointerTo());
b_.CreateMemsetBPF(buf, b_.getInt8(0), tuple_size);

for (size_t i = 0; i < vals.size(); ++i) {
auto [val, loc] = vals[i];
auto [val, vloc] = vals[i];
SizedType &type = tuple_type.GetField(i).type;

Value *dst = b_.CreateGEP(tuple_ty,
Expand All @@ -2249,7 +2250,7 @@ Value *CodegenLLVM::createTuple(
if (inBpfMemory(type))
b_.CreateMemcpyBPF(dst, val, type.GetSize());
else if (type.IsArrayTy() || type.IsRecordTy())
b_.CreateProbeRead(ctx_, dst, type, val, *loc);
b_.CreateProbeRead(ctx_, dst, type, val, *vloc);
else
b_.CreateStore(val, dst);
}
Expand Down Expand Up @@ -2303,7 +2304,7 @@ void CodegenLLVM::visit(Tuple &tuple)
scoped_dels.emplace_back(accept(elem));
vals.push_back({ expr_, &elem->loc });
}
auto buf = createTuple(tuple.type, vals);
auto buf = createTuple(tuple.type, vals, tuple.loc);

// No need to end buf lifetime since tuple allocation is done via scratch maps
// and not on stack
Expand Down Expand Up @@ -2647,37 +2648,6 @@ void CodegenLLVM::visit(AttachPoint &)
// Empty
}

// For older 5.2 kernels, we need to bound CPU ID by MAX_CPU_ID to pass
// BPF verifier on older kernels.
//
// However, for programs that do many scratch buffer lookups, this causes
// many jumps which means a complex program may be rejected by the BPF
// verifier for >= 8192 jumps (current limit).
//
// To minimize jumps, we bound the CPU ID at the beginning of each function
// and cache the value in a register. This ensures we only add one
// additional jump per function. Note we cannot cache this value in a global
// variable since memory is shared between all CPUs or a per-CPU array since
// that would be recursive.
//
// Note the only functions in scope are the ones where scratch buffer is used
// which is currently subprograms, probes and map_for_each_cb. To account for
// nesting, we use a stack to ensure we're referencing the correct instance.
Value *CodegenLLVM::generateCachedCpuId(const location &loc)
{
if (!bpftrace_.resources.needed_global_vars.count(
globalvars::GlobalVar::MAX_CPU_ID)) {
return nullptr;
}

auto cpu_id = b_.CreateGetCpuId(loc);
auto max = b_.CreateLoad(b_.getInt64Ty(),
module_->getGlobalVariable(to_string(
bpftrace::globalvars::GlobalVar::MAX_CPU_ID)));
auto cmp = b_.CreateICmp(CmpInst::ICMP_ULE, cpu_id, max);
return b_.CreateSelect(cmp, cpu_id, max);
}

void CodegenLLVM::generateProbe(Probe &probe,
const std::string &full_func_id,
const std::string &name,
Expand Down Expand Up @@ -2710,9 +2680,6 @@ void CodegenLLVM::generateProbe(Probe &probe,
getReturnValueForProbe(probe_type));
}

assert(cpu_id_stack_.empty());
cpu_id_stack_.push(generateCachedCpuId(probe.loc));

if (probe.pred) {
auto scoped_del = accept(probe.pred);
}
Expand All @@ -2723,9 +2690,6 @@ void CodegenLLVM::generateProbe(Probe &probe,

createRet();

cpu_id_stack_.pop();
assert(cpu_id_stack_.empty());

if (dummy) {
func->eraseFromParent();
return;
Expand Down Expand Up @@ -2808,18 +2772,12 @@ void CodegenLLVM::visit(Subprog &subprog)
++arg_index;
}

assert(cpu_id_stack_.empty());
cpu_id_stack_.push(generateCachedCpuId(subprog.loc));

for (Statement *stmt : subprog.stmts) {
auto scoped_del = accept(stmt);
}
if (subprog.return_type.IsVoidTy())
createRet();

cpu_id_stack_.pop();
assert(cpu_id_stack_.empty());

FunctionPassManager fpm;
FunctionAnalysisManager fam;
llvm::PassBuilder pb;
Expand Down Expand Up @@ -3483,7 +3441,7 @@ void CodegenLLVM::createFormatStringCall(Call &call,
<< " does not match LLVM offset=" << expected_offset;
}

Value *fmt_args = b_.CreateGetFmtStringArgsScratchBuffer(cpu_id_stack_.top());
Value *fmt_args = b_.CreateGetFmtStringArgsScratchBuffer(call.loc);
// The struct is not packed so we need to memset it
b_.CreateMemsetBPF(fmt_args, b_.getInt8(0), struct_size);

Expand Down Expand Up @@ -3627,7 +3585,7 @@ void CodegenLLVM::createPrintNonMapCall(Call &call, int id)
StructType *print_struct = b_.GetStructType(struct_name.str(),
elements,
true);
Value *buf = b_.CreateGetFmtStringArgsScratchBuffer(cpu_id_stack_.top());
Value *buf = b_.CreateGetFmtStringArgsScratchBuffer(call.loc);
size_t struct_size = datalayout().getTypeAllocSize(print_struct);

// Store asyncactionid:
Expand Down Expand Up @@ -4462,11 +4420,10 @@ Function *CodegenLLVM::createForEachMapCallback(const For &f, llvm::Type *ctx_t)
});
}

cpu_id_stack_.push(generateCachedCpuId(f.decl->loc));

// Create decl variable for use in this iteration of the loop
auto tuple = createTuple(f.decl->type,
{ { key, &f.decl->loc }, { val, &f.decl->loc } });
{ { key, &f.decl->loc }, { val, &f.decl->loc } },
f.decl->loc);
variables_[f.decl->ident] = VariableLLVM{ tuple, b_.GetType(f.decl->type) };

// 1. Save original locations of variables which will form part of the
Expand Down Expand Up @@ -4497,9 +4454,6 @@ Function *CodegenLLVM::createForEachMapCallback(const For &f, llvm::Type *ctx_t)
}
b_.CreateRet(b_.getInt64(0));

// Restore original cached CPU ID
cpu_id_stack_.pop();

// Restore original non-context variables
for (const auto &[ident, expr] : orig_ctx_vars) {
variables_[ident].value = expr;
Expand Down
10 changes: 2 additions & 8 deletions src/ast/passes/codegen_llvm.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include <iostream>
#include <optional>
#include <ostream>
#include <stack>
#include <tuple>

#include <llvm/IR/IRBuilder.h>
Expand Down Expand Up @@ -94,7 +93,8 @@ class CodegenLLVM : public Visitor {
const SizedType &value_type);
Value *createTuple(
const SizedType &tuple_type,
const std::vector<std::pair<llvm::Value *, const location *>> &vals);
const std::vector<std::pair<llvm::Value *, const location *>> &vals,
const location &loc);
void createTupleCopy(const SizedType &expr_type,
const SizedType &var_type,
Value *dst_val,
Expand Down Expand Up @@ -251,8 +251,6 @@ class CodegenLLVM : public Visitor {
bool canAggPerCpuMapElems(const SizedType &val_type,
const SizedType &key_type);

Value *generateCachedCpuId(const location &loc);

Node *root_ = nullptr;

BPFtrace &bpftrace_;
Expand Down Expand Up @@ -298,10 +296,6 @@ class CodegenLLVM : public Visitor {
Function *map_len_func_ = nullptr;
MDNode *loop_metadata_ = nullptr;

// Used for caching bounded CPU ID in register to avoid jumps every time
// we use global scratch buffer
std::stack<Value *> cpu_id_stack_;

size_t getStructSize(StructType *s)
{
return module_->getDataLayout().getTypeAllocSize(s);
Expand Down
54 changes: 25 additions & 29 deletions tests/codegen/llvm/avg_cast_loop.ll
Original file line number Diff line number Diff line change
Expand Up @@ -24,35 +24,31 @@ define i64 @kprobe_f_1(ptr %0) section "s_kprobe_f_1" !dbg !72 {
entry:
%avg_struct = alloca %avg_stas_val, align 8
%"@x_key" = alloca i64, align 8
%get_cpu_id = call i64 inttoptr (i64 8 to ptr)()
%1 = load i64, ptr @max_cpu_id, align 8
%2 = icmp ule i64 %get_cpu_id, %1
%3 = select i1 %2, i64 %get_cpu_id, i64 %1
call void @llvm.lifetime.start.p0(i64 -1, ptr %"@x_key")
store i64 1, ptr %"@x_key", align 8
%lookup_elem = call ptr inttoptr (i64 1 to ptr)(ptr @AT_x, ptr %"@x_key")
%lookup_cond = icmp ne ptr %lookup_elem, null
br i1 %lookup_cond, label %lookup_success, label %lookup_failure

lookup_success: ; preds = %entry
%4 = getelementptr %avg_stas_val, ptr %lookup_elem, i64 0, i32 0
%5 = load i64, ptr %4, align 8
%6 = getelementptr %avg_stas_val, ptr %lookup_elem, i64 0, i32 1
%7 = load i64, ptr %6, align 8
%8 = getelementptr %avg_stas_val, ptr %lookup_elem, i64 0, i32 0
%9 = add i64 %5, 2
store i64 %9, ptr %8, align 8
%10 = getelementptr %avg_stas_val, ptr %lookup_elem, i64 0, i32 1
%11 = add i64 1, %7
store i64 %11, ptr %10, align 8
%1 = getelementptr %avg_stas_val, ptr %lookup_elem, i64 0, i32 0
%2 = load i64, ptr %1, align 8
%3 = getelementptr %avg_stas_val, ptr %lookup_elem, i64 0, i32 1
%4 = load i64, ptr %3, align 8
%5 = getelementptr %avg_stas_val, ptr %lookup_elem, i64 0, i32 0
%6 = add i64 %2, 2
store i64 %6, ptr %5, align 8
%7 = getelementptr %avg_stas_val, ptr %lookup_elem, i64 0, i32 1
%8 = add i64 1, %4
store i64 %8, ptr %7, align 8
br label %lookup_merge

lookup_failure: ; preds = %entry
call void @llvm.lifetime.start.p0(i64 -1, ptr %avg_struct)
%12 = getelementptr %avg_stas_val, ptr %avg_struct, i64 0, i32 0
store i64 2, ptr %12, align 8
%13 = getelementptr %avg_stas_val, ptr %avg_struct, i64 0, i32 1
store i64 1, ptr %13, align 8
%9 = getelementptr %avg_stas_val, ptr %avg_struct, i64 0, i32 0
store i64 2, ptr %9, align 8
%10 = getelementptr %avg_stas_val, ptr %avg_struct, i64 0, i32 1
store i64 1, ptr %10, align 8
%update_elem = call i64 inttoptr (i64 2 to ptr)(ptr @AT_x, ptr %"@x_key", ptr %avg_struct, i64 0)
call void @llvm.lifetime.end.p0(i64 -1, ptr %avg_struct)
br label %lookup_merge
Expand Down Expand Up @@ -160,17 +156,17 @@ is_negative_merge_block: ; preds = %is_positive, %is_ne
call void @llvm.lifetime.end.p0(i64 -1, ptr %val_2)
%get_cpu_id = call i64 inttoptr (i64 8 to ptr)()
%31 = load i64, ptr @max_cpu_id, align 8
%32 = icmp ule i64 %get_cpu_id, %31
%33 = select i1 %32, i64 %get_cpu_id, i64 %31
%34 = getelementptr [1 x [1 x [16 x i8]]], ptr @tuple_buf, i64 0, i64 %33, i64 0, i64 0
call void @llvm.memset.p0.i64(ptr align 1 %34, i8 0, i64 16, i1 false)
%35 = getelementptr %int64_avg_t__tuple_t, ptr %34, i32 0, i32 0
store i64 %key, ptr %35, align 8
%36 = getelementptr %int64_avg_t__tuple_t, ptr %34, i32 0, i32 1
store i64 %30, ptr %36, align 8
%37 = getelementptr %int64_avg_t__tuple_t, ptr %34, i32 0, i32 1
%38 = load i64, ptr %37, align 8
store i64 %38, ptr %"$res", align 8
%cpuid.min.cmp = icmp ule i64 %get_cpu_id, %31
%cpuid.min.select = select i1 %cpuid.min.cmp, i64 %get_cpu_id, i64 %31
%32 = getelementptr [1 x [1 x [16 x i8]]], ptr @tuple_buf, i64 0, i64 %cpuid.min.select, i64 0, i64 0
call void @llvm.memset.p0.i64(ptr align 1 %32, i8 0, i64 16, i1 false)
%33 = getelementptr %int64_avg_t__tuple_t, ptr %32, i32 0, i32 0
store i64 %key, ptr %33, align 8
%34 = getelementptr %int64_avg_t__tuple_t, ptr %32, i32 0, i32 1
store i64 %30, ptr %34, align 8
%35 = getelementptr %int64_avg_t__tuple_t, ptr %32, i32 0, i32 1
%36 = load i64, ptr %35, align 8
store i64 %36, ptr %"$res", align 8
ret i64 0
}

Expand Down
16 changes: 8 additions & 8 deletions tests/codegen/llvm/call_cat.ll
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ entry:
%key = alloca i32, align 4
%get_cpu_id = call i64 inttoptr (i64 8 to ptr)()
%1 = load i64, ptr @max_cpu_id, align 8
%2 = icmp ule i64 %get_cpu_id, %1
%3 = select i1 %2, i64 %get_cpu_id, i64 %1
%4 = getelementptr [1 x [1 x [8 x i8]]], ptr @fmt_str_buf, i64 0, i64 %3, i64 0, i64 0
call void @llvm.memset.p0.i64(ptr align 1 %4, i8 0, i64 8, i1 false)
%5 = getelementptr %cat_t, ptr %4, i32 0, i32 0
store i64 20000, ptr %5, align 8
%ringbuf_output = call i64 inttoptr (i64 130 to ptr)(ptr @ringbuf, ptr %4, i64 8, i64 0)
%cpuid.min.cmp = icmp ule i64 %get_cpu_id, %1
%cpuid.min.select = select i1 %cpuid.min.cmp, i64 %get_cpu_id, i64 %1
%2 = getelementptr [1 x [1 x [8 x i8]]], ptr @fmt_str_buf, i64 0, i64 %cpuid.min.select, i64 0, i64 0
call void @llvm.memset.p0.i64(ptr align 1 %2, i8 0, i64 8, i1 false)
%3 = getelementptr %cat_t, ptr %2, i32 0, i32 0
store i64 20000, ptr %3, align 8
%ringbuf_output = call i64 inttoptr (i64 130 to ptr)(ptr @ringbuf, ptr %2, i64 8, i64 0)
%ringbuf_loss = icmp slt i64 %ringbuf_output, 0
br i1 %ringbuf_loss, label %event_loss_counter, label %counter_merge

Expand All @@ -42,7 +42,7 @@ counter_merge: ; preds = %lookup_merge, %entr
ret i64 0

lookup_success: ; preds = %event_loss_counter
%6 = atomicrmw add ptr %lookup_elem, i64 1 seq_cst, align 8
%4 = atomicrmw add ptr %lookup_elem, i64 1 seq_cst, align 8
br label %lookup_merge

lookup_failure: ; preds = %event_loss_counter
Expand Down
Loading

0 comments on commit 2fa7894

Please sign in to comment.